In [97]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import os
import pytorch_lightning
import pytorchvideo.data
import torch.utils.data

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    RandomRotation
)

import os 
import csv
import json
from typing import List

import torch
import torch.nn.functional as F
import torchvision.transforms as T
from PIL import Image
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision.transforms._transforms_video import NormalizeVideo

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from ipywidgets import Video

In [1]:
# Shell escape: print the machine's total/used/free memory and swap (-g selects gigabyte-sized units).
!free -g

               total        used        free      shared  buff/cache   available
Mem:              94           1          90           0           1          91
Swap:              0           0           0


In [98]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Temporal sampling setup: how many frames are kept per clip, the frame stride,
# and the frame rate used to convert a frame count into a clip length.
num_frames = 30
sampling_rate = 2
frames_per_second = 30

# Clip length derived from the three settings above.
clip_duration = num_frames * sampling_rate / frames_per_second

# Preprocessing applied to the "video" entry of a clip dictionary, in order:
# temporal subsampling to `num_frames`, division of the values by 255,
# short-side scaling to 256, a random horizontal flip, and per-channel
# normalization. (A RandomRotation(10) step is currently left disabled.)
video_transform = ApplyTransformToKey(
    transform=T.Compose([
        UniformTemporalSubsample(num_frames),
        T.Lambda(lambda frames: frames / 255.0),
        ShortSideScale(size=256),
        RandomHorizontalFlip(p=0.5),
        NormalizeVideo(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]),
    key="video",
)

Unknown instance spec: 

In [149]:
# Location of one video under data/train, used by the cells below to try out decoding and preprocessing.
video_path = "data/train/2_5395803543229709261.mp4"

In [162]:
# Open the sample video with the EncodedVideo helper.
video = EncodedVideo.from_path(video_path)

# Display the video's duration truncated to an integer (last expression = cell output).
int(video.duration)

315

In [154]:
# Initialize an EncodedVideo helper class
video = EncodedVideo.from_path(video_path)

# Load the desired clip and specify the start and end duration.
# The start_sec should correspond to where the action occurs in the video
video_data = video.get_clip(start_sec=0, end_sec=2.0)

# Apply the preprocessing transform to the clip's "video" entry
video_data = video_transform(video_data)

# Pull out the transformed frames. This is a single clip laid out as
# C x T x H x W; no device transfer is performed here.
video_inputs = video_data["video"]

# The model expects inputs of shape: B x C x T x H x W, so prepend a batch
# dimension of size 1. (Indexing with [0] first would pick out a single
# channel and lose the C dimension, so the whole tensor is kept.)
video_input = video_inputs[None, ...]

{'video': tensor([[[[  9.,   9.,   9.,  ...,  46.,  47.,  47.],
          [  9.,   9.,   9.,  ...,  47.,  48.,  48.],
          [  9.,   9.,   9.,  ...,  53.,  53.,  53.],
          ...,
          [122., 132., 123.,  ...,  57.,  61.,  64.],
          [131., 142., 132.,  ...,  56.,  58.,  62.],
          [136., 146., 137.,  ...,  54.,  57.,  61.]],

         [[  9.,   9.,   9.,  ...,  62.,  62.,  64.],
          [  9.,   9.,   9.,  ...,  64.,  66.,  67.],
          [  9.,   9.,   9.,  ...,  78.,  82.,  82.],
          ...,
          [ 58.,  54.,  49.,  ...,  80.,  75.,  80.],
          [ 81.,  86.,  69.,  ..., 100.,  70.,  65.],
          [ 94.,  90.,  55.,  ...,  93.,  73.,  48.]],

         [[ 66.,  71.,  68.,  ...,  54.,  57.,  55.],
          [ 63.,  66.,  64.,  ...,  54.,  57.,  55.],
          [ 57.,  59.,  59.,  ...,  54.,  57.,  55.],
          ...,
          [ 54.,  69.,  68.,  ...,  15.,  15.,  15.],
          [ 63., 105.,  81.,  ...,  15.,  17.,  17.],
          [ 39.,  66., 

In [153]:
# Display the shape of the transformed clip tensor (the un-batched tensor taken from video_data["video"]).
video_inputs.shape

torch.Size([3, 160, 256, 455])

In [151]:
# plt.imshow(np.array(video_inputs).transpose((1, 2, 3, 0))[100])

In [147]:
class KineticsDataModule(pytorch_lightning.LightningDataModule):
    """Lightning data module that serves training clips listed in ``train.csv``.

    Only the training loader is defined. Every setting it needs lives in the
    class constants below, so the module does not depend on variables defined
    in other notebook cells.
    """

    # Directory that holds the ``train.csv`` listing.
    _DATA_PATH = "data"
    # Clip length (seconds) handed to the random clip sampler.
    _CLIP_DURATION = 2
    # Number of frames kept per clip by the temporal subsampling step.
    _NUM_FRAMES = 30
    # DataLoader settings.
    _BATCH_SIZE = 8
    _NUM_WORKERS = 8

    def _make_train_transform(self):
        """Build the preprocessing applied to the "video" entry of each clip."""
        # NOTE(review): there is no spatial crop after ShortSideScale, so the
        # long side keeps whatever size the source aspect ratio gives it.
        # Batching presumably relies on all videos sharing one resolution -
        # TODO confirm, or add a crop step.
        return ApplyTransformToKey(
            key="video",
            transform=T.Compose(
                [
                    UniformTemporalSubsample(self._NUM_FRAMES),
                    T.Lambda(lambda x: x / 255.0),
                    ShortSideScale(size=256),
                    RandomHorizontalFlip(p=0.5),
                    NormalizeVideo(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ]
            ),
        )

    def train_dataloader(self):
        """Return a DataLoader over randomly sampled training clips.

        The dataset is built from ``<_DATA_PATH>/train.csv``; constructing it
        raises ``FileNotFoundError`` when that path does not exist.
        """
        train_dataset = pytorchvideo.data.Kinetics(
            data_path=os.path.join(self._DATA_PATH, "train.csv"),
            clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
            transform=self._make_train_transform(),
        )
        return torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self._BATCH_SIZE,
            num_workers=self._NUM_WORKERS,
        )

In [12]:
# Instantiate the data module; the dataset itself is only built inside train_dataloader().
module = KineticsDataModule()

In [None]:
# Build the training DataLoader. This constructs the dataset from data/train.csv,
# so that file must exist; otherwise a FileNotFoundError is raised.
tdl = module.train_dataloader()

FileNotFoundError: data/train.csv not found.

In [11]:
import pytorchvideo.models.resnet

def make_kinetics_resnet(num_classes=2):
    """Create the 50-layer video ResNet classifier used in this notebook.

    Args:
        num_classes: Number of output classes of the classification head.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        The model built by ``pytorchvideo.models.resnet.create_resnet``.
    """
    # Layer classes are referenced through ``torch.nn`` rather than an ``nn``
    # alias, so this function works even when no ``import torch.nn as nn`` has
    # been executed yet in the kernel.
    return pytorchvideo.models.resnet.create_resnet(
        input_channel=3,  # RGB input from Kinetics
        model_depth=50,  # For the tutorial let's just use a 50 layer network
        model_num_class=num_classes,
        norm=torch.nn.BatchNorm3d,
        activation=torch.nn.ReLU,
    )

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
    """Lightning module that trains the video ResNet with a cross-entropy loss."""

    def __init__(self, lr=1e-1):
        """Build the underlying model.

        Args:
            lr: Learning rate passed to Adam in ``configure_optimizers``.
                Defaults to 1e-1, the value that was previously hard-coded.
        """
        super().__init__()
        self.lr = lr
        self.model = make_kinetics_resnet()

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its output."""
        return self.model(x)

    def _compute_loss(self, batch):
        """Return the cross-entropy between the model output and ``batch["label"]``."""
        # The model expects a video tensor of shape (B, C, T, H, W), which is the
        # format provided by the dataset
        y_hat = self.model(batch["video"])
        return F.cross_entropy(y_hat, batch["label"])

    def training_step(self, batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        # loss.backwards will be called behind the scenes by PyTorchLightning
        # after the loss is returned from this method.
        loss = self._compute_loss(batch)

        # Log the tensor itself (as validation_step does); calling .item() here
        # would force a device-to-host sync on every step.
        self.log("train_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log and return the validation loss for one batch."""
        loss = self._compute_loss(batch)
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        """
        Setup the Adam optimizer. Note, that this function also can return a lr scheduler, which is
        usually useful for training video models.
        """
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [9]:
def train(**trainer_kwargs):
    """Fit the video classifier on the data module and return the trainer.

    Args:
        **trainer_kwargs: Optional keyword arguments forwarded unchanged to
            ``pytorch_lightning.Trainer`` (for example ``max_epochs``). When
            none are given the Trainer is created with its defaults, exactly
            as before.

    Returns:
        The ``pytorch_lightning.Trainer`` instance after ``fit`` has returned.
    """
    classification_module = VideoClassificationLightningModule()
    data_module = KineticsDataModule()
    trainer = pytorch_lightning.Trainer(**trainer_kwargs)
    trainer.fit(classification_module, data_module)
    return trainer