In [None]:
# First-time setup only: uncomment the two shell lines below to fetch and unpack the frames archive.
#!python ./fetch_dataset.py
#!unzip ./frames.zip # only for first time setup

# Mount Google Drive at /content/drive (Colab only; force_remount=False does not force a fresh mount).
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

# Change the working directory to the project's git repo on the shared drive.
%cd /content/drive/Shareddrives/StellarSummarizers/the-stellar-summarizers-sp22

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor, ToPILImage
import torchvision.models as models
from torchvision.io import read_image
from torchvision.utils import make_grid
import torchvision.transforms.functional as F

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os


class SumMeDataset(Dataset):
    """Frames of a single video paired with their per-frame ground-truth score.

    The annotation CSV must have a header row with at least the columns
    ``video_name`` and ``gt_score``. Only the rows whose ``video_name`` equals
    the last path component of ``img_dir`` are kept, in file order.
    Frame ``idx`` (0-based) is read from ``<img_dir>/img_<idx + 1, 5 digits>.jpg``.

    Args:
        annotations_filename: path of the annotation CSV.
        img_dir: directory holding the frames of one video; a trailing
            path separator is tolerated.
        transform: optional callable applied to the frame after it has been
            converted to a PIL image.
        target_transform: optional callable applied to the label.
    """

    def __init__(self, annotations_filename, img_dir, transform=None, target_transform=None):

        self.annotation_filename = annotations_filename
        self.annotation = pd.read_csv(annotations_filename, header=0)

        self.img_dir = img_dir
        # normpath drops a trailing separator, so './frames/Jumps/' still yields
        # 'Jumps' instead of an empty name (which would silently match no rows)
        self.video_name = os.path.basename(os.path.normpath(img_dir))
        self.frame_labels = self.annotation[self.annotation['video_name'] == self.video_name]['gt_score']

        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of annotated frames for this video."""
        return len(self.frame_labels)

    def __getitem__(self, idx):
        """Return ``(image, label, idx)`` for the 0-based frame index ``idx``."""
        # frame files are numbered from 1, zero-padded to five digits
        img_path = os.path.join(self.img_dir, 'img_' + str(idx + 1).zfill(5) + '.jpg')
        image = read_image(img_path)
        label = self.frame_labels.iloc[idx]

        if self.transform is not None:
            # The frame stays on the CPU here: the PIL conversion needs host
            # memory anyway, and a hard-coded 'cuda' would crash on CPU-only
            # machines. The training/evaluation code moves batches to the device.
            image = self.transform(ToPILImage()(image))
        if self.target_transform is not None:
            label = self.target_transform(label)

        return image, label, idx

def show(imgs):
    """Plot one image tensor, or a list of them, side by side in a single row.

    Typically called with the output of ``make_grid``. Each tensor is detached,
    converted to a PIL image and drawn on its own axis with ticks and tick
    labels removed.
    """
    images = imgs if isinstance(imgs, list) else [imgs]
    _, axes = plt.subplots(ncols=len(images), squeeze=False, figsize=(20, 16))
    for col, tensor in enumerate(images):
        ax = axes[0, col]  # squeeze=False keeps the axes array 2-D
        ax.imshow(np.asarray(F.to_pil_image(tensor.detach())))
        ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

In [None]:
# get device for training
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using: {device.upper()}")

annotations_filename = './frames/annotation.csv'
batch_size = 32

# Training video. Frames are served in order (shuffle=False) because each batch
# is later fed to the LSTM as one sequence of consecutive frames. drop_last=True
# keeps every batch at exactly `batch_size` frames.
videos_root = './frames/Jumps'
dataset = SumMeDataset(annotations_filename, videos_root, transform=ToTensor())
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=True)
print(f'{videos_root} dataset has {len(dataset)} samples')

# Optional sanity check of one batch (the dataset yields image, label, index):
# features, labels, idxs = next(iter(dataloader))
# print("Data Loader:")
# print(f"Feature batch shape: {features.size()}")
# print(f"Labels batch shape: {labels.size()}")

# Validation video. It must also be read in frame order: shuffling would hand
# the LSTM sequences of unrelated frames, unlike what it sees during training.
validation_video_root = './frames/Cooking'
validation_dataset = SumMeDataset(annotations_filename, validation_video_root, transform=ToTensor())
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
print(f'{validation_video_root} dataset has {len(validation_dataset)} samples')

## Encoder

In [None]:
# encoder
class NewResNext(torch.nn.Module):
    """ResNeXt backbone whose final fully connected layer outputs 10 features.

    Args:
        fc_size: input width of the replacement ``fc`` layer (2048 by default).
        large: if True use ``resnext101_32x8d``, otherwise ``resnext50_32x4d``.
        pretrained: forwarded to the torchvision model constructor.
    """

    def __init__(self, fc_size=2048, large=False, pretrained=True):
        super().__init__()
        if large:
            self.model = models.resnext101_32x8d(pretrained=pretrained)
        else:
            self.model = models.resnext50_32x4d(pretrained=pretrained)

        # swap the classification head for a freshly initialised 10-feature projection
        self.model.fc = torch.nn.Linear(fc_size, 10)

    def forward(self, x):
        """Encode a batch of images into 10-dimensional feature vectors."""
        return self.model(x)

# The class has its own name so this cell can be re-run: previously the instance
# overwrote the class (`new_resNext = new_resNext()`), and a second run failed.
# The instance keeps the name `new_resNext`, which the rest of the notebook uses.
new_resNext = NewResNext().to(device) # models move to different device inplace

## RNN

In [None]:
class Model(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every time step.

    Args:
        input_size: number of features per time step.
        output_size: number of values predicted per time step.
        hidden_size: width of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, output_size, hidden_size, n_layers):
        super().__init__()
        self.hidden_size, self.n_layers = hidden_size, n_layers

        # batch_first=True: inputs and outputs are (batch, seq, feature)
        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input):
        """Return ``(per-step predictions, final LSTM state)`` for ``input``."""
        lstm_out, state = self.rnn(input)
        return self.fc(lstm_out), state

# Sequence regressor: 10 input features per frame, one predicted score per
# frame, built from 20 stacked LSTM layers with 20 hidden units each.
model = Model(input_size=10, output_size=1, hidden_size=20, n_layers=20).to(device)

# define loss and optimizer
# NOTE(review): only `model`'s parameters are handed to Adam, so the encoder is
# not updated by this optimizer - confirm that is intended.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

## Training

In [None]:
from tqdm.notebook import tqdm

def test_loop(dataloader, model, num_samples):
    """Iterate through the whole dataset and calculate average loss"""
    test_loss = 0
    preds = np.zeros(num_samples) - 1 # 1286 samples in cooking

    with torch.no_grad():
        for features, labels, idxs in tqdm(dataloader):

            # encode the features
            features = features.to(device)
            labels = labels.to(device)
            labels = torch.reshape(labels, (1, batch_size, 1))

            encoded_features = new_resNext(features)
            batched_seq = torch.reshape(encoded_features, (1, batch_size, 10))

            pred, _= model(batched_seq)
            test_loss += criterion(pred, labels.float()).item()

            for index, pred in zip(idxs, torch.flatten(pred)):
                preds[index] = pred

    test_loss /= batch_size
    print(f'test_loss = {test_loss}')
    return test_loss, preds

# TRAINING LOOP
def train_loop(dataloader=None, model=None):
    """Run one epoch of training over `dataloader`.

    Each batch is encoded with the module-level `new_resNext`, treated as a
    single sequence of shape (1, frames_in_batch, features), and fitted with
    the module-level `criterion` and `optimizer`.

    Args:
        dataloader: iterable yielding ``(features, labels, idxs)`` batches.
            Defaults to the module-level `dataloader`.
        model: sequence model returning ``(output, state)``. Defaults to the
            module-level `model`. Note that `optimizer` only updates the
            parameters it was constructed with.
    """
    # Both arguments are optional so a zero-argument call keeps working.
    if dataloader is None:
        dataloader = globals()['dataloader']
    if model is None:
        model = globals()['model']

    for features, labels, _ in tqdm(dataloader):

        # move features and labels to GPU
        features = features.to(device) # data doesn't move to different device inplace !
        # size the sequence from the batch itself rather than the global batch_size
        labels = labels.to(device).float().reshape(1, -1, 1)

        optimizer.zero_grad()

        # encode image features using resNext and reshape; the result is already
        # on `device` because both the encoder and its input are
        encoded_features = new_resNext(features)
        batched_seq = encoded_features.reshape(1, features.size(0), -1)

        output, _ = model(batched_seq) # forward pass
        loss = criterion(output, labels) # calculate MSE
        loss.backward()
        optimizer.step()

# Train for num_epochs
num_epochs = 5
for epoch in range(num_epochs):
    print(f'epoch {epoch}')
    train_loop(dataloader, model)
    # the loss reported after each epoch is measured on the training dataloader
    test_loss, _ = test_loop(dataloader, model, len(dataset))

## Testing / Qualitative Evaluation

In [None]:
# test_loop needs the sample count to size its prediction buffer
test_loss, preds = test_loop(validation_dataloader, model, len(validation_dataset))
# Frames skipped by the dataloader (drop_last=True) keep test_loop's -1
# placeholder; leave them out so they do not distort the statistics.
scored_preds = preds[preds != -1]
np.min(scored_preds), np.max(scored_preds), np.average(scored_preds) # show the stats of predicted values

In [None]:
# Pick the five frames with the highest predicted score, then put their
# indices back into ascending (frame) order.
ranking = np.argsort(preds)
top_imgs_i = np.sort(ranking[-5:])

# Collect the selected frames, printing predicted vs. ground-truth score for each.
best_feature = []
for frame_i in top_imgs_i:
    frame, gt_score, _ = validation_dataset[frame_i]
    print(preds[frame_i], gt_score)
    best_feature.append(frame)

In [None]:
# Tile the selected frames into one image, five per row, and display it.
grid = make_grid(tensor=best_feature, nrow=5)
show(imgs=grid)