# Classification with a vanilla model vs. a trained model

## Install prerequisites

In [None]:
!pip install -q git+https://github.com/Atze00/MoViNet-pytorch.git
!pip install -q av
!pip install -q -U aperturedb

## Util functions

### Load datasets as clips (of 16 frames), sampled at 5fps

In [None]:
from torchvision.transforms import v2 as T
import torch

def get_common():
    """
    Build the settings shared by the data loading and training sections.

    Seeds torch's global RNG with 97 on every call and returns the tuple
    ``(num_frames, clip_steps, Bs_Train, Bs_Test, transform, transform_test)``.
    """
    torch.manual_seed(97)

    def scale_and_reorder(clip):
        # Move the last axis to the front and divide the values by 255.
        # assumes clips arrive as (T, H, W, C) with 0-255 values — TODO confirm
        return clip.permute(3, 0, 1, 2) / 255.

    # No T.Normalize step is applied; one existed only as commented-out code
    # (mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]).
    train_steps = [
        T.Lambda(scale_and_reorder),
        T.Resize((200, 200)),
        T.RandomHorizontalFlip(),
        T.RandomCrop((172, 172)),
    ]
    # Same pipeline without the random flip, and with a centre crop instead
    # of a random one.
    test_steps = [
        T.Lambda(scale_and_reorder),
        T.Resize((200, 200)),
        T.CenterCrop((172, 172)),
    ]

    frames_per_clip = 16
    step_between_clips = 2
    train_batch_size = 16
    test_batch_size = 16
    return (
        frames_per_clip,
        step_between_clips,
        train_batch_size,
        test_batch_size,
        T.Compose(train_steps),
        T.Compose(test_steps),
    )


In [None]:
from AHMDB51 import AHMDB51

def get_data_sets():
    """
    Build the HMDB51 test split from data previously ingested into aperturedb.

    Returns a ``(train, test)`` pair. Only the test split is constructed
    here, so the first element is always ``None``.
    """
    # Only the clip settings and the test-time transform are needed.
    num_frames, clip_steps, _, _, _, transform_test = get_common()

    test_split = AHMDB51(
        train=False,
        transform=transform_test,
        frames_per_clip=num_frames,
        step_between_clips=clip_steps,
        frame_rate=5,
        num_workers=1,
    )
    return None, test_split


### Utility function to show a tensor.


In [None]:
from IPython.display import Video, display
import torchvision

def show_tensor(tensor, path="tmp_video.mp4", fps=5):
    """
    Encode a video tensor as H.264 and display it inline in the notebook.

    Args:
        tensor: Frames to encode. Passed unchanged to
            ``torchvision.io.write_video``, which documents a uint8 tensor
            in (T, H, W, C) layout.
        path: File the video is written to; it is overwritten on each call.
        fps: Frame rate of the written video.
    """
    # write_video creates the file itself, so no separate Python file handle
    # is opened on the same path while the encoder is writing to it.
    torchvision.io.write_video(path, tensor, fps=fps, video_codec="h264")
    display(Video(path))


## Instantiate an off-the-shelf model

In [None]:
from movinets import MoViNet
from movinets.config import _C

# Off-the-shelf MoViNet-A0 with its pretrained weights. Its classifier has
# 600 outputs (Kinetics-600), not the 51 HMDB51 classes.
# .eval() returns the module itself; it switches dropout / batch-norm layers
# to inference behaviour so predictions are deterministic.
model_vanilla = MoViNet(_C.MODEL.MoViNetA0, causal=False, pretrained=True).eval()


## Load the MoViNet model trained on HMDB51

In [None]:
import torch

# Load the model trained on HMDB51. It has been trained for 1 epoch.
# The checkpoint holds a whole pickled module (it is called directly below),
# not a state_dict, so weights_only=False is required: torch >= 2.6 defaults
# to weights_only=True and refuses such files.
# NOTE(review): unpickling can run arbitrary code — only load checkpoints you trust.
# .eval() returns the module itself and switches it to inference behaviour.
model_trained = torch.load("movinet_hmdb51_1.pth", weights_only=False).eval()


In [None]:
# get_data_sets() only builds the test split, so `train` is None here.
train, test = get_data_sets()
# Invert the dataset's `ci` mapping so that a predicted index can be used
# as the lookup key.
test.classes = {index: label for label, index in test.ci.items()}

## See the shape of the tensor passed through the model.

This is a useful checkpoint for troubleshooting any problems with the input going into the model.

In [None]:
# Each dataset item unpacks into (video, audio, class index).
data = test[333]
video, audio, class_index = data
print(video.shape)

# Undo the permute(3, 0, 1, 2) applied by the transform and rescale the
# 0-1 values back to bytes before handing the clip to the video writer.
x = (video.permute(1, 2, 3, 0) * 255).to(torch.uint8)
show_tensor(x)

In [None]:
# Index -> name lookup for the test split, and the label of the sample
# picked in the previous cell.
all_classes = test.classes
ground_truth = class_index
print(f"{len(all_classes)=} \r\n {all_classes=}\r\n {all_classes[ground_truth]=}")

In [None]:
# Add an extra (batch) dim to the video tensor to make it compatible with the model.
p = video[None, :]

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    y = model_trained(p)

# Get the 5 highest-scoring outputs from the trained movinet.
preds = torch.topk(y, 5, largest=True)

# Show the top k predictions as class names.
for i in preds.indices[0]:
    print(test.classes[int(i)])


### 

### Predict with vanilla

Take a clip from the test dataset (chosen by index, anywhere between 0 and `len(test) - 1`).

Some predicted indices will fall outside the 51 HMDB51 classes, because the vanilla model has 600 output classes.

In [None]:

video, _, ground_truth = test[10]
show_tensor((video.permute(1, 2, 3, 0) * 255).type(torch.uint8))

display(f"{test.classes[ground_truth]=}")

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    y = model_vanilla(video[None, :])

# Turn the raw outputs into probabilities and keep the 5 most likely ones.
op = torch.nn.Softmax(dim=1)
preds = torch.topk(op(y), 5, largest=True)
print("Predictions:")
for i, prob in zip(preds.indices[0], preds.values[0]):
    # The vanilla model has more outputs than there are entries in
    # test.classes, so the lookup can miss; only the lookup is guarded.
    try:
        prediction = test.classes[int(i)]
    except (IndexError, KeyError):
        print(f"Cannot find class for index={i}")
    else:
        probability = float(prob)
        print(f"{prediction=}, {probability=}")

### Predict with the trained model

The predictions are much better.

In [None]:
video, _, ground_truth = test[120]
show_tensor((video.permute(1, 2, 3, 0) * 255).type(torch.uint8))

# Show the expected label so the predictions below can be judged against it.
display(f"{test.classes[ground_truth]=}")

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    y = model_trained(video[None, :])

# Turn the raw outputs into probabilities and keep the 5 most likely ones.
op = torch.nn.Softmax(dim=1)
preds = torch.topk(op(y), 5, largest=True)
print("Predictions:")
for i, prob in zip(preds.indices[0], preds.values[0]):
    # A failed lookup in test.classes raises KeyError (or IndexError for a
    # sequence), not AttributeError; only the lookup is guarded.
    try:
        prediction = test.classes[int(i)]
    except (IndexError, KeyError):
        print(f"Cannot find class for index={i}")
    else:
        probability = float(prob)
        print(f"{prediction=}, {probability=}")