# HAR Predictions

### 1. Importing Modules

In [558]:
import os
import cv2
import sys
import importlib
import torch
import torchvision
import numpy as np

sys.path.insert(0, "../")

# For videos displaying into Notebook
import io
import base64
from IPython.display import HTML

In [559]:
from data_parser import WebmDataset
from data_loader_av import VideoFolder

from models.multi_column import MultiColumn
from transforms_video import *

from utils import load_json_config, remove_module_from_checkpoint_state_dict
from pprint import pprint

### 2. Configuration: Consisting Model Structure (Pretrained), File Paths, No. of classes.

In [560]:
configurations = {
    "model_name": "model3D_1_left_right",
    "output_dir": "trained_models/pretrained",

    "input_mode": "av",

    "data_folder": "/home/avinash/Downloads/20bn-something-something-v2/videos/",

    "json_data_train": "/home/avinash/Downloads/20bn-something-something-v2/annotations/something-something-v2-train.json",
    "json_data_val": "/home/avinash/Downloads/20bn-something-something-v2/annotations/something-something-v2-validation.json",
    "json_data_test": "/home/avinash/Downloads/20bn-something-something-v2/annotations/something-something-v2-test.json",

    "json_file_labels": "/home/avinash/Downloads/20bn-something-something-v2/annotations/something-something-v2-labels.json",

    "num_workers": 8,

    "num_classes": 174,
    "batch_size": 40,
    "clip_size": 72,

    "nclips_train": 1,
    "nclips_val": 1,

    "upscale_factor_train": 1.4,
    "upscale_factor_eval": 1.0,

    "step_size_train": 1,
    "step_size_val": 1,

    "augmentation_mappings_json": "assets/augmentation_mappings.json",
    "augmentation_types_todo": ["left/right", "left/right agnostic", "jitter_fps"],

    "lr": 0.01,
    "last_lr": 0.00001,
    "momentum": 0.9,
    "weight_decay": 0.00001,
    "num_epochs": -1,
    "print_freq": 100,

    "conv_model": "models.model3D_1_224",
    "input_spatial_size": 84,

    "column_units": 512,
    "save_features": True
}

### 3. Model Initialization and Paths

In [561]:
# Initializing Column Model
column_cnn_def = importlib.import_module("{}".format(config['conv_model']))

# checkpoint path to a trained model
checkpoint_path = os.path.join("../", config["output_dir"], config["model_name"], "model_best.pth.tar")

### 4. Loading Model

In [562]:
model = MultiColumn(config['num_classes'], column_cnn_def.Model, int(config["column_units"]))
print (model)

MultiColumn(
  (conv_column): Model(
    (block1): Sequential(
      (0): Conv3d(3, 32, kernel_size=(3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
      (1): BatchNorm3d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Dropout3d(p=0.2, inplace=False)
    )
    (block2): Sequential(
      (0): Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
      (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1))
      (4): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
      (6): Dropout3d(p=0.2, inplace=False)
    )
    (block3): Sequential(
      (0): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
      (1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_runni

In [563]:
checkpoint = torch.load(checkpoint_path)
checkpoint['state_dict'] = remove_module_from_checkpoint_state_dict(
                                              checkpoint['state_dict'])
model.load_state_dict(checkpoint['state_dict'])
print("=> loaded checkpoint '{}' (epoch {})"
      .format(checkpoint_path, checkpoint['epoch']))

=> loaded checkpoint '../trained_models/pretrained/model3D_1_left_right/model_best.pth.tar' (epoch 84)


### 5. Data Preprocessing

In [564]:
# By applying transformations Scale, ToPILImage & CenterCrop and then
# Compose Mixing those transformations we get following :-
transform_eval_pre = ComposeMix([
        [Scale(config['input_spatial_size']), "img"],
        [torchvision.transforms.ToPILImage(), "img"],
        [torchvision.transforms.CenterCrop(config['input_spatial_size']), "img"]
         ])

# By applying transformations ToTensor, Normalize and then
# Compose Mixing those transformations we get following :-
transform_post = ComposeMix([
        [torchvision.transforms.ToTensor(), "img"],
        [torchvision.transforms.Normalize(
                   mean=[0.485, 0.456, 0.406],  # default values for imagenet
                   std=[0.229, 0.224, 0.225]), "img"]
         ])

# Loading the Validation Dataset
val_data = VideoFolder(root=config['data_folder'],
                       json_file_input=config['json_data_val'],
                       json_file_labels=config['json_file_labels'],
                       clip_size=config['clip_size'],
                       nclips=config['nclips_val'],
                       step_size=config['step_size_val'],
                       is_val=True,
                       transform_pre=transform_eval_pre,
                       transform_post=transform_post,
                       get_item_id=True,
                       )
dict_two_way = val_data.classes_dict

### 6. Predicting

#### 6.1. Selecting a sample from the Validation Set

In [565]:
selected_indx = 4840

#### 6.2 Preprocessing Video Data

In [566]:
input_data, target, item_id = val_data[selected_indx]
input_data = input_data.unsqueeze(0)

In [567]:
if config['nclips_val'] > 1:
    input_var = list(input_data.split(config['clip_size'], 2))
    for idx, inp in enumerate(input_var):
        input_var[idx] = torch.autograd.Variable(inp)
else:
    input_var = [torch.autograd.Variable(input_data)]

#### 6.3 Predicting the Output from the Model

In [568]:
output = model(input_var).squeeze(0)
output = torch.nn.functional.softmax(output, dim=0)

In [569]:
# compute top5 predictions
pred_prob, pred_top3 = output.data.topk(3)
pred_prob = pred_prob.numpy()
pred_top3 = pred_top3.numpy()

#### 6.4 Predicted Class and the represented video

In [570]:
path_to_vid = os.path.join(config["data_folder"], item_id + ".webm")
video = io.open(path_to_vid, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''<video alt="test" controls>
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii')))

In [571]:
print("True label --> {} ({})".format(target, dict_two_way[target]))
print("\nTop-3 Predictions:")
for i, pred in enumerate(pred_top3):
    print("Top {} :== {}. Prob := {:.2f}%".format(i + 1, dict_two_way[pred], pred_prob[i] * 100))

True label --> 140 (Spinning something that quickly stops spinning)

Top-3 Predictions:
Top 1 :== Spinning something that quickly stops spinning. Prob := 27.86%
Top 2 :== Pushing something so it spins. Prob := 17.15%
Top 3 :== Spinning something so it continues spinning. Prob := 14.09%


In [572]:
# ----------------------------------------------------------

selected_indx = 19887

input_data, target, item_id = val_data[selected_indx]
input_data = input_data.unsqueeze(0)

if config['nclips_val'] > 1:
    input_var = list(input_data.split(config['clip_size'], 2))
    for idx, inp in enumerate(input_var):
        input_var[idx] = torch.autograd.Variable(inp)
else:
    input_var = [torch.autograd.Variable(input_data)]

output = model(input_var).squeeze(0)
output = torch.nn.functional.softmax(output, dim=0)

# compute top5 predictions
pred_prob, pred_top3 = output.data.topk(3)
pred_prob = pred_prob.numpy()
pred_top3 = pred_top3.numpy()

path_to_vid = os.path.join(config["data_folder"], item_id + ".webm")
video = io.open(path_to_vid, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''<video alt="test" controls>
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii')))

In [573]:
print("True label --> {} ({})".format(target, dict_two_way[target]))
print("\nTop-3 Predictions:")
for i, pred in enumerate(pred_top3):
    print("Top {} :== {}. Prob := {:.2f}%".format(i + 1, dict_two_way[pred], pred_prob[i] * 100))

True label --> 101 (Pushing something with something)

Top-3 Predictions:
Top 1 :== Pushing something with something. Prob := 28.41%
Top 2 :== Pushing something from left to right. Prob := 7.59%
Top 3 :== Pushing something off of something. Prob := 5.11%


In [574]:
# ----------------------------------------------------------

selected_indx = 4588

input_data, target, item_id = val_data[selected_indx]
input_data = input_data.unsqueeze(0)

if config['nclips_val'] > 1:
    input_var = list(input_data.split(config['clip_size'], 2))
    for idx, inp in enumerate(input_var):
        input_var[idx] = torch.autograd.Variable(inp)
else:
    input_var = [torch.autograd.Variable(input_data)]

output = model(input_var).squeeze(0)
output = torch.nn.functional.softmax(output, dim=0)

# compute top5 predictions
pred_prob, pred_top3 = output.data.topk(3)
pred_prob = pred_prob.numpy()
pred_top3 = pred_top3.numpy()

path_to_vid = os.path.join(config["data_folder"], item_id + ".webm")
video = io.open(path_to_vid, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''<video alt="test" controls>
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii')))

In [575]:
print("True label --> {} ({})".format(target, dict_two_way[target]))
print("\nTop-3 Predictions:")
for i, pred in enumerate(pred_top3):
    print("Top {} :== {}. Prob := {:.2f}%".format(i + 1, dict_two_way[pred], pred_prob[i] * 100))

True label --> 94 (Pushing something from right to left)

Top-3 Predictions:
Top 1 :== Pushing something from right to left. Prob := 26.25%
Top 2 :== Moving something across a surface until it falls down. Prob := 10.40%
Top 3 :== Tipping something over. Prob := 6.02%
