In [1]:
import os
import cv2
import sys
import importlib
import torch
import torchvision
import numpy as np

# Put the parent directory first on the module search path.
# NOTE(review): presumably this is what lets the project module imported in
# the next cell resolve — confirm where that module actually lives.
sys.path.insert(0, "../")

import io
import base64
from IPython.display import HTML

In [2]:
from pytorch_implementation_new_way import WebMDataset
from pytorch_implementation_new_way import VideoFolder
from pytorch_implementation_new_way import MultiColumn, Model
from pytorch_implementation_new_way import ComposeMix, Scale
from pytorch_implementation_new_way import remove_module_from_checkpoint_state_dict

 > Using device: cuda
 > Active GPU ids: [0]


In [3]:
# Settings for this evaluation notebook, grouped by purpose.
# NOTE(review): several entries (learning-rate schedule, *_train values,
# num_workers, batch_size, ...) are not read by the cells visible in this
# notebook; presumably they are shared with the training script — confirm.
config = dict(
    # --- model identity / checkpoint location ---
    model_name="3D_model_9_classes",
    output_dir="D:/capstone-project-2-webapp/flask-server/data/trained_models/",

    input_mode="av",

    # --- dataset locations ---
    data_folder="D:/capstone-project-2-webapp/flask-server/data/videos/",
    json_data_val="D:/capstone-project-2-webapp/flask-server/data/files/validation_data_9_classes.json",
    json_file_labels="D:/capstone-project-2-webapp/flask-server/data/files/labels_9_classes.json",

    # --- loading / batching ---
    num_workers=5,
    num_classes=9,
    batch_size=30,
    clip_size=72,

    nclips_train=1,
    nclips_val=1,

    upscale_factor_train=1.4,
    upscale_factor_eval=1.0,

    step_size_train=1,
    step_size_val=1,

    # --- optimisation ---
    lr=0.008,
    last_lr=1e-5,
    momentum=0.9,
    weight_decay=1e-5,
    num_epochs=-1,
    print_freq=100,

    # --- network ---
    conv_model="D:/something-something-project/smth-smth-v2-baseline-with-models/trained_models/pretrained.model3D_1",
    input_spatial_size=84,

    column_units=512,
    save_features=True,
)

In [4]:
# Name of the trained model whose checkpoint will be evaluated.
model_name = config["model_name"]
print("=> Name of the model -- {}".format(model_name))

# The best checkpoint is expected at <output_dir>/<model_name>/model_best.pth.tar.
# The leading "../" only takes effect when output_dir is a relative path;
# os.path.join discards it if output_dir is absolute.
checkpoint_dir = os.path.join("../", config["output_dir"], config["model_name"])
checkpoint_path = os.path.join(checkpoint_dir, "model_best.pth.tar")
print("=> Checkpoint path --> {}".format(checkpoint_path))

=> Name of the model -- 3D_model_9_classes
=> Checkpoint path --> D:/capstone-project-2-webapp/flask-server/data/trained_models/3D_model_9_classes\model_best.pth.tar


In [5]:
# Instantiate the network from the configured class count and column width,
# then switch it to evaluation mode (trailing ';' hides the module repr).
num_column_units = int(config["column_units"])
model = MultiColumn(config["num_classes"], Model, num_column_units)
model.eval();

In [6]:
# Restore the trained weights from `checkpoint_path` into `model`.
print("=> loading checkpoint")
# map_location="cpu" keeps the checkpoint tensors off the GPU, so a checkpoint
# saved from a CUDA run also loads on a CPU-only machine. load_state_dict
# copies the values into the model's existing parameters, wherever they live.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load(checkpoint_path, map_location="cpu")
# Clean the key names with the project helper before loading.
# NOTE(review): presumably this strips the "module." prefix that DataParallel
# adds — confirm against the helper's implementation.
state_dict = remove_module_from_checkpoint_state_dict(checkpoint['state_dict'])
checkpoint['state_dict'] = state_dict
model.load_state_dict(state_dict)
print("=> loaded checkpoint '{}' (epoch {})"
      .format(checkpoint_path, checkpoint['epoch']))

=> loading checkpoint
=> loaded checkpoint 'D:/capstone-project-2-webapp/flask-server/data/trained_models/3D_model_9_classes\model_best.pth.tar' (epoch 24)


In [7]:
import json
# Evaluation-time preprocessing: rescale, convert to PIL and centre-crop every
# frame to the configured spatial size.
eval_size = config['input_spatial_size']
transform_eval_pre = ComposeMix([
    [Scale(eval_size), "img"],
    [torchvision.transforms.ToPILImage(), "img"],
    [torchvision.transforms.CenterCrop(eval_size), "img"],
])

# Post-processing: convert to tensor, then normalise with the default
# ImageNet channel statistics.
imagenet_normalize = torchvision.transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
transform_post = ComposeMix([
    [torchvision.transforms.ToTensor(), "img"],
    [imagenet_normalize, "img"],
])

# Validation dataset; get_item_id=True so each item also carries its video id.
val_dataset_kwargs = dict(
    root=config['data_folder'],
    json_file_input=config['json_data_val'],
    json_file_labels=config['json_file_labels'],
    clip_size=config['clip_size'],
    nclips=config['nclips_val'],
    step_size=config['step_size_val'],
    is_val=True,
    transform_pre=transform_eval_pre,
    transform_post=transform_post,
    get_item_id=True,
)
val_data = VideoFolder(**val_dataset_kwargs)

# Label lookup table exposed by the dataset.
dict_two_way = val_data.classes_dict

In [8]:
# Number of samples in the validation dataset.
len(val_data)

106

In [9]:
# Display the label table; it holds both name -> index and index -> name entries.
dict_two_way

{'Dropping something': 0,
 0: 'Dropping something',
 'Holding something': 1,
 1: 'Holding something',
 'Moving something': 2,
 2: 'Moving something',
 'Picking something': 3,
 3: 'Picking something',
 'Poking something': 4,
 4: 'Poking something',
 'Pouring something': 5,
 5: 'Pouring something',
 'Putting something': 6,
 6: 'Putting something',
 'Showing something': 7,
 7: 'Showing something',
 'Tearing something': 8,
 8: 'Tearing something'}

In [10]:
# Sanity check: show which class val_data is an instance of.
type(val_data)

pytorch_implementation_new_way.VideoFolder

In [11]:
# Draw a random sample index in [0, len(val_data)).
# To revisit one particular sample, assign a fixed index instead (e.g. 48).
num_val_samples = len(val_data)
selected_indx = np.random.randint(num_val_samples)
print(selected_indx)

77


In [12]:
import av
# Fetch the selected sample: the clip tensor, its label index and its video id.
sample = val_data[selected_indx]
input_data, target, item_id = sample

# Add a leading dimension of size 1 (presumably the batch axis — confirm).
input_data = torch.unsqueeze(input_data, 0)

print("Id of the video sample = {}".format(item_id))
print("True label --> {} ({})".format(target, dict_two_way[target]))

Id of the video sample = 108436
True label --> 3 (Picking something)


  imgs = [f.to_rgb().to_nd_array() for f in reader.decode(video=0)]


In [13]:
# Package the input as the list of clip tensors that `model` is called with.
# The torch.autograd.Variable wrapper is deprecated (tensors and Variables were
# merged in PyTorch 0.4), so plain tensors are used directly.
if config['nclips_val'] > 1:
    # Several clips per sample: cut dimension 2 into chunks of clip_size.
    input_var = list(input_data.split(config['clip_size'], 2))
else:
    # Single clip: the whole tensor is the only list entry.
    input_var = [input_data]

In [14]:
# Run the forward pass and turn the scores into class probabilities.
# torch.no_grad() disables autograd bookkeeping: this is pure inference, so
# building a gradient graph only costs memory and time.
with torch.no_grad():
    # squeeze(0) drops dimension 0 when it has size 1, leaving one score per class.
    logits = model(input_var).squeeze(0)
    output = torch.nn.functional.softmax(logits, dim=0)
print(output)

tensor([0.0292, 0.6241, 0.1500, 0.0366, 0.0341, 0.0100, 0.1030, 0.0115, 0.0016],
       grad_fn=<SoftmaxBackward>)


In [15]:
# Keep the 4 most probable classes (note: despite its name, `pred_top5`
# holds 4 entries). topk returns (values, indices), highest first.
# detach() is the supported way to get a tensor without autograd history;
# it replaces the legacy `.data` attribute and is needed before .numpy()
# whenever `output` still tracks gradients.
pred_prob, pred_top5 = output.detach().topk(4)
pred_prob = pred_prob.numpy()
pred_top5 = pred_top5.numpy()

In [16]:
# Show the selected probabilities, then the matching class indices, one per line.
print(pred_prob, pred_top5, sep="\n")

[0.6241079  0.15003286 0.10297551 0.03657099]
[1 2 6 3]


In [17]:
# Human-readable summary: ground truth first, then the ranked predictions.
print("Id of the video sample = {}".format(item_id))
print("True label --> {} ({})".format(target, dict_two_way[target]))
print("\nTop-4 Predictions:")
# Walk class indices and probabilities together; ranks are reported from 1.
for rank, (class_idx, prob) in enumerate(zip(pred_top5, pred_prob), start=1):
    print("Top {} :== {}. Prob := {:.2f}%".format(rank, dict_two_way[class_idx], prob * 100))

Id of the video sample = 108436
True label --> 3 (Picking something)

Top-4 Predictions:
Top 1 :== Holding something. Prob := 62.41%
Top 2 :== Moving something. Prob := 15.00%
Top 3 :== Putting something. Prob := 10.30%
Top 4 :== Picking something. Prob := 3.66%


In [19]:
# Embed the evaluated clip in the notebook as a base64 data URI.
path_to_vid = os.path.join(config["data_folder"], item_id + ".webm")

# Read-only binary mode inside a context manager: the handle is closed
# deterministically, and the file does not need to be writable.
with open(path_to_vid, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)

# The clip is a .webm file, so the declared MIME type is video/webm.
# HTML(...) stays the last expression so the player is rendered as cell output.
HTML(data='''<video alt="test" controls>
                <source src="data:video/webm;base64,{0}" type="video/webm" />
             </video>'''.format(encoded.decode('ascii')))