# Import dependencies

In [58]:
import pandas as pd
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms


In [60]:
# Sign labels; the real-time loop looks a label up by the predicted class index.
actions = [
    'sister', 'hurry', 'hungry', 'meal', 'brother',
    'tree', 'heavy', 'cry', 'family', 'wise',
]
# Size of the classifier head: one output per label.
n_classes = len(actions)

In [56]:
def convert_relu_to_swish(model: nn.Module):
    """Replace every ``nn.ReLU`` below *model* with an in-place ``nn.SiLU``.

    The module tree is walked recursively and modified in place; nothing is
    returned. *model* itself is never replaced, only its descendants.
    """
    # Snapshot the direct children so attributes can be reassigned while looping.
    for name, submodule in list(model.named_children()):
        if not isinstance(submodule, nn.ReLU):
            # Not an activation we care about: look inside it instead.
            convert_relu_to_swish(submodule)
            continue
        setattr(model, name, nn.SiLU(True))


            
            
class Swish(nn.Module):
    """Swish / SiLU activation: ``x * sigmoid(x)``, applied element-wise."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x * sigmoid(x)`` as a new tensor; *x* is left unmodified."""
        # Out-of-place on purpose: an in-place multiply would overwrite the
        # caller's tensor and is rejected by autograd for leaf tensors that
        # require gradients.
        return x * torch.sigmoid(x)
    
    
    
class r2plus1d_18(nn.Module):
    """torchvision ``r2plus1d_18`` video network with SiLU activations and a new head.

    The torchvision model is stripped of its last top-level child (presumably
    the original ``fc`` classifier -- its ``in_features`` sizes the new head),
    every ``nn.ReLU`` in the remaining layers is swapped for ``nn.SiLU``, and a
    dropout + linear layer producing ``n_classes`` outputs is attached.

    Args:
        pretrained: forwarded to ``torchvision.models.video.r2plus1d_18``.
        n_classes: number of outputs of the new linear head.
        dropout_p: dropout probability applied before the head.
    """

    def __init__(self, pretrained=True, n_classes=3, dropout_p=0.5):
        super().__init__()
        self.pretrained = pretrained
        self.n_classes = n_classes

        backbone = torchvision.models.video.r2plus1d_18(pretrained=self.pretrained)
        # Everything except the last top-level child becomes the feature extractor.
        self.r2plus1d_18 = nn.Sequential(*list(backbone.children())[:-1])
        convert_relu_to_swish(self.r2plus1d_18)
        self.fc1 = nn.Linear(backbone.fc.in_features, self.n_classes)
        self.dropout = nn.Dropout(dropout_p, inplace=True)

    def forward(self, x):
        """Return the un-normalised class scores (no softmax) for clip batch *x*.

        The real-time loop in this file passes *x* laid out as
        (batch, channels, frames, height, width).
        """
        features = self.r2plus1d_18(x).flatten(1)
        return self.fc1(self.dropout(features))

In [59]:
# Target frame size; passed to transforms.Resize in the real-time loop.
h = w = 128
# Per-channel statistics passed to transforms.Normalize.
# NOTE(review): these look like the Kinetics-400 values used by torchvision's video models -- confirm.
mean = [0.43216, 0.394666, 0.37645]
std = [0.22803, 0.22145, 0.216989]

## Model Testing

#### 1. *Load pretrained weights*

In [61]:
# Choose the device before loading so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# pretrained=False: the weights come from the checkpoint below.
m = r2plus1d_18(pretrained=False, n_classes=n_classes)
# map_location lets a checkpoint that was saved from GPU tensors load on a
# CPU-only machine instead of failing while deserialising CUDA storages.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
best_checkpoint = torch.load("pytorch_weights.tar", map_location=device)
m.load_state_dict(best_checkpoint["model_state_dict"])

In [62]:
# Inspect which entries the checkpoint dictionary contains.
best_checkpoint.keys()

dict_keys(['epochs', 'last_epoch', 'best_epoch', 'model_state_dict', 'optimizer_state_dict', 'lr_scheduler_state_dict', 'loss_state_dict', 'hist'])

## Real-time inference

In [63]:
from scipy import stats

# Bar colours for prob_viz; the palette is cycled when there are more classes than colours.
colors = [(245, 117, 16), (117, 245, 16), (16, 117, 245)]


def prob_viz(res, actions, input_frame, colors):
    """Draw one horizontal score bar with its label per class on a copy of the frame.

    Args:
        res: iterable of per-class scores; a score of 1.0 yields a 100-pixel bar.
        actions: class labels, indexed in the same order as ``res``.
        input_frame: image array to draw on; it is copied, not modified.
        colors: non-empty sequence of colour tuples, reused cyclically.

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()
    # Cycle through however many colours were supplied instead of assuming three.
    n_colors = len(colors)
    for num, prob in enumerate(res):
        prob = max(0, prob)  # negative scores would give a negative bar width
        top = 60 + num * 40
        cv2.rectangle(output_frame, (0, top), (int(prob * 100), top + 30), colors[num % n_colors], -1)
        cv2.putText(output_frame, actions[num], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

    return output_frame


In [68]:
# 1. New detection variables
from PIL import Image

sequence = []      # sliding window holding the most recent preprocessed frames
sentence = []      # recognised actions shown in the top banner (last 5 are kept)
predictions = []   # class index predicted for every evaluated clip
threshold = 0.5    # minimum softmax probability required to accept a prediction
clip_len = 16      # number of frames per clip fed to the model

m = m.to(device)
m.eval()  # inference mode (disables dropout); set once rather than on every frame
cap = cv2.VideoCapture(0)

resize_transform = transforms.Resize((h, w))
totensor_transform = transforms.ToTensor()
normalize_transform = transforms.Normalize(mean, std)

# try/finally guarantees the camera and the window are released even if
# preprocessing or inference raises inside the loop.
try:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()
        if not ret:
            break

        # 2. Preprocess: OpenCV delivers BGR, the torchvision transforms expect an RGB PIL image.
        final_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        final_frame = resize_transform(final_frame)
        final_frame = totensor_transform(final_frame)
        final_frame = normalize_transform(final_frame).to(device)

        sequence.append(final_frame)
        sequence = sequence[-clip_len:]
        arg_max = -1
        if len(sequence) == clip_len:
            # Frames are already on `device`, so the stacked clip is too.
            first_seq = torch.stack(sequence)
            # (frames, c, h, w) -> (1, frames, c, h, w) -> (1, c, frames, h, w)
            output_seq = torch.unsqueeze(first_seq, dim=0).permute(0, 2, 1, 3, 4)
            with torch.no_grad():
                # The model returns raw scores; softmax converts them to
                # probabilities so that `threshold` and the bar lengths in
                # prob_viz are meaningful.
                res = torch.softmax(m(output_seq), dim=1)
            arg_max = int(torch.argmax(res))
            predictions.append(arg_max)

            # 3. Viz logic
            # Accept a label only when the last two predictions agree.
            # (np.unique sorts its result, so comparing its first element with
            # arg_max tested against the smallest recent class id, not agreement.)
            is_stable = all(p == arg_max for p in predictions[-2:])
            if is_stable and res[0, arg_max].item() > threshold:
                # Do not repeat the label that is already last in the sentence.
                if not sentence or actions[arg_max] != sentence[-1]:
                    sentence.append(actions[arg_max])

            if len(sentence) > 5:
                sentence = sentence[-5:]

            # Viz probabilities
            frame = prob_viz(res[0].cpu().numpy(), actions, frame, colors)

        # Banner across the full frame width, whatever the camera resolution is.
        cv2.rectangle(frame, (0, 0), (frame.shape[1], 40), (245, 117, 16), -1)
        cv2.putText(frame, ' '.join(sentence), (3, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        # Show to screen
        cv2.imshow('OpenCV Feed', frame)

        # Break gracefully
        if cv2.waitKey(100) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

In [138]:
# Repeats the release done at the end of the capture loop; presumably kept for
# manual cleanup after an interrupted run.
cap.release()


In [139]:
# Repeats the window teardown done at the end of the capture loop; presumably
# kept for manual cleanup after an interrupted run.
cv2.destroyAllWindows()

In [253]:
import gc
# Collect unreachable Python objects first, then ask PyTorch to release the
# cached CUDA memory it is no longer using.
gc.collect()
torch.cuda.empty_cache()

8795