### In this notebook, we'll try to use our model on a video stream

In [1]:
import numpy as np
import cv2 as cv

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split
import os

In [5]:
class Vgg(nn.Module):
    """VGG-style convolutional classifier for single-channel images.

    The network stacks four stages, each made of two 3x3 convolutions (every
    one followed by batch normalisation and ReLU) and a closing 2x2 max
    pooling, then finishes with three fully connected layers that emit
    9 scores.

    ``lin1`` expects a flattened 512 x 3 x 3 feature map, which is the spatial
    size left after four 2x2 poolings of a 48 x 48 input.

    Args:
        drop: Dropout probability applied to the outputs of ``lin1`` and
            ``lin2``.
    """

    def __init__(self, drop=0.2):
        super().__init__()

        # Stage 1: 1 -> 64 channels.
        self.conv1a = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.conv1b = nn.Conv2d(64, 64, kernel_size=3, padding=1)

        # Stage 2: 64 -> 128 channels.
        self.conv2a = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv2b = nn.Conv2d(128, 128, kernel_size=3, padding=1)

        # Stage 3: 128 -> 256 channels.
        self.conv3a = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv3b = nn.Conv2d(256, 256, kernel_size=3, padding=1)

        # Stage 4: 256 -> 512 channels.
        self.conv4a = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv4b = nn.Conv2d(512, 512, kernel_size=3, padding=1)

        # One pooling module is shared by all four stages (it has no weights).
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # One batch-norm layer per convolution, matching its output channels.
        self.bn1a = nn.BatchNorm2d(64)
        self.bn1b = nn.BatchNorm2d(64)

        self.bn2a = nn.BatchNorm2d(128)
        self.bn2b = nn.BatchNorm2d(128)

        self.bn3a = nn.BatchNorm2d(256)
        self.bn3b = nn.BatchNorm2d(256)

        self.bn4a = nn.BatchNorm2d(512)
        self.bn4b = nn.BatchNorm2d(512)

        # Classifier head on the flattened 512 x 3 x 3 feature map.
        self.lin1 = nn.Linear(512 * 3 * 3, 4096)
        self.lin2 = nn.Linear(4096, 4096)

        # 9 outputs because the FER+ labels are used, not the 7 FER-2013 labels.
        self.lin3 = nn.Linear(4096, 9)

        self.drop = nn.Dropout(p=drop)

    def forward(self, x):
        """Return the 9 raw (un-normalised) class scores for a batch ``x``."""
        stages = (
            ((self.conv1a, self.bn1a), (self.conv1b, self.bn1b)),
            ((self.conv2a, self.bn2a), (self.conv2b, self.bn2b)),
            ((self.conv3a, self.bn3a), (self.conv3b, self.bn3b)),
            ((self.conv4a, self.bn4a), (self.conv4b, self.bn4b)),
        )
        for stage in stages:
            for conv, norm in stage:
                x = F.relu(norm(conv(x)))
            # Halve the spatial resolution at the end of every stage.
            x = self.pool(x)

        # Keep the batch dimension, flatten everything else.
        x = torch.flatten(x, 1)
        for dense in (self.lin1, self.lin2):
            x = F.relu(self.drop(dense(x)))

        return self.lin3(x)


In [6]:
# Build the network with its default dropout rate; the trained weights are loaded in a later cell.
model = Vgg()

In [7]:
# CPU device handle; passed as `map_location` when the checkpoint is loaded.
device = torch.device('cpu')

In [8]:
# Load the saved weights into `model`, remapping stored tensors onto `device`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# NOTE(review): loading weights does not change the train/eval mode of the module;
# make sure the model is in eval mode before it is used for inference.
model.load_state_dict(torch.load('saved_model/vggmodel_state_dict.pth', map_location=device))

<All keys matched successfully>

In [9]:
# Preprocessing applied to every image before it reaches the network:
# array -> PIL image -> 48 x 48 -> grayscale -> tensor.
preprocessing_steps = [
    transforms.ToPILImage(),
    transforms.Resize((48,48)),
    transforms.Grayscale(),
    transforms.ToTensor(),
]
transform = transforms.Compose(preprocessing_steps)

### create a function that accepts an image and returns the prediction and the label

In [16]:
def predict(img):
    """Classify the facial expression shown in a single image.

    The image is run through the module-level ``transform`` pipeline and the
    module-level ``model``. The model is switched to evaluation mode first so
    that dropout is disabled and batch normalisation uses its stored running
    statistics; otherwise the scores would vary randomly between calls.

    Args:
        img: Image in any form accepted by ``transform`` (the webcam loop
            passes an OpenCV frame).

    Returns:
        tuple: ``(pred, label)`` where ``pred`` is a 1-D NumPy array holding
        the raw model output for each of the 9 classes and ``label`` is the
        name of the highest-scoring class.
    """
    labels = ['neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disgust', 'fear', 'contempt', 'unknown']

    # Inference only: disable dropout and freeze the batch-norm statistics.
    model.eval()

    # Add the batch dimension in front: (C, H, W) -> (1, C, H, W).
    batch = transform(img).unsqueeze(0)

    # No gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        scores = model(batch)[0]

    label = labels[int(torch.argmax(scores))]
    pred = scores.numpy()

    return pred, label

In [17]:
# Hand-made sample scores (one per emotion) used to prototype the bar chart below.
pred = np.array([
    0.613, 1.342, 2.134,
    -1.435, -2.455, 1.535,
    0.257, 0.893, -1.678,
])

In [18]:
# Largest magnitude among the scores: the bigger of the maximum and the negated minimum.
max(pred.max(), -pred.min())

2.455

In [13]:
# Prototype of the bar chart: draw the sample scores on a blank canvas.
height = 500
width = 500
blank = np.zeros((height, width, 3), dtype='uint8')
maxheight = int(height / 2.1)  # pixel height given to the largest-magnitude score
origin = int(height / 2)       # y coordinate of the zero line
binwidth = int(width / 10)     # width of one bar

# Largest absolute score; every bar is scaled relative to it.
peak = np.max([np.max(pred), np.min(pred) * -1])

for i, item in enumerate(pred, start=1):
    bar_end = int(origin - (maxheight * (item / peak)))
    pt1 = (int(binwidth * (i - 0.5)), origin)
    pt2 = (int(binwidth * (i + 0.5)), bar_end)
    cv.rectangle(blank, pt1, pt2, (0, 255, 0), thickness=-1)

cv.imshow('rectangle', blank)
cv.waitKey(0)  # wait for a key press before closing the window
cv.destroyAllWindows()
cv.waitKey(1)

-1

### create a function that graphs the predicted values

In [29]:
def graph_pred(pred, height=500, width=500):
    """Draw the per-emotion scores as a bar chart in an OpenCV window.

    Bars are scaled so that the score with the largest magnitude spans
    ``height / 2.3`` pixels. Positive scores rise above the horizontal
    midline and negative scores hang below it. The chart is shown in the
    window named 'rectangle'; this function does not call ``cv.waitKey``
    itself.

    Args:
        pred: 1-D sequence of numeric scores, ordered like ``labels`` below.
        height: Canvas height in pixels.
        width: Canvas width in pixels.
    """
    labels = ['neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disgust', 'fear', 'contempt', 'unknown']
    bar_colour = (0, 255, 0)

    canvas = np.zeros((height, width, 3), dtype='uint8')
    maxheight = int(height / 2.3)
    origin = int(height / 2)
    binwidth = int(width / 10)

    # Largest absolute score, computed once instead of once per bar.
    peak = np.max([np.max(pred), np.min(pred) * -1])
    if peak == 0:
        # All scores are zero: avoid 0/0 and draw flat bars on the midline.
        peak = 1

    for slot, item in enumerate(pred, start=1):
        corner_a = (int(binwidth * (slot - 0.5)), origin)
        corner_b = (int(binwidth * (slot + 0.5)), int(origin - (maxheight * (item / peak))))
        cv.rectangle(canvas, corner_a, corner_b, bar_colour, thickness=-1)

    # One caption per bar along the bottom edge, aligned with the bar's left side.
    for slot, name in enumerate(labels, start=1):
        anchor = (int(binwidth * (slot - 0.5)), height - 10)
        cv.putText(canvas, name, anchor, cv.FONT_HERSHEY_SIMPLEX, 0.3, bar_colour, 2, cv.LINE_AA)

    cv.imshow('rectangle', canvas)

### capture the webcam video and display the results

In [30]:
# Live demo: read webcam frames, classify each one, and show both the annotated
# frame and the bar chart of scores. Press 'q' in an OpenCV window to stop.
cap = cv.VideoCapture(0)
shapelist = []  # shape of every processed frame, kept for later inspection
try:
    if not cap.isOpened():
        # Raise instead of calling exit(), which would shut down the notebook kernel.
        raise RuntimeError("Cannot open camera")

    while True:
        # Capture frame-by-frame; ret is False when no frame could be read.
        ret, frame = cap.read()
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break

        # Mirror the image horizontally so the preview behaves like a mirror.
        img = cv.flip(frame, 1)

        # OpenCV delivers BGR frames, while the preprocessing pipeline treats
        # 3-channel arrays as RGB; convert so the grayscale weights hit the
        # right channels.
        pred, label = predict(cv.cvtColor(img, cv.COLOR_BGR2RGB))

        cv.putText(img, label, (10,450), cv.FONT_HERSHEY_SIMPLEX, 3, (0, 255, 0), 2, cv.LINE_AA)
        cv.imshow('frame', img)

        graph_pred(pred)

        if cv.waitKey(1) == ord('q'):
            break

        shapelist.append(frame.shape)
finally:
    # Always release the camera and close the windows, even after an error
    # or a keyboard interrupt.
    cap.release()
    cv.destroyAllWindows()
    # Same follow-up call as the prototype cell above makes after closing windows.
    cv.waitKey(1)

### Conclusion

This notebook builds a small app that uses OpenCV and a PyTorch neural-network model: it takes a video feed from the webcam and displays the detected emotion together with the predicted value for each emotion.