# In [None]:
import numpy as np
import time
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import imutils
from imutils.video import VideoStream, WebcamVideoStream, FPS
import pyautogui
from collections import deque
from PIL import Image

# --- Runtime configuration -------------------------------------------------
# Number of most recent frames retained as the 3D-conv input window.
queue_size = 32
# Number of recent score vectors averaged to stabilise the prediction.
stable_queue_size = 4
# Length of the per-frame class score vector.
num_classes = 27
# An averaged score must exceed this to be recognised as a gesture.
threshold = 0.25
# Values above 0 enable the [INFO] console messages.
verbose = 1
# Per-channel mean / std handed to transforms.Normalize.
rgb_mean = (0.485, 0.456, 0.406)
rgb_std = (0.229, 0.224, 0.225)
# Scale factor applied to the resolution requested from the video stream.
expand = 1
resolution = (96 * expand, 160 * expand)

# Per-frame preprocessing applied to every queued frame before inference.
transform = transforms.Compose([
    # array frame -> PIL image
    transforms.ToPILImage(),
    # keep the central 96x160 window
    transforms.CenterCrop((96, 160)),
    # PIL image -> float tensor
    transforms.ToTensor(),
    # per-channel normalisation with rgb_mean / rgb_std
    transforms.Normalize(rgb_mean, rgb_std),
])

# One row per stage of the bottleneck stack, consumed by
# MobileFaceNet._make_layer as (t, c, n, s):
#   t - expansion factor of each block in the stage
#   c - output channels of the stage
#   n - number of blocks in the stage
#   s - stride of the first block (the remaining blocks use stride 1)
MobileFaceNet_BottleNeck_Setting = [
    [2, 64, 5, 1],
    [4, 128, 1, 1],
    [2, 128, 6, 2],
    [4, 128, 1, 2],
    [2, 128, 2, 2],
]

class BottleNeck(nn.Module):
    """3D bottleneck: 1x1x1 expand, 3x3x3 depthwise conv, 1x1x1 project.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        stride: stride of the depthwise convolution.
        expansion: multiplier giving the hidden width ``inp * expansion``.

    The input is added back to the result only when the block keeps both
    the resolution (``stride == 1``) and the channel count (``inp == oup``).
    """

    def __init__(self, inp, oup, stride, expansion):
        super().__init__()
        hidden = inp * expansion
        self.connect = inp == oup and stride == 1

        # Layers are created in execution order; their positions inside the
        # Sequential become the parameter names in the state dict.
        expand_stage = [
            nn.Conv3d(inp, hidden, 1, 1, 0, bias=False),
            nn.BatchNorm3d(hidden),
            nn.PReLU(hidden),
        ]
        depthwise_stage = [
            nn.Conv3d(hidden, hidden, 3, stride, 1, groups=hidden, bias=False),
            nn.BatchNorm3d(hidden),
            nn.PReLU(hidden),
        ]
        # The projection has no activation after its batch norm.
        project_stage = [
            nn.Conv3d(hidden, oup, 1, 1, 0, bias=False),
            nn.BatchNorm3d(oup),
        ]
        self.conv = nn.Sequential(*expand_stage, *depthwise_stage, *project_stage)

    def forward(self, x):
        """Apply the block, adding the input back when ``self.connect``."""
        out = self.conv(x)
        return x + out if self.connect else out

class ConvBlock(nn.Module):
    """Conv3d followed by BatchNorm3d and, unless ``linear``, a PReLU.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        k: kernel size passed to ``nn.Conv3d``.
        s: stride passed to ``nn.Conv3d``.
        p: padding passed to ``nn.Conv3d``.
        depthwidth: when true the convolution is grouped per input channel
            (``groups=inp``).
        linear: when true no activation follows the batch norm.
    """

    def __init__(self, inp, oup, k, s, p, depthwidth=False, linear=False):
        super().__init__()
        self.linear = linear

        # groups=1 is the ordinary (non-grouped) convolution.
        groups = inp if depthwidth else 1
        self.conv = nn.Conv3d(inp, oup, k, s, p, groups=groups, bias=False)
        self.bn = nn.BatchNorm3d(oup)
        if not linear:
            self.prelu = nn.PReLU(oup)

    def forward(self, x):
        """Return ``bn(conv(x))``, passed through PReLU unless linear."""
        x = self.bn(self.conv(x))
        return x if self.linear else self.prelu(x)

class MobileFaceNet(nn.Module):
    """3D-convolutional classifier built from ConvBlock and BottleNeck stages.

    Args:
        feature_dim: width of the feature vector fed to the final linear layer.
        num_classes: number of output logits.
        bottleneck_setting: rows of ``[t, c, n, s]`` (expansion, output
            channels, repeat count, stride of the first block) describing
            the bottleneck stack.

    ``forward`` flattens the feature map straight into
    ``nn.Linear(feature_dim, num_classes)``, so the input clip must reduce
    to a single 1x1x1 position after ``linear7`` (kernel ``(1, 3, 5)``).
    """

    def __init__(self, feature_dim=256, num_classes=27, bottleneck_setting=MobileFaceNet_BottleNeck_Setting):
        super(MobileFaceNet, self).__init__()
        self.conv1 = ConvBlock(3, 64, (3, 7, 7), (1, 2, 2), (1, 3, 3))
        self.dw_conv1 = ConvBlock(64, 64, 3, 2, 1, depthwidth=True)

        # Running channel count, updated by _make_layer as blocks are added.
        self.cur_channel = 64
        block = BottleNeck
        self.blocks = self._make_layer(block, bottleneck_setting)

        # Take the input width from the bottleneck stack instead of assuming
        # 128, so a custom bottleneck_setting ending in another channel count
        # still wires up (identical for the default setting).
        self.conv2 = ConvBlock(self.cur_channel, 512, 1, 1, 0)
        self.linear7 = ConvBlock(512, 512, (1, 3, 5), 1, 0, depthwidth=True, linear=True)
        self.linear1 = ConvBlock(512, feature_dim, 1, 1, 0, linear=True)
        self.bn = nn.BatchNorm3d(feature_dim)
        self.out = nn.Linear(feature_dim, num_classes)

        # Explicit initialisation of every conv / batch-norm / linear layer.
        for layer in self.modules():
            if isinstance(layer, nn.Conv3d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(layer, nn.BatchNorm3d):
                nn.init.constant_(layer.weight, val=1.0)
                nn.init.constant_(layer.bias, val=0.0)
            elif isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(layer.bias, val=0.0)

    def _make_layer(self, block, setting):
        """Build the bottleneck stack described by ``setting``.

        Each ``[t, c, n, s]`` row yields ``n`` blocks with expansion ``t`` and
        ``c`` output channels; only the first block of a row uses stride
        ``s``, the rest use stride 1. ``self.cur_channel`` is left at the
        output width of the last block.
        """
        layers = []
        for t, c, n, s in setting:
            for i in range(n):
                stride = s if i == 0 else 1
                layers.append(block(self.cur_channel, c, stride, t))
                self.cur_channel = c

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``."""
        x = self.conv1(x)
        x = self.dw_conv1(x)
        x = self.blocks(x)
        x = self.conv2(x)
        x = self.linear7(x)
        x = self.linear1(x)
        x = self.bn(x)
        # Flatten to (batch, features) for the final linear layer.
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output

# Load the trained classifier.
# The network is wrapped in DataParallel *before* the weights are restored;
# presumably the checkpoint was saved from a DataParallel model, so the
# wrapper has to stay for the state-dict keys to match - verify against the
# training script before removing it.
model = MobileFaceNet()
model = nn.DataParallel(model)
model.load_state_dict(torch.load('project_classifier_mobile9228.pth', map_location=torch.device('cpu')))
# Use the GPU when one is available; otherwise stay on the CPU instead of
# failing on machines without CUDA (the weights are already mapped to CPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Two-way lookup between gesture names and class indices:
# gesture_dict[name] -> index and gesture_dict[index] -> name.
# The position of a name in the tuple below is its class index.
gesture_dict = {}
for _gesture_index, _gesture_name in enumerate((
    'Swiping Left',
    'Swiping Right',
    'Swiping Down',
    'Swiping Up',
    'Pushing Hand Away',
    'Pulling Hand In',
    'Sliding Two Fingers Left',
    'Sliding Two Fingers Right',
    'Sliding Two Fingers Down',
    'Sliding Two Fingers Up',
    'Pushing Two Fingers Away',
    'Pulling Two Fingers In',
    'Rolling Hand Forward',
    'Rolling Hand Backward',
    'Turning Hand Clockwise',
    'Turning Hand Counterclockwise',
    'Zooming In With Full Hand',
    'Zooming Out With Full Hand',
    'Zooming In With Two Fingers',
    'Zooming Out With Two Fingers',
    'Thumb Up',
    'Thumb Down',
    'Shaking Hand',
    'Stop Sign',
    'Drumming Fingers',
    'No gesture',
    'Doing other things',
)):
    # Insert name first, then index, for every gesture.
    gesture_dict[_gesture_name] = _gesture_index
    gesture_dict[_gesture_index] = _gesture_name
del _gesture_index, _gesture_name

# Gesture name -> action code consumed by do_it().
#   0                   : no action
#   (1, key)            : pyautogui.hotkey(key)
#   (2, modifier, key)  : mouse click, then pyautogui.hotkey(modifier, key)
#   (3, key) / (4, key) : pyautogui.press(key)
action_dict = {
    # full-hand swipes -> arrow keys
    'Swiping Left': (1, 'left'),
    'Swiping Right': (1, 'right'),
    'Swiping Down': (1, 'down'),
    'Swiping Up': (1, 'up'),
    # push / pull -> volume
    'Pushing Hand Away': (4, 'volumedown'),
    'Pulling Hand In': (4, 'volumeup'),
    # two-finger slides -> arrow keys
    'Sliding Two Fingers Left': (1, 'left'),
    'Sliding Two Fingers Right': (1, 'right'),
    'Sliding Two Fingers Down': (1, 'down'),
    'Sliding Two Fingers Up': (1, 'up'),
    # two-finger push / pull -> volume
    'Pushing Two Fingers Away': (4, 'volumedown'),
    'Pulling Two Fingers In': (4, 'volumeup'),
    # rolling / turning -> unmapped
    'Rolling Hand Forward': 0,
    'Rolling Hand Backward': 0,
    'Turning Hand Clockwise': 0,
    'Turning Hand Counterclockwise': 0,
    # zoom gestures -> ctrl + / ctrl -
    'Zooming In With Full Hand': (2, 'ctrl', '+'),
    'Zooming Out With Full Hand': (2, 'ctrl', '-'),
    'Zooming In With Two Fingers': (2, 'ctrl', '+'),
    'Zooming Out With Two Fingers': (2, 'ctrl', '-'),
    # remaining gestures
    'Thumb Up': 0,
    'Thumb Down': 0,
    'Shaking Hand': 0,
    'Stop Sign': (3, 'space'),
    'Drumming Fingers': 0,
    'No gesture': 0,
    'Doing other things': 0,
}


def do_it(action_code):
    """Send the keyboard / mouse input described by ``action_code``.

    ``action_code`` is either ``0`` (nothing happens) or a tuple whose
    first item selects the kind of input:

    * ``(1, key)`` - ``pyautogui.hotkey(key)``
    * ``(2, modifier, key)`` - mouse click, then
      ``pyautogui.hotkey(modifier, key)``
    * ``(3, key)`` / ``(4, key)`` - ``pyautogui.press(key)``

    Any other kind is ignored.
    """
    if action_code == 0:
        return

    kind = action_code[0]
    if kind == 1:
        # Single key sent through hotkey(); an earlier key-hold variant
        # (keyDown / wait / keyUp) was tried here and dropped.
        pyautogui.hotkey(action_code[1])
    elif kind == 2:
        pyautogui.click()
        pyautogui.hotkey(action_code[1], action_code[2])
    elif kind in (3, 4):
        pyautogui.press(action_code[1])



# Open the camera and create the rolling buffers used by the main loop.
if verbose > 0:
    print("[INFO] Attemping to start video stream...")
vs = VideoStream(0, usePiCamera=False, resolution=resolution, framerate=12).start()

# Pause before the first read (presumably to let the camera warm up).
time.sleep(2.0)
fps = FPS().start()
# Rolling window of frames fed to the 3D conv network.
Queue = deque(maxlen=queue_size)
# Rolling window of score vectors averaged for prediction stabilisation.
Stable_Queue = deque(maxlen=stable_queue_size)

# read the first frame from the video stream
frame = vs.read()

if frame is None:
    print('[ERROR] No video stream is available')
else:
    # Prime the queue with the first frame. It is resized exactly like the
    # frames appended in the main loop (height 100); otherwise the first
    # clips would mix raw-size and resized frames after the centre crop.
    frame = imutils.resize(frame, height=100)
    Queue.extend([frame] * queue_size)
    if verbose > 0:
        print('[INFO] Video stream started...')

# Action candidate pool: gesture indices collected since the last idle frame.
candidate_pool = []

# Main loop: classify the rolling frame window, smooth the scores, collect
# per-frame gesture votes and fire the mapped action once the gesture ends.

# Classes that mean "nothing to act on"; seeing one closes the current vote.
idle_classes = (gesture_dict['No gesture'], gesture_dict['Doing other things'])
# Class assumed when no averaged score exceeds the threshold.
fallback_class = gesture_dict['Doing other things']
# Number of top scores kept from each forward pass.
top_k = 5
# Minimum number of trimmed votes required before an action is triggered.
min_votes = 8

# Inference only: switch to eval mode once instead of on every frame.
model.eval()

try:
    while True:
        # read the frames from video stream
        frame = vs.read()
        if frame is None:
            print('[ERROR] No video stream is available')
            break

        # resize to a height of 100 pixels (jester v1 dataset video height)
        frame = imutils.resize(frame, height=100)
        Queue.append(frame)
        # If the queue could not be primed (first read failed), pad the old
        # end with the current frame so a full window is always available.
        while len(Queue) < queue_size:
            Queue.appendleft(frame)

        # The network only sees every second frame of the window, so only
        # those frames are transformed.
        # NOTE(review): OpenCV capture frames are typically BGR while
        # rgb_mean / rgb_std suggest RGB input - confirm the channel order
        # matches what the model was trained on.
        clip = torch.stack([transform(img) for img in list(Queue)[::2]])
        # (frames, C, H, W) -> (1, C, frames, H, W)
        data = clip.permute(1, 0, 2, 3).unsqueeze(0).to(device)

        # No gradients are needed; skipping autograd saves memory and time.
        with torch.no_grad():
            probs = F.softmax(model(data), dim=1)

        top_p, top_idx = probs.cpu().topk(top_k, 1, True, True)

        # Score vector holding the top-k probabilities, zero elsewhere.
        scores = np.zeros(num_classes)
        scores[top_idx[0].numpy()] = top_p[0].numpy()
        Stable_Queue.append(scores)

        # Average the recent score vectors to stabilise the prediction.
        ave_pred = np.array(Stable_Queue).mean(axis=0)
        if ave_pred.max() > threshold:
            top1 = int(np.argmax(ave_pred))
        else:
            top1 = fallback_class

        cv2.imshow('test', frame)

        action = None
        if top1 not in idle_classes:
            candidate_pool.append(top1)
        else:
            # The gesture ended: drop the first 20% and last 15% of the
            # votes, then take the most frequent remaining class.
            num_candidate = len(candidate_pool)
            trimmed_pool = candidate_pool[int(0.2 * num_candidate):int(0.85 * num_candidate)]
            # An empty trimmed pool leaves candidate_pool untouched.
            if trimmed_pool:
                if len(trimmed_pool) >= min_votes:
                    action_index = max(set(trimmed_pool), key=trimmed_pool.count)
                    action = gesture_dict[action_index]
                    print(action)
                # Too few votes are discarded without triggering anything.
                candidate_pool = []

        if action is not None:
            do_it(action_dict[action])

        key = cv2.waitKey(1) & 0xFF

        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break

        # update the FPS counter
        fps.update()
finally:
    # Runs on normal exit, on errors and on keyboard interrupt, so the
    # camera thread and the preview window are always released.
    # stop the timer and display FPS information
    fps.stop()
    print("[INFO] elasped time: {:.2f}".format(fps.elapsed()))
    print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

    # cleaning up
    cv2.destroyAllWindows()
    vs.stop()