# GameFace
HCI With Head Pose Using Computer Vision

---
Practice Module: Intelligent Sensing System (ISS)


# 0. File Path & Library Setup

In [1]:
# Load All Necessary Packages

import os
# from google.colab import drive

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import time
from math import ceil
import math
import winsound
import webbrowser
import pyautogui
import pygetwindow as gw
from scipy.spatial.transform import Rotation
from IPython.display import clear_output
from math import cos
from math import sin
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, LearningRateScheduler
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Activation
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPool2D
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.nn import softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

from tensorflow.keras.applications.efficientnet import EfficientNetB0

import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

# NOTE(review): `seed` is assigned here but not read anywhere else in the visible notebook.
seed = 12

# Print the versions of the main libraries and the GPUs TensorFlow can see,
# so the recorded output documents the environment the notebook was run in.
print("Versions of key libraries")
print("-------------------------")
print("pandas:      ", pd.__version__)
print("numpy:       ", np.__version__)
print("opencv:      ", cv2.__version__)
print("tensorflow:  ", tf.__version__)
print("GPU Accress  ", tf.config.list_physical_devices('GPU'))

Versions of key libraries
-------------------------
pandas:       1.3.2
numpy:        1.21.2
opencv:       4.5.4-dev
tensorflow:   2.7.0
GPU Accress   [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# # Mounting to Google Drive
# drive.mount('/content/gdrive')

# # Change Working Directory
# os.chdir('/content/gdrive/My Drive/iss/prs_pm/training')

print('Working Directory: ')
# !pwd
os.getcwd()

Working Directory: 


'C:\\Users\\Kennedy\\OneDrive\\Documents\\NUS\\Projects\\ISSM\\GIT\\NUS_ISS_ITSS_Project_GameFace\\Experimentation\\HeadPose_mouth_and_eyes_combined'

# 1. Load Model

## Ultralight Face Detection Model

In [3]:
# Weights and network definition of the Slim-320 face detector, given as
# backslash-separated paths relative to the notebook's working directory.
caffemodel = '..\\models_and_weights\\Slim-320\\slim-320.caffemodel'
prototxt = '..\\models_and_weights\\Slim-320\\slim-320.prototxt'
# `net` is read as a module-level global by get_bbox().
net = cv2.dnn.readNetFromCaffe(prototxt, caffemodel)

## Head Pose Estimation Model

In [4]:
# Location of the saved head pose network, resolved against the current working directory.
model_file = os.path.join(os.getcwd(), '..//models_and_weights//HPN_EfficientNetB0_FT_16lyr_v2')
# Raise the TensorFlow logger level to ERROR before loading the model.
tf.get_logger().setLevel('ERROR')
# compile=False: the model is loaded without being compiled; the notebook only calls it for inference.
hpn = load_model(model_file, compile=False)

# 2. Application

## Face Detection Utility Functions

In [5]:
# Module-level constants for the face detector post-processing.
# NOTE(review): image_mean, image_std and this iou_threshold are not read by any
# function visible in this notebook (get_bbox uses `input_std` and a literal 127,
# and predict() has its own iou_threshold default) - confirm before relying on them.
image_mean = np.array([127, 127, 127])
image_std = 128.0
iou_threshold = 0.3
# Scaling factors applied to the raw regression output in convert_locations_to_boxes().
center_variance = 0.1
size_variance = 0.2
# One list of prior box sizes (in pixels) per stride below; both are read by define_img_size().
min_boxes = [[10.0, 16.0, 24.0], [32.0, 48.0], [64.0, 96.0], [128.0, 192.0, 256.0]]
strides = [8.0, 16.0, 32.0, 64.0]


def define_img_size(image_size):
    """Build the prior boxes for a detector input of the given size.

    Args:
        image_size: Sequence ``[width, height]`` of the network input in pixels.

    Returns:
        The array produced by ``generate_priors`` for the module-level
        ``strides`` and ``min_boxes``.
    """
    # One row per image dimension, one entry per stride: the number of
    # feature-map cells along that dimension (rounded up).
    feature_map_w_h_list = [
        [int(ceil(dimension / stride)) for stride in strides]
        for dimension in image_size
    ]
    # Every dimension shares the same stride list (the same object, not a copy).
    shrinkage_list = [strides for _ in image_size]
    return generate_priors(feature_map_w_h_list, shrinkage_list, image_size, min_boxes)


def generate_priors(feature_map_list, shrinkage_list, image_size, min_boxes):
    """Enumerate prior boxes in normalised centre form.

    Args:
        feature_map_list: ``[cells_along_width, cells_along_height]``, each a
            list with one entry per feature level.
        shrinkage_list: ``[strides_for_width, strides_for_height]``, indexed
            the same way.
        image_size: ``[width, height]`` of the network input.
        min_boxes: Per-level list of box side lengths in pixels.

    Returns:
        ``np.ndarray`` of rows ``[x_center, y_center, w, h]`` clipped to
        ``[0, 1]``, ordered level -> row -> column -> box size.
    """
    priors = []
    level_count = len(feature_map_list[0])
    for level in range(level_count):
        scale_w = image_size[0] / shrinkage_list[0][level]
        scale_h = image_size[1] / shrinkage_list[1][level]
        rows_in_level = feature_map_list[1][level]
        for row in range(rows_in_level):
            y_center = (row + 0.5) / scale_h
            for col in range(feature_map_list[0][level]):
                x_center = (col + 0.5) / scale_w
                # One prior per configured box size at this cell.
                priors.extend(
                    [x_center, y_center, side / image_size[0], side / image_size[1]]
                    for side in min_boxes[level]
                )
    return np.clip(priors, 0.0, 1.0)


def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
    """Greedy non-maximum suppression.

    Args:
        box_scores: 2-D array whose last column is the score and whose
            remaining columns describe the box (passed to ``iou_of``).
        iou_threshold: Candidates whose IoU with a picked box exceeds this
            value are discarded.
        top_k: Stop once this many boxes are picked; values <= 0 disable the limit.
        candidate_size: Only the highest-scoring ``candidate_size`` rows are considered.

    Returns:
        The rows of ``box_scores`` that were picked, best score first.
    """
    boxes = box_scores[:, :-1]
    # Ascending by score, so the best remaining candidate is always last.
    remaining = np.argsort(box_scores[:, -1])[-candidate_size:]
    kept = []
    while len(remaining) > 0:
        best = remaining[-1]
        kept.append(best)
        limit_reached = top_k > 0 and top_k == len(kept)
        if limit_reached or len(remaining) == 1:
            break
        remaining = remaining[:-1]
        overlaps = iou_of(
            boxes[remaining, :],
            np.expand_dims(boxes[best, :], axis=0),
        )
        # Keep only candidates that do not overlap the picked box too much.
        remaining = remaining[overlaps <= iou_threshold]
    return box_scores[kept, :]


def area_of(left_top, right_bottom):
    """Area of axis-aligned boxes given their two opposite corners.

    Negative extents (corners in the wrong order, e.g. a non-overlapping
    intersection) are clamped to zero, so such boxes have area 0.
    """
    extents = np.clip(right_bottom - left_top, 0.0, None)
    width = extents[..., 0]
    height = extents[..., 1]
    return width * height


def iou_of(boxes0, boxes1, eps=1e-5):
    """Intersection-over-union of boxes in corner form.

    Args:
        boxes0, boxes1: Arrays whose last axis is ``[left, top, right, bottom]``;
            they are combined with NumPy broadcasting.
        eps: Small constant added to the denominator to avoid division by zero.
    """
    left_top0, right_bottom0 = boxes0[..., :2], boxes0[..., 2:]
    left_top1, right_bottom1 = boxes1[..., :2], boxes1[..., 2:]

    intersection = area_of(
        np.maximum(left_top0, left_top1),
        np.minimum(right_bottom0, right_bottom1),
    )
    union = area_of(left_top0, right_bottom0) + area_of(left_top1, right_bottom1) - intersection
    return intersection / (union + eps)


def predict(width, height, confidences, boxes, prob_threshold, iou_threshold=0.3, top_k=-1):
    """Filter raw detector output into final boxes in pixel coordinates.

    Args:
        width, height: Size of the image the boxes are scaled to.
        confidences: Batched class scores; only the first batch element is used.
        boxes: Batched boxes; only the first batch element is used.
        prob_threshold: Scores must be strictly greater than this to be kept.
        iou_threshold, top_k: Forwarded to ``hard_nms``.

    Returns:
        ``(boxes, labels, probs)``: int32 boxes scaled to pixels, the class
        index of each box and its score. Three empty arrays when nothing
        passes the threshold.
    """
    boxes, confidences = boxes[0], confidences[0]
    kept_per_class = []
    kept_labels = []
    # Class 0 is skipped (the loop starts at index 1).
    for class_index in range(1, confidences.shape[1]):
        class_probs = confidences[:, class_index]
        confident = class_probs > prob_threshold
        class_probs = class_probs[confident]
        if class_probs.shape[0] == 0:
            continue
        candidates = np.concatenate([boxes[confident, :], class_probs.reshape(-1, 1)], axis=1)
        survivors = hard_nms(candidates,
                             iou_threshold=iou_threshold,
                             top_k=top_k,
                             )
        kept_per_class.append(survivors)
        kept_labels.extend([class_index] * survivors.shape[0])
    if not kept_per_class:
        return np.array([]), np.array([]), np.array([])
    merged = np.concatenate(kept_per_class)
    # Columns 0/2 are scaled by the width, columns 1/3 by the height.
    for column, extent in enumerate((width, height, width, height)):
        merged[:, column] *= extent
    return merged[:, :4].astype(np.int32), np.array(kept_labels), merged[:, 4]


def convert_locations_to_boxes(locations, priors, center_variance,
                               size_variance):
    """Decode regression output into centre-form boxes using the priors.

    Args:
        locations: Array whose last axis is ``[dx, dy, dw, dh]``.
        priors: Array whose last axis is ``[cx, cy, w, h]``; a leading axis
            is added when it has exactly one dimension fewer than ``locations``.
        center_variance: Multiplier applied to the centre offsets.
        size_variance: Multiplier applied to the size offsets before ``exp``.

    Returns:
        Array shaped like ``locations`` with last axis ``[cx, cy, w, h]``.
    """
    last_axis = len(locations.shape) - 1
    if len(priors.shape) == last_axis:
        priors = np.expand_dims(priors, 0)
    prior_centers = priors[..., :2]
    prior_sizes = priors[..., 2:]
    centers = locations[..., :2] * center_variance * prior_sizes + prior_centers
    sizes = np.exp(locations[..., 2:] * size_variance) * prior_sizes
    return np.concatenate([centers, sizes], axis=last_axis)


def center_form_to_corner_form(locations):
    """Convert ``[cx, cy, w, h]`` boxes to ``[left, top, right, bottom]`` along the last axis."""
    centers = locations[..., :2]
    half_sizes = locations[..., 2:] / 2
    return np.concatenate([centers - half_sizes, centers + half_sizes],
                          len(locations.shape) - 1)

def get_bbox(rawimg):
    """Run the face detector on a BGR frame and return the detected boxes.

    Relies on module-level globals: ``net``, ``input_width``, ``input_height``,
    ``input_std``, ``threshold``, ``center_variance`` and ``size_variance``.

    Args:
        rawimg: Frame as an array; ``shape[0]``/``shape[1]`` are used as its
            height/width when scaling the boxes back to pixels.

    Returns:
        The boxes returned by ``predict`` (labels and scores are discarded).
    """
    network_size = (input_width, input_height)
    priors = define_img_size([input_width, input_height])

    # Resize to the network input, convert BGR -> RGB, then build the blob
    # with scale factor 1/input_std and 127 as the mean argument.
    rgb_input = cv2.cvtColor(cv2.resize(rawimg, network_size), cv2.COLOR_BGR2RGB)
    net.setInput(cv2.dnn.blobFromImage(rgb_input, 1/input_std, network_size, 127))

    raw_boxes, raw_scores = net.forward(["boxes", "scores"])
    # Flatten to (1, N, 4) and (1, N, 2) as expected by the decoding helpers.
    raw_boxes = np.expand_dims(np.reshape(raw_boxes, (-1, 4)), axis=0)
    raw_scores = np.expand_dims(np.reshape(raw_scores, (-1, 2)), axis=0)

    decoded = convert_locations_to_boxes(raw_boxes, priors, center_variance, size_variance)
    corner_boxes = center_form_to_corner_form(decoded)

    frame_height, frame_width = rawimg.shape[0], rawimg.shape[1]
    picked_boxes, _labels, _probs = predict(frame_width, frame_height, raw_scores, corner_boxes, threshold)
    return picked_boxes

def get_best_bbox(boxes, rawimg_shape):
    """Return the box whose centre lies closest to the centre of the image.

    Args:
        boxes: Non-empty sequence/array of ``[x1, y1, x2, y2]`` boxes in pixels.
        rawimg_shape: Image shape as ``(height, width, ...)`` (NumPy order).

    Returns:
        The element of ``boxes`` nearest to the image centre.

    Raises:
        ValueError: If ``boxes`` is empty.
    """
    corners = np.asarray(boxes, dtype=float)
    if corners.size == 0:
        raise ValueError("get_best_bbox requires at least one box")

    # Box centres are (x, y) while a NumPy shape is (rows, cols) = (height, width),
    # so the image centre must be built as (width / 2, height / 2).
    img_center = np.array([rawimg_shape[1], rawimg_shape[0]]) / 2
    box_centers = (corners[:, :2] + corners[:, 2:4]) / 2
    center_diff = np.linalg.norm(box_centers - img_center, axis=1)
    best_box_idx = int(np.argmin(center_diff))
    # Index the caller's container so the element type is what the caller passed in.
    return boxes[best_box_idx]

def get_euler(anno_txt):
    """Read a 3x3 rotation matrix from a text file and convert it to Euler angles.

    The first three lines of the file each hold three whitespace-separated
    numbers. The matrix built from those rows is transposed before the
    angles are extracted.

    Args:
        anno_txt: Path of the annotation text file.

    Returns:
        ``np.ndarray`` ``[pitch, yaw, roll]`` in degrees (float64).
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(anno_txt) as file:
        lines = file.readlines()

    # Builtin `float` replaces the `np.float` alias, which newer NumPy releases no longer provide.
    rows = [np.array(lines[i].split(), dtype=float) for i in range(3)]
    R = np.transpose(rows)

    pitch = math.atan2(R[2,1] , R[2,2])* 180 / np.pi
    yaw = -math.atan2(-R[2,0], math.sqrt(R[2,1]**2 + R[2,2]**2))* 180 / np.pi
    roll = -math.atan2(R[1,0], R[0,0])* 180 / np.pi
    return np.array([pitch, yaw, roll], dtype=float)

In [6]:
# Draw Euler Axis

def draw_axis(img, yaw, pitch, roll, tdx=None, tdy=None, size = 100):
    """Draw the three head-pose axes onto ``img`` and return it.

    Args:
        img: Image to draw on; it is passed directly to ``cv2.line``.
        yaw, pitch, roll: Angles in degrees.
        tdx, tdy: Origin of the axes in pixels. When either one is ``None``
            the image centre is used.
        size: Length of each axis in pixels.

    Returns:
        The same ``img`` object that was passed in.
    """
    # Degrees -> radians; yaw is additionally negated.
    pitch = pitch * np.pi / 180
    yaw = -(yaw * np.pi / 180)
    roll = roll * np.pi / 180

    # Identity checks against None (not `!=`), so a legitimate coordinate of 0
    # or a NumPy scalar is never mistaken for "not provided".
    if tdx is None or tdy is None:
        height, width = img.shape[:2]
        tdx = width / 2
        tdy = height / 2

    # X-Axis pointing to right. drawn in red
    x1 = size * (cos(yaw) * cos(roll)) + tdx
    y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy

    # Y-Axis | drawn in green
    #        v
    x2 = size * (-cos(yaw) * sin(roll)) + tdx
    y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy

    # Z-Axis (out of the screen) drawn in blue
    x3 = size * (sin(yaw)) + tdx
    y3 = size * (-cos(yaw) * sin(pitch)) + tdy

    origin = (int(tdx), int(tdy))
    cv2.line(img, origin, (int(x1),int(y1)),(0,0,255),3)
    cv2.line(img, origin, (int(x2),int(y2)),(0,255,0),3)
    cv2.line(img, origin, (int(x3),int(y3)),(255,0,0),2)

    return img

## Real Time Video Inference

In [7]:
# Face Detection
def detect_face(img):
    """Detect faces in ``img`` and pick the one nearest the image centre.

    Returns:
        ``(True, bbox)`` when at least one face is found, otherwise ``(False, [])``.
    """
    boxes = get_bbox(img)
    if len(boxes) == 0:
        return False, []
    return True, get_best_bbox(boxes, img.shape)

# Crop Face Image by scale
def crop_face(target_bbox, img, scale, row_adj_ratio=1.8, clm_adj_ratio=1):
    """Enlarge a face box by ``scale`` and return the new ``[x1, y1, x2, y2]``.

    The extra margin per axis is split unevenly: ``*_adj_ratio`` parts go to
    the left/top edge and ``2 - *_adj_ratio`` parts to the right/bottom edge.
    Left and top are clamped at 0; right and bottom are not clamped.

    Args:
        target_bbox: ``[x1, y1, x2, y2]`` of the detected face.
        img: Not used by this function.
        scale: Enlargement factor (1 leaves the box unchanged).
        row_adj_ratio: Share of the vertical margin added above the box.
        clm_adj_ratio: Share of the horizontal margin added left of the box.
    """
    left, top, right, bottom = target_bbox[0], target_bbox[1], target_bbox[2], target_bbox[3]
    # Half of the total growth along each axis (floor division, as margins are per side).
    clm_adj = ((right - left) * (scale - 1)) // 2
    row_adj = ((bottom - top) * (scale - 1)) // 2

    new_left = int(max(left - clm_adj_ratio*clm_adj, 0))
    new_top = int(max(top - row_adj_ratio*row_adj, 0))
    new_right = int(right + (2-clm_adj_ratio)*clm_adj)
    new_bottom = int(bottom + (2-row_adj_ratio)*row_adj)
    return [new_left, new_top, new_right, new_bottom]

# Predict Euler
def predict_eur(hpn_model, img, scaled_bbox, nbins=66, angle_range=99):
    """Estimate pitch and roll for the face inside ``scaled_bbox``.

    The model output is read as per-bin scores under the keys ``"Pitch"`` and
    ``"Roll"``; each angle is the softmax-weighted mean bin index, rescaled
    to degrees and shifted by ``-angle_range``.

    Args:
        hpn_model: Callable model returning a mapping with ``"Pitch"`` and ``"Roll"``.
        img: Full frame the face is cropped from.
        scaled_bbox: ``[x1, y1, x2, y2]`` crop rectangle.
        nbins: Number of angle bins.
        angle_range: Half-width of the covered angle interval.

    Returns:
        ``(pitch_angle, roll_angle)``, each with one value per batch element.
    """
    x1, y1, x2, y2 = scaled_bbox[0], scaled_bbox[1], scaled_bbox[2], scaled_bbox[3]
    bin_index = np.array(list(range(nbins)), dtype=np.float32)
    bin_degree = 2 * angle_range / nbins

    # Crop, resize to the 224x224 network input and add the batch axis.
    face_img = img[y1:y2, x1:x2]
    resized = cv2.resize(face_img, (224,224))
    input_img = np.reshape(resized, (-1,224,224,3)).astype(np.float32)
    bin_pred = hpn_model(input_img)

    def expected_angle(head):
        # Expectation over the bins, converted from bin units to an angle.
        return np.sum(softmax(bin_pred[head]) * bin_index, 1) * bin_degree - angle_range

    pitch_angle = expected_angle("Pitch")
    roll_angle = expected_angle("Roll")
    return pitch_angle, roll_angle
    
# Predict Eye / Mouth
eye = 0
mouth = 1

**Mouth and Eyes**

In [8]:
import mediapipe as mp
import cv2
import time
import os
import tensorflow as tf

In [9]:
mp_face_mesh = mp.solutions.face_mesh

In [10]:
import math 
def are_eyes_closed(landmarks, image):
    """Return True when both eyes look closed.

    An eye counts as closed when the distance between its two lid landmarks
    is less than 15% of the distance between its two corner landmarks. Both
    eyes must satisfy this.

    Args:
        landmarks: Object whose ``landmark[i]`` has ``x`` and ``y`` attributes;
            they are multiplied by the image width/height to obtain pixels.
        image: Object with a ``shape`` of ``(height, width, ...)``.
    """
    pixel = {}
    for idx in (386, 362, 374, 263, 159, 33, 145, 133):
        pixel[idx] = (int(landmarks.landmark[idx].x*image.shape[1]),
                      int(landmarks.landmark[idx].y*image.shape[0]))

    def separation(first, second):
        # Euclidean distance between two landmark ids, in pixels.
        (xa, ya), (xb, yb) = pixel[first], pixel[second]
        return math.sqrt((xa - xb)**2 + (ya - yb)**2)

    right_width = separation(362, 263)
    left_width = separation(33, 133)

    # distance between the upper and lower lid points
    right_distance = separation(386, 374)
    left_distance = separation(159, 145)

    ratio=0.15
    right_shut = right_distance < right_width * ratio
    left_shut = left_distance < left_width * ratio
    return right_shut and left_shut

In [11]:
def get_lip_height(landmarks, image):
    """Return the average thickness of the top and bottom lip in pixels.

    Each lip is measured at three landmark pairs and the three distances are
    averaged.

    NOTE(review): the accumulation now happens inside the loops, so all three
    pairs contribute to each average. Callers that tuned a threshold against
    the previous values (last pair only, divided by three) should re-check it.

    Args:
        landmarks: Object whose ``landmark[i]`` has ``x`` and ``y`` attributes;
            they are multiplied by the image width/height to obtain pixels.
        image: Object with a ``shape`` of ``(height, width, ...)``.

    Returns:
        ``(top_lip_height, bottom_lip_height)``.
    """
    coords=[]
    for i in (0, 13, 14, 17, 37, 82, 87, 84, 267, 312, 317, 314):
        x2_loc=int(landmarks.landmark[i].x*image.shape[1])
        y2_loc=int(landmarks.landmark[i].y*image.shape[0])
        coords.append((x2_loc,y2_loc))

    def pair_distance(i):
        # distance between coords[i] and the point stored right after it
        return math.sqrt( (coords[i][0] - coords[1+i][0])**2 +
                          (coords[i][1] - coords[1+i][1])**2   )

    # Pairs starting at positions 0, 4, 8 are summed for the top lip and
    # pairs starting at 2, 6, 10 for the bottom lip.
    sum_top = 0
    for i in [0,4,8]:
        sum_top += pair_distance(i)

    sum_bottom = 0
    for i in [2,6,10]:
        sum_bottom += pair_distance(i)

    return (sum_top / 3,sum_bottom / 3)

In [12]:
def get_mouth_height(landmarks, image):
    """Return the average mouth opening in pixels.

    The opening is measured as the distance between three landmark pairs
    (13/14, 82/87 and 312/317) and averaged.

    Args:
        landmarks: Object whose ``landmark[i]`` has ``x`` and ``y`` attributes;
            they are multiplied by the image width/height to obtain pixels.
        image: Object with a ``shape`` of ``(height, width, ...)``.
    """
    pixel = {}
    for idx in (0, 13, 14, 17, 37, 82, 87, 84, 267, 312, 317, 314):
        pixel[idx] = (int(landmarks.landmark[idx].x*image.shape[1]),
                      int(landmarks.landmark[idx].y*image.shape[0]))

    total = 0
    for upper, lower in ((13, 14), (82, 87), (312, 317)):
        # distance between two near points up and down
        dx = pixel[upper][0] - pixel[lower][0]
        dy = pixel[upper][1] - pixel[lower][1]
        total += math.sqrt(dx**2 + dy**2)
    return total / 3

In [17]:
def check_mouth_open(landmarks, image):
    """Return True when the mouth opening exceeds the combined lip thickness.

    The opening from ``get_mouth_height`` is compared against the sum of the
    two values from ``get_lip_height`` multiplied by a fixed ratio of 1.0.
    """
    lip_heights = get_lip_height(landmarks, image)
    opening = get_mouth_height(landmarks, image)

    # if mouth is open more than lip height * ratio, return true.
    ratio = 1.0
    threshold_height = (lip_heights[0] + lip_heights[1]) * ratio
    return bool(opening > threshold_height)

In [14]:
face_mesh = mp_face_mesh.FaceMesh(max_num_faces=1, refine_landmarks=True, 
                                  min_detection_confidence=0.5, min_tracking_confidence=0.5)

def Eyes_and_mouth(frame):
    """Run the face mesh on a BGR frame and report eye and mouth state.

    Uses the module-level ``face_mesh`` object.

    Returns:
        ``(success, closed_eyes, open_mouth)``. ``success`` is False (and the
        other two are False) when no face landmarks were produced; otherwise
        the first face's landmarks are evaluated.
    """
    results = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    faces = results.multi_face_landmarks
    if faces is None:
        return (False, False, False)

    first_face = faces[0]
    closed_eyes = are_eyes_closed(first_face, frame)
    open_mouth = check_mouth_open(first_face, frame)
    return (True, closed_eyes, open_mouth)

**Play the Game**

In [18]:
# Initialize Webcam
video_capture = cv2.VideoCapture(0)

# Initialize Time Recording
previous = time.time()

# Parameters
# input_width / input_height / input_std / threshold are read as globals by get_bbox().
input_width = 320
input_height = 240
input_std = 128
threshold = 0.5
frame_count = 0
frame_skip = 1
skip = False

# Initial Position
calibration_frame_size = 5
pitch_calibration = np.zeros(calibration_frame_size)
roll_calibration = np.zeros(calibration_frame_size)
init_pitch = 0
init_roll = 10

# Difference Threshold
pitch_up_thres = 15
pitch_down_thres = 15
roll_left_thres = 17
roll_right_thres = 17

# Status Indicator
stdby_mode = True
play_mode = False
pend_command = True
mode_exec_time = 0
mode_exec_thres = 2
mouth_exec_time = 0
mouth_exec_thres = 1
command_exec_time = 0
command_exec_thres = 0.5
prev_command = None

# Eye / Mouth State
# The loop only assigns these once a face has been found, but it reads
# `success_EM` on every frame. Seeding them here prevents a NameError when the
# first captured frame contains no face. `success_EM` starts as True because no
# landmark failure has happened yet, so nothing should be reported.
success_EM = True
closed_eyes = False
open_mouth = False

# Initialize Browser
# webbrowser.open_new('https://www.google.com/logos/2010/pacman10-i.html')
webbrowser.open_new('https://www.freetetris.org/game.php')
time.sleep(1)
# The window that is active one second after opening the game is remembered
# so the loop can hand focus back to it later.
window_browser = gw.getActiveWindow()
pyautogui.hotkey('winleft', 'right')
window_active = False

# Initialize Controls
first_launch = True

# Main loop: one webcam frame per iteration. Each frame is annotated, the head
# pose and eye/mouth state are evaluated, and the resulting gesture is turned
# into key presses through pyautogui. The loop ends when 'q' is pressed in the
# OpenCV window or when a frame cannot be read.
while True:
    # Exit Condition
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    
    # Start reading Image from Webcam
    ret, frame = video_capture.read()
    
    # Exit Condition if no image capture
    if not ret:
        break
    
    # Preprocessing
    # Flip code 1 mirrors the frame horizontally.
    frame = cv2.flip(frame,1)
    
    
    # Update Current Mode
    if stdby_mode and not play_mode:
        cv2.putText(frame, 'Standby Mode', (545, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
    elif not stdby_mode and play_mode:
        cv2.putText(frame, 'Play Mode', (569, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
    
    # Update Status of Pending Command or under Latency
    if pend_command:
        cv2.putText(frame, 'Pending Command', (515, 45), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
    elif not pend_command:
        cv2.putText(frame, 'Breathing Time', (539, 45), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
    
    # Skip Frame
    # Only every `frame_skip`-th frame is re-evaluated; on skipped frames the
    # values computed on an earlier frame (face_bbox, pitch_raw, roll_raw, ...) are reused.
    if frame_count % frame_skip == 0:
        skip = False
    else:
        skip = True
        
    # Detect Face
    face_ind, bbox = detect_face(frame)
    
    # Infer Head Pose and Eye/Mouth
    if face_ind:
        # Scale and Crop Face Image
        if not skip:
            face_bbox = crop_face(bbox, frame, 1.3)
        cv2.putText(frame, 'Face Detected', (543, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
        cv2.rectangle(frame, (face_bbox[0], face_bbox[1]), (face_bbox[2], face_bbox[3]), (0, 0, 255), 2)

        # Predict Euler
        if not skip:
            pitch_raw, roll_raw = predict_eur(hpn, frame, face_bbox)
            success_EM, closed_eyes, open_mouth=Eyes_and_mouth(frame)
        pitch = int(pitch_raw[0])
        roll  = int(roll_raw[0])
        cv2.putText(frame, 'Pitch : {0:d}'.format(pitch), (10, 45), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,102,255), 1, cv2.LINE_AA)
        cv2.putText(frame, 'Roll  : {0:d}'.format(roll), (10, 65), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,102,255), 1, cv2.LINE_AA)
        
        # Print Neutral Position for Pitch and Roll
        cv2.putText(frame, 'Neutral Pitch : {0:d}'.format(init_pitch), (10, 85), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,102,255), 1, cv2.LINE_AA)
        cv2.putText(frame, 'Neutral Roll  : {0:d}'.format(init_roll), (10, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,102,255), 1, cv2.LINE_AA)
        
    else:
        cv2.putText(frame, 'No Face Detected', (520, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
    
    # NOTE(review): success_EM is only assigned inside the face branch above, so
    # when no face is found it must already hold a value from an earlier frame
    # (or from an initialisation before the loop); otherwise this line fails.
    if not success_EM:
        cv2.putText(frame, 'Mouth and Eyes detection failed', (520, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
            
    # cv2.putText(frame, 'Action', (595, 455), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (51,255,0), 1, cv2.LINE_AA)
    # cv2.putText(frame, 'Up', (616, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (51,255,0), 1, cv2.LINE_AA)
    # cv2.putText(frame, 'Down', (600, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (51,255,0), 1, cv2.LINE_AA)
    # cv2.putText(frame, 'Left', (608, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (51,255,0), 1, cv2.LINE_AA)
    # cv2.putText(frame, 'Right', (601, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (51,255,0), 1, cv2.LINE_AA)
    # cv2.putText(frame, 'Eye Close', (571, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (51,255,0), 1, cv2.LINE_AA)
    # cv2.putText(frame, 'Eye Open', (574, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (51,255,0), 1, cv2.LINE_AA)
    # cv2.putText(frame, 'Mouth Close', (556, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (51,255,0), 1, cv2.LINE_AA)
    # cv2.putText(frame, 'Mouth Open', (558, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (51,255,0), 1, cv2.LINE_AA)
    
    # Execute Action
    # Only in play mode and while a command is pending. Roll is checked before
    # pitch, and pitch before the mouth, so only one gesture fires per frame.
    # A head gesture clears pend_command, which starts the breathing period below.
    if face_ind and not stdby_mode and play_mode and pend_command:
        if (roll-init_roll) > roll_right_thres:
            command_exec_time = time.time()
            prev_command = 'Right'
            pend_command = False
            pyautogui.press('right')
        elif (roll-init_roll) < -roll_left_thres:
            command_exec_time = time.time()
            prev_command = 'Left'
            pend_command = False
            pyautogui.press('left')
        elif (pitch-init_pitch) > pitch_up_thres:
            command_exec_time = time.time()
            prev_command = 'Up'
            pend_command = False
            pyautogui.press('up')
        elif (pitch-init_pitch) < -pitch_down_thres:
            command_exec_time = time.time()
            prev_command = 'Down'
            pend_command = False
            pyautogui.press('down')
        elif open_mouth:
            # The mouth has to stay open for mouth_exec_thres seconds before
            # 'space' is pressed; mouth_exec_time == 0 means "timer not running".
            cv2.putText(frame, 'Mouth Open...', (200, 455), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
            if mouth_exec_time == 0 and open_mouth:
                mouth_exec_time = time.time()
                cv2.putText(frame, 'Hold Time  : {0:0.2f}'.format(round(time.time()-mouth_exec_time,2)), (200, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
            elif mouth_exec_time > 0 and open_mouth and (time.time()-mouth_exec_time) >= mouth_exec_thres:
                cv2.putText(frame, 'Hold Time  : {0:0.2f}'.format(round(time.time()-mouth_exec_time,2)), (200, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
                mouth_exec_time = 0
                pyautogui.press('space')
            elif mouth_exec_time > 0 and open_mouth and (time.time()-mouth_exec_time) < mouth_exec_thres:
                cv2.putText(frame, 'Hold Time  : {0:0.2f}'.format(round(time.time()-mouth_exec_time,2)), (200, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
        elif mouth_exec_time > 0 and not open_mouth:
            # Mouth closed before the hold time elapsed: cancel the timer.
            mouth_exec_time=0
    
        # Enter Play Mode, Standby Mode -> Play Mode
    # Eyes must stay closed for mode_exec_thres seconds; mode_exec_time == 0
    # means "timer not running".
    if face_ind and stdby_mode and not play_mode and pend_command:
        # Trigger Play Mode
        if closed_eyes:
            cv2.putText(frame, 'Turning On...', (10, 455), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
        if mode_exec_time == 0 and closed_eyes:
            mode_exec_time = time.time()
            cv2.putText(frame, 'Hold Time  : {0:0.2f}'.format(round(time.time()-mode_exec_time,2)), (10, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
        elif mode_exec_time > 0 and closed_eyes and (time.time()-mode_exec_time) >= mode_exec_thres:
            cv2.putText(frame, 'Hold Time  : {0:0.2f}'.format(round(time.time()-mode_exec_time,2)), (10, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
            stdby_mode=False
            play_mode=True
            mode_exec_time = 0
            winsound.Beep(440, 500)
            # The first activation sends 'space'; later activations send 'escape'.
            if first_launch:
                pyautogui.press('space')
                first_launch = False
            else:
                pyautogui.press('escape')
        elif mode_exec_time > 0 and closed_eyes and (time.time()-mode_exec_time) < mode_exec_thres:
            cv2.putText(frame, 'Hold Time  : {0:0.2f}'.format(round(time.time()-mode_exec_time,2)), (10, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
        elif mode_exec_time > 0 and not closed_eyes:
            mode_exec_time=0
        # elif mode_exec_time == 0 and (pitch-init_pitch) > -pitch_thres and frame_count % 10 == 0:
        #     pitch_calibration = np.append(pitch_calibration[1:], pitch)
        #     roll_calibration = np.append(roll_calibration[1:], roll)
        #     init_pitch = int(np.mean(pitch_calibration))
        #     init_roll = int(np.mean(roll_calibration))
    
    # Exit Play Mode, Play Mode -> Standby Mode
    # NOTE(review): this block is evaluated in the same iteration as the block
    # above, so on the frame that switches to play mode its condition is already
    # true and the "turning off" timer starts while the eyes are still closed.
    if face_ind and not stdby_mode and play_mode and pend_command:
        if closed_eyes:
            cv2.putText(frame, 'Turning Off...', (10, 455), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
        if mode_exec_time == 0 and closed_eyes:
            mode_exec_time = time.time()
            cv2.putText(frame, 'Hold Time  : {0:0.2f}'.format(round(time.time()-mode_exec_time,2)), (10, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
        elif mode_exec_time > 0 and closed_eyes and (time.time()-mode_exec_time) >= mode_exec_thres:
            cv2.putText(frame, 'Hold Time  : {0:0.2f}'.format(round(time.time()-mode_exec_time,2)), (10, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
            stdby_mode=True
            play_mode=False
            mode_exec_time = 0
            winsound.Beep(440, 500)
            pyautogui.press('escape')
        elif mode_exec_time > 0 and closed_eyes and (time.time()-mode_exec_time) < mode_exec_thres:
            cv2.putText(frame, 'Hold Time  : {0:0.2f}'.format(round(time.time()-mode_exec_time,2)), (10, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
        elif mode_exec_time > 0 and not closed_eyes:
            mode_exec_time=0
    
    # Breathing Period
    # After a head gesture no new command is accepted until command_exec_thres
    # seconds have passed; meanwhile the last command is shown on screen.
    if face_ind and not stdby_mode and play_mode and not pend_command:
        if (time.time()-command_exec_time) >= command_exec_thres:
            command_exec_time = 0
            prev_command = None
            pend_command = True
        else:
            cv2.putText(frame, 'Action', (595, 455), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
            cv2.putText(frame, f'{prev_command}', (608, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,255), 1, cv2.LINE_AA)
        
    
    # Refresh FPS
    # `f` is recomputed on processed frames only and reused on skipped frames.
    if not skip:
        new = time.time()
        f = int(frame_skip/(new - previous))
        previous = new 
    cv2.putText(frame, 'FPS  : {0:d}'.format(f), (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,0,0), 1, cv2.LINE_AA)
    
    # Showing Image Result
    winname = "Test"
    cv2.namedWindow(winname)       
    # cv2.moveWindow(winname, 2000,300)
    cv2.imshow(winname, frame)
    
    # Switching to application window
    # Done once, after the preview window exists - presumably so the key
    # presses sent by pyautogui reach the game rather than the preview window.
    if not window_active:
        window_active=True
        window_browser.activate()
    
    frame_count += 1
    
# Release the webcam and close the preview window once the loop has ended.
video_capture.release()
cv2.destroyAllWindows()