# WELCOME, PLEASE ONLY RUN THE PREDICTION CELL AS THE MODEL HAS ALREADY BEEN TRAINED WITH MY DATASET.

## Model Cell. A CNN with pooling layers is trained, using the generator structure given in the labs, on a series of 64x64 RGB images for real-time gesture prediction.

In [None]:
import numpy as np
import operator
import cv2
import sys, os

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Convolution2D, MaxPooling2D, Flatten, Dense
from keras.models import Sequential
from keras.models import model_from_json
from keras.preprocessing.image import ImageDataGenerator

# Settings shared by the network definition and the data generators below.
image_height, image_width = 64, 64   # Every input image is resized to 64 x 64 pixels.
dimensions = 3                       # Colour channels per image (R, G, B).
batch_size = 5                       # Images per batch produced by the generators.
gestures = 7                         # Gesture classes, i.e. units in the output layer.

# Gesture classifier: two convolution + max-pooling stages followed by a dense head,
# declared up front as an ordered list of layers.
Model = Sequential([
    # Stage 1 - 32 filters of size 3x3 over the input image, then 2x2 max pooling.
    Convolution2D(32, (3, 3), activation = 'relu',
                  input_shape = (image_height, image_width, dimensions)),
    MaxPooling2D(pool_size = (2, 2)),

    # Stage 2 - another 32 filters of size 3x3, then 2x2 max pooling.
    Convolution2D(32, (3, 3), activation = 'relu'),
    MaxPooling2D(pool_size = (2, 2)),

    # Collapse the feature maps into one vector for the dense layers.
    Flatten(),

    # Hidden fully connected layer, then one softmax output per gesture.
    Dense(units = 128, activation = 'relu'),
    Dense(units = gestures, activation = 'softmax'),
])

# Adam optimizer with its default settings.
optimizer = keras.optimizers.Adam()

# Compile for multi-class classification with one-hot labels, tracking accuracy.
Model.compile(loss = 'categorical_crossentropy', optimizer = optimizer, metrics = ['accuracy'])

# Initialise the generators that stream images from the train/test directories.
# They are created with default arguments, so no augmentation or rescaling is configured.
train_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

# Read the images from the train / test directories, resized to the 64 x 64 model input,
# loaded as RGB and labelled with one-hot (categorical) class vectors.
train_generator = train_datagen.flow_from_directory('images/train', target_size = (image_height, image_width),
                                                 batch_size = batch_size, color_mode = 'rgb', class_mode = 'categorical')

test_generator = test_datagen.flow_from_directory('images/test', target_size = (image_height, image_width),
                                            batch_size = batch_size, color_mode = 'rgb', class_mode = 'categorical')

# len() of a directory iterator is already the number of batches needed to see every
# image once (ceil(samples / batch_size)), so it must not be divided by batch_size again:
# doing so yields a fractional step count and covers only a fraction of the data per epoch.
steps_per_epoch = len(train_generator)
validation_steps = len(test_generator)

# Fit and train the model. Model.fit accepts generators directly; fit_generator is
# deprecated in TensorFlow 2.1+ and has been removed from later Keras releases.
Model.fit(train_generator, steps_per_epoch = steps_per_epoch, epochs = 200,
          validation_data = test_generator, validation_steps = validation_steps)


# Persist the architecture as JSON so the prediction cell can rebuild the network
# without anyone having to retrain it.
model_json = Model.to_json()
with open("trained_model.json", mode = "w") as json_file:
    json_file.write(model_json)

# Persist the learned weights next to the architecture file.
Model.save_weights(filepath = 'optimal_model_weights.h5')


# RUN CELL BELOW:

## Prediction Cell. The gesture is captured from a smaller region of interest within the camera frame, so as to eliminate some noise, then scaled down and fed to the model. The most probable gesture is displayed at the top left of the camera window in real time.

In [None]:
import numpy as np
import operator
import cv2
import sys, os

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Convolution2D, MaxPooling2D, Flatten, Dense
from keras.models import Sequential
from keras.models import model_from_json
from keras.preprocessing.image import ImageDataGenerator

# Variable redefinitions, so that this cell can be run on its own without the model cell.
image_height = 64
image_width = 64
dimensions = 3

# Load the model architecture saved as JSON. The context manager closes the file
# even if reading it raises.
with open("trained_model.json", "r") as json_file:
    model_json = json_file.read()

Model = model_from_json(model_json)

# Load the trained optimal weights into the model.
Model.load_weights("optimal_model_weights.h5")

# Gesture names indexed by the position of the matching model output.
# NOTE(review): this order has to match the class indices assigned during training -
# confirm against the sub-directory names under images/train.
GESTURE_LABELS = ('ZERO', 'ONE', 'TWO', 'THREE', 'FOUR', 'DOWN', 'UP')

# Start the camera.
video = cv2.VideoCapture(0)

try:
    if not video.isOpened():
        raise RuntimeError("Could not open the default camera (device 0).")

    # Main loop:
    while True:
        # Grab the next frame; stop cleanly if the camera stops delivering frames
        # instead of crashing on a missing image further down.
        grabbed, frame = video.read()
        if not grabbed:
            break

        # Flip the image in order to display it normally, not inverted.
        frame = cv2.flip(frame, 1)

        # Coordinates of the ROI - Region of interest that the gesture will be extracted from.
        # y2 is deliberately derived from the frame WIDTH (shape[1]), like x1, which makes
        # the ROI square; the slice below is clipped automatically if it exceeds the frame.
        x1 = int(0.5 * frame.shape[1])
        y1 = 10
        x2 = frame.shape[1] - 10
        y2 = int(0.5 * frame.shape[1])

        # Draw the ROI to the frame, inc/decrement by 1 so the box sits just outside the ROI.
        cv2.rectangle(frame, (x1 - 1, y1 - 1), (x2 + 1, y2 + 1), (255, 0, 0), 1)

        # Extract, resize and then show the ROI separately.
        # cv2.resize expects the target size as (width, height).
        roi = frame[y1:y2, x1:x2]
        roi = cv2.resize(roi, (image_width, image_height))
        cv2.imshow("ROI Frame", roi)

        # OpenCV delivers frames in BGR channel order, while the training generators were
        # configured with color_mode = 'rgb'. Convert only the copy fed to the model so
        # the on-screen preview keeps its natural colours.
        # NOTE(review): assumes the training images on disk hold true colours - confirm.
        model_input = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)

        # Use our convolutional neural network to make a gesture prediction in real time.
        # verbose = 0 keeps Keras from printing a progress line for every frame.
        result = Model.predict(model_input.reshape(1, image_height, image_width, dimensions),
                               verbose = 0)

        # result has one row per input image; pick the label of its highest score.
        gesture = GESTURE_LABELS[int(np.argmax(result[0]))]

        # Display the prediction to the camera window in real time.
        cv2.putText(frame, gesture, (10, 120), cv2.FONT_HERSHEY_PLAIN, 1, (0, 255, 255), 1)
        cv2.imshow("Gesture Recognition Program", frame)

        # End the program on pressing q. Mask to the low byte because some platforms
        # return extra modifier bits from waitKey.
        interrupt = cv2.waitKey(1)
        if interrupt & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the windows, even if an error occurred above.
    video.release()
    cv2.destroyAllWindows()
