# Final Project
## Author: Yu Mi, yxm319; Boning Zhao, bxz213
Recognizing human actions is one of the most popular computer vision tasks, with multiple applications in fields such as video surveillance, customer attributes, and shopping behavior analysis.

In our final project, we consider the automated recognition of human actions in videos. We propose to build a 3D CNN model for action recognition. In order to capture motion information from multiple adjacent frames, we propose to extract features from both the spatial and temporal dimensions. Based on this feature extractor, a 3D convolutional neural network will be built. This CNN generates multiple channels of information and performs convolution and subsampling separately. The final feature representation is obtained by combining information from all channels.

In [None]:
# Import standard and supportive libraries
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import numpy as np
import cv2
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation
from sklearn import preprocessing

## Neural network framework
In this project, we use [Keras](https://keras.io) as our neural network framework since it was already introduced in Homework 3. It is capable of running on top of TensorFlow, CNTK or Theano. It was developed with a focus on enabling fast experimentation, i.e. being able to go from idea to result with the least possible delay, which makes it well suited to research.

In [None]:
# Import models and layers
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers.core import Dense, Flatten, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D

# Import utilities
from keras.optimizers import SGD,RMSprop
from keras.utils.vis_utils import plot_model
from keras.utils import np_utils, generic_utils

## KTH dataset
[KTH dataset](http://www.nada.kth.se/cvap/actions/) is a database provided by the KTH Royal Institute of Technology. The current video database contains six types of human actions: walking, jogging, running, boxing, hand waving and hand clapping. All the actions are performed several times by 25 different individuals in four scenarios: outdoors $s1$, outdoors with scale variation $s2$, outdoors with different clothes $s3$ and indoors $s4$, as illustrated below.
![KTH scenarios and actions](figure/KTH_Intro.gif)
The dataset currently contains $600$ sequences, all taken over homogeneous backgrounds with a static camera at a frame rate of $25$ fps. The sequences were downsampled to a spatial resolution of $160\times120$ pixels and have a length of four seconds on average.

In [None]:
# image attributes: every frame is resized with dsize=(img_r, img_c) and the
# first img_d frames of each video are kept.
img_r, img_c, img_d = 16, 16, 15

# Root folder holding one sub-folder of videos per action class.
_KTH_ROOT = 'data/kth_database'


def _load_clip(video_path):
    """Read the first ``img_d`` frames of one video as grayscale images.

    Each frame is resized with ``cv2.resize(frame, (img_r, img_c))`` and then
    converted from BGR to grayscale. Note that OpenCV interprets ``dsize`` as
    (width, height).

    Args:
        video_path: Path of the video file to open.

    Returns:
        A numpy array stacking the ``img_d`` processed frames along axis 0.

    Raises:
        IOError: If fewer than ``img_d`` frames can be read (unreadable file,
            non-video file in the folder, or a clip that is too short).
    """
    capture = cv2.VideoCapture(video_path)
    frames = []
    try:
        for frame_idx in range(img_d):
            success, frame = capture.read()
            if not success:
                # Without this check a failed read hands None to cv2.resize,
                # which fails with an error that does not name the file.
                raise IOError(
                    "Could not read frame {0} of {1} from '{2}'".format(
                        frame_idx + 1, img_d, video_path))
            frame = cv2.resize(frame, (img_r, img_c),
                               interpolation=cv2.INTER_AREA)
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
    finally:
        # Release the capture even when a frame could not be read.
        capture.release()
    return np.asarray(frames)


def _load_class(class_name):
    """Load every video found in the folder of one action class.

    Args:
        class_name: Name of the sub-folder of ``_KTH_ROOT`` to load.

    Returns:
        A ``(listing, clips)`` tuple: the file names returned by
        ``os.listdir`` and the corresponding clip arrays, in the same order.
    """
    class_dir = os.path.join(_KTH_ROOT, class_name)
    listing = os.listdir(class_dir)
    clips = [_load_clip(os.path.join(class_dir, name)) for name in listing]
    return listing, clips


#Training set
#Entire dataset
# Clips are appended class by class, in this fixed order: boxing,
# hand clapping, hand waving, jogging, running, walking.
Training_set = []

#Loading boxing class
box_listing, _clips = _load_class('boxing')
Training_set.extend(_clips)
print("Boxing class has been loaded")

#Loading hand clapping class
hc_listing, _clips = _load_class('handclapping')
Training_set.extend(_clips)
print("Hand clapping class has been loaded")

#Loading hand waving class
# Each path is built from this class's own listing entry, so the videos
# opened are the hand-waving ones.
hw_listing, _clips = _load_class('handwaving')
Training_set.extend(_clips)
print("Hand waving class has been loaded")

#Loading jogging class
jog_listing, _clips = _load_class('jogging')
Training_set.extend(_clips)
print("Jogging class has been loaded")

#Loading running class
run_listing, _clips = _load_class('running')
Training_set.extend(_clips)
print("running class has been loaded")

#Loading walking class
walk_listing, _clips = _load_class('walking')
Training_set.extend(_clips)
print("walking class has been loaded")