# Final Project
## Author: Yu Mi, yxm319; Boning Zhao, bxz213
Recognizing human actions is one of the most popular computer vision tasks, with multiple applications in fields such as video surveillance, customer attribute analysis, and shopping behavior analysis.

In our final project, we consider the automated recognition of human actions in videos. We propose to build a 3D CNN model for action recognition. In order to capture motion information from multiple adjacent frames, we propose to extract features from both the spatial and temporal dimensions. Based on this feature extractor, a 3D convolutional neural network will be built. This CNN generates multiple channels of information and performs convolution and subsampling separately. The final feature representation is obtained by combining information from all channels.

In [1]:
# Import standard and supportive libraries
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import numpy as np
import cv2
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation
from sklearn import preprocessing

  from ._conv import register_converters as _register_converters


## Neural network framework
In this project, we use [Keras](https://keras.io) as our neural network framework, since it was already introduced in Homework 3. It is capable of running on top of TensorFlow, CNTK or Theano. It was developed with a focus on enabling fast experimentation: being able to go from idea to result with the least possible delay is key to doing good research.

In [2]:
# Import models and layers
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers.core import Dense, Flatten, Activation, Flatten, Dropout
from keras.layers.convolutional import Conv3D, MaxPooling3D

# Import utilities
from keras.optimizers import SGD,RMSprop
from keras.utils.vis_utils import plot_model
from keras.utils import np_utils, generic_utils

Using TensorFlow backend.


## KTH dataset
The [KTH dataset](http://www.nada.kth.se/cvap/actions/) is a database provided by the KTH Royal Institute of Technology. The current video database contains six types of human actions: walking, jogging, running, boxing, hand waving and hand clapping. All the actions are performed several times by 25 different individuals in four scenarios: outdoors $s1$, outdoors with scale variation $s2$, outdoors with different clothes $s3$ and indoors $s4$, as illustrated below.
![KTH scenarios and actions](figure/KTH_Intro.gif)
Currently the dataset contains $600$ sequences, all taken over homogeneous backgrounds with a static camera at a frame rate of $25$ fps. The sequences were downsampled to a spatial resolution of $160\times120$ pixels and have a length of four seconds on average.

In [None]:
# Image attributes: every frame is resized with cv2.resize(frame, (img_r, img_c))
# and the first img_d frames of each video form one sample.
img_r, img_c, img_d = 34, 54, 9
#img_r, img_c, img_d = 15, 15, 16

# Root folder of the KTH videos; one sub-folder per action class.
kth_root = 'data/kth_database'

# (sub-folder, message printed once that class is loaded).
# The order matters: labels are assigned later purely by position in
# Training_set, so the classes must be loaded in exactly this sequence.
action_classes = [
    ('boxing', "Boxing class has been loaded"),
    ('handclapping', "Hand clapping class has been loaded"),
    ('handwaving', "Hand waving class has been loaded"),
    ('jogging', "Jogging class has been loaded"),
    ('running', "running class has been loaded"),
    ('walking', "walking class has been loaded"),
]


def _load_clip(video_path, width, height, depth):
    """Read the first `depth` frames of a video as one grayscale clip.

    Each frame is resized to `width` x `height` pixels and converted from BGR
    to grayscale. The frames are stacked and the axes reversed, so the result
    has shape (width, height, depth).

    Raises:
        IOError: if fewer than `depth` frames can be read from `video_path`
            (missing/corrupt file or a video that is too short).
    """
    capture = cv2.VideoCapture(video_path)
    frames = []
    try:
        for frame_idx in range(depth):
            success, frame = capture.read()
            if not success:
                # Without this check cv2.resize would fail on `None` with an
                # error that does not mention which file is broken.
                raise IOError(
                    "Could not read frame {0} of {1}".format(frame_idx, video_path)
                )
            # cv2.resize takes dsize as (width, height); the returned image
            # is indexed [row, column], i.e. (height, width).
            frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
    finally:
        # Always free the decoder, even when a frame could not be read.
        capture.release()
    # (depth, height, width) -> (width, height, depth); same result as
    # np.rollaxis(np.rollaxis(a, 2, 0), 2, 1).
    return np.asarray(frames).transpose(2, 1, 0)


# Entire dataset: one (img_r, img_c, img_d) clip per video, class after class.
Training_set = []
for folder, done_message in action_classes:
    class_dir = os.path.join(kth_root, folder)
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and therefore the later train/validation split) reproducible.
    for file_name in sorted(os.listdir(class_dir)):
        Training_set.append(
            _load_clip(os.path.join(class_dir, file_name), img_r, img_c, img_d)
        )
    print(done_message)

# Last loaded clip, kept under its old name for the shape-inspection cells.
ipt = Training_set[-1]

# Stack all clips into a single array of shape (samples, img_r, img_c, img_d).
Training_data = np.array(Training_set)
sample_num = len(Training_data)

# Assign integer class labels by position. The clips were appended class by
# class, so each slice below covers one action.
# NOTE(review): the boundaries are hard-coded for 599 clips (the hand clapping
# slice holds 99 entries, every other class 100) — if files are added or
# removed the labels silently shift; confirm against the data folder.
label = np.ones((sample_num,), dtype=int)
label[0:100] = 0    # boxing
label[100:199] = 1  # hand clapping
label[199:299] = 2  # hand waving
label[299:399] = 3  # jogging
label[399:499] = 4  # running
label[499:] = 5     # walking
print(Training_data.shape)
print(label.shape)
train = [Training_data, label]

# Add the trailing single-channel axis expected by Conv3D. One vectorised
# conversion replaces the element-by-element copy into a zero-filled float64
# array; the reshape also fails loudly if the clips do not have the expected
# (img_r, img_c, img_d) geometry.
train_set = Training_data.astype('float64').reshape(
    sample_num, img_r, img_c, img_d, 1
)
                #print(i)
# ---- Training hyper-parameters for the CNN ----
classes, epoch, batch_size = 6, 50, 2
# Labelled "number of frames" by the authors; not read again in the visible code.
patch_size = 15

# Unpack the clips and their integer labels, then one-hot encode the labels.
X_train, y_train = train
Y_train = np_utils.to_categorical(y_train, classes)

# Number of convolutional filters for the 1st and 2nd layer.
filt = [32, 32]
# Pooling size per layer.
pool = [3, 3]
# Convolution kernel size per layer.
conv = [5, 5]

# ---- Input normalisation ----
# Cast to float32, centre on the global mean, then scale by the global maximum
# of the centred data.
train_set = train_set.astype('float32')
train_set = train_set - train_set.mean()
train_set = train_set / train_set.max()

# Building the CNN model:
# Conv3D -> MaxPooling3D -> Dropout -> Flatten -> Dense(128) -> Dropout -> softmax.

model = Sequential()
# Earlier multi-stage draft of the network. It sits inside a bare string
# literal, so none of it is executed.
'''
model.add(Conv3D(
        filters=7,
        kernel_size = (28,48,5),
        strides=(1, 1, 1),
        activation='relu',
        input_shape = (34,54,9,7)
        ))


model.add(MaxPooling3D(pool_size=(14, 24, 5)))
print('Test')
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Conv3D(
        filters=35,
        kernel_size = (10,20,3),
        strides=(1, 1, 1),
        activation='relu',
        ))

model.add(MaxPooling3D(pool_size=(5, 10, 3)))

model.add(Dropout(0.5))
model.add(Flatten())
model.add(Conv3D(
        filters=5,
        kernel_size = (3,8,1),
        strides=(1, 1, 1),
        activation='relu',
        ))
model.add(Dense(30,init='normal'))
model.add(Dense(6,init='normal'))
'''
model.add(Conv3D(
        filters=filt[0],
        # Cubic kernel taken from the `conv` setting (5) instead of a literal.
        kernel_size=(conv[0], conv[0], conv[0]),
        input_shape=(img_r, img_c, img_d, 1),
        activation='relu'
    ))
print('Test')
model.add(MaxPooling3D(pool_size=(pool[0], pool[0], pool[0])))
model.add(Dropout(0.5))
model.add(Flatten())
# `kernel_initializer` is the Keras 2 name of the deprecated `init` argument.
model.add(Dense(128, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(classes, kernel_initializer='normal'))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['mse', 'accuracy'])


print(Y_train.shape)
# Split the data: 80% for training, 20% held out; random_state fixes the shuffle.
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    train_set, Y_train, test_size=0.2, random_state=4
)
print(X_train_new.shape)
print(y_train_new.shape)

# Training (`epochs` is the Keras 2 name of the deprecated `nb_epoch`).
hist = model.fit(
    X_train_new,
    y_train_new,
    validation_data=(X_val_new, y_val_new),
    batch_size=batch_size,
    epochs=epoch,
    shuffle=True
    )

# Testing.
# NOTE: this evaluates on the same held-out split that was passed to fit() as
# validation data. `score` is [loss, mse, accuracy], matching the compiled
# loss and metrics.
score = model.evaluate(
    X_val_new,
    y_val_new,
    batch_size=batch_size,
    )

print('Test score:', score)

print('History', hist.history)


Boxing class has been loaded
Hand clapping class has been loaded
Hand waving class has been loaded
Jogging class has been loaded
running class has been loaded
walking class has been loaded
(599, 34, 54, 9)
(599,)
Test
(599, 6)
(479, 34, 54, 9, 1)
(479, 6)




Train on 479 samples, validate on 120 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50

In [96]:
# Shape of the stacked clip array stored as the first element of `train`.
train[0].shape

(599, 34, 54, 9)

In [99]:
# Shape of the network input: the clips with a trailing single-channel axis.
train_set.shape

(599, 34, 54, 9, 1)

In [91]:
# Shape after moving axis 2 of `ipt` to the front (first of the two axis rolls
# applied to every clip while loading).
np.rollaxis(ipt,2,0).shape

(34, 9, 54)

In [92]:
# Shape after both axis rolls, i.e. the layout of the clips appended to
# `Training_set`.
np.rollaxis(np.rollaxis(ipt,2,0),2,1).shape

(34, 54, 9)

In [62]:
# Display the first sample of `train_set`.
train_set[0]

array([[[[-0.03268567, -0.03268567, -0.04198914, ..., -0.04198914,
          -0.04198914, -0.04198914],
         [-0.04198914, -0.03268567, -0.05129261, ..., -0.03268567,
          -0.03268567, -0.03268567],
         [-0.03268567, -0.03268567, -0.05129261, ..., -0.04198914,
          -0.04198914, -0.04198914],
         ...,
         [-0.02338219, -0.04198914, -0.04198914, ..., -0.01407872,
          -0.02338219, -0.02338219],
         [-0.00477524, -0.02338219, -0.02338219, ..., -0.01407872,
          -0.01407872, -0.01407872],
         [ 0.00452823, -0.00477524, -0.01407872, ..., -0.00477524,
          -0.00477524, -0.00477524]],

        [[-0.04198914, -0.04198914, -0.04198914, ..., -0.04198914,
          -0.04198914, -0.04198914],
         [-0.05129261, -0.04198914, -0.04198914, ..., -0.04198914,
          -0.04198914, -0.05129261],
         [-0.05129261, -0.05129261, -0.05129261, ..., -0.05129261,
          -0.05129261, -0.05129261],
         ...,
         [-0.07920304, -0.06989957

In [79]:
# Display the training inputs returned by train_test_split.
X_train_new

array([[[[[-0.14432736],
          [-0.14432736],
          [-0.14432736],
          ...,
          [-0.15363084],
          [-0.15363084],
          [-0.15363084]],

         [[-0.14432736],
          [-0.14432736],
          [-0.14432736],
          ...,
          [-0.14432736],
          [-0.14432736],
          [-0.14432736]],

         [[-0.14432736],
          [-0.14432736],
          [-0.14432736],
          ...,
          [-0.14432736],
          [-0.15363084],
          [-0.15363084]],

         ...,

         [[-0.15363084],
          [-0.15363084],
          [-0.15363084],
          ...,
          [-0.15363084],
          [-0.15363084],
          [-0.15363084]],

         [[-0.1629343 ],
          [-0.1629343 ],
          [-0.1629343 ],
          ...,
          [-0.1629343 ],
          [-0.1629343 ],
          [-0.1629343 ]],

         [[-0.1629343 ],
          [-0.1629343 ],
          [-0.1629343 ],
          ...,
          [-0.17223778],
          [-0.17223778],
          

In [None]:
# `train` is a plain Python list [Training_data, label] and has no `.shape`
# attribute, so show the shape of each element instead.
[part.shape for part in train]

In [75]:
# Shape of the held-out validation inputs.
# NOTE(review): the recorded output below shows 15x15x16 clips, which matches
# the commented-out image size rather than 34x54x9 — presumably left over from
# an earlier run; confirm by re-executing.
X_val_new.shape

(120, 15, 15, 16, 1)

In [78]:
# Display the one-hot encoded training labels.
y_train_new

array([[0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])