Author: Aditya Vora (October 27, 2016)

# Introduction

* This script contains the code to test the fine-tuned C3D model. We fine-tuned the original C3D model, which had 437 classes, into a network that has 2 classes. The script has so far been tested on KTH test videos.

In [None]:
# Import the libraries
import theano 
import theano.tensor as T
import numpy as np
import time
import cv2
from cv2.cv import *
# Constant subtracted from every pixel value before a frame is stored for
# classification.
APPROXIMATE_MEAN = 127.0

# Symbolic 5-D input variable for the network: no axis is broadcastable and
# the element type follows Theano's configured float precision.
# NOTE(review): the axes are presumably (batch, channel, frame, height,
# width), matching the InputLayer shape of the model -- confirm.
dtensor5 = T.TensorType(dtype=theano.config.floatX,
                        broadcastable=(False,) * 5)
input_var = dtensor5(name='inputs')

In [None]:
import lasagne
from lasagne.layers.shape import PadLayer
from lasagne.layers import InputLayer, DenseLayer, NonlinearityLayer
from lasagne.layers.dnn import Conv3DDNNLayer, MaxPool3DDNNLayer
from lasagne.nonlinearities import softmax

In [None]:
# Rebuild the C3D architecture exactly as it was laid out for fine tuning, so
# that the saved parameters can be loaded into it layer by layer. Every layer
# is stored in `net` under its name; the 2-unit softmax layer is kept under
# 'output_layer'.
_RELU = lasagne.nonlinearities.rectify


def _conv3x3x3(incoming, num_filters, **kwargs):
    """Return a 3x3x3 convolution with padding 1 and ReLU on top of `incoming`.

    Extra keyword arguments are forwarded unchanged to Conv3DDNNLayer.
    """
    return Conv3DDNNLayer(incoming, num_filters, (3, 3, 3), pad=1,
                          nonlinearity=_RELU, **kwargs)


def _max_pool(incoming, window, **kwargs):
    """Return a 3D max-pooling layer whose stride equals its pooling window.

    Extra keyword arguments are forwarded unchanged to MaxPool3DDNNLayer.
    """
    return MaxPool3DDNNLayer(incoming, pool_size=window, stride=window,
                             **kwargs)


net = {}
net['input'] = InputLayer((None, 3, 16, 112, 112))

# ----------- 1st layer group ---------------
# The first pooling window is 1 along the first pooled axis, so only the last
# two axes are downsampled here.
net['conv1a'] = _conv3x3x3(net['input'], 64, flip_filters=False)
net['pool1'] = _max_pool(net['conv1a'], (1, 2, 2))

# ------------- 2nd layer group --------------
net['conv2a'] = _conv3x3x3(net['pool1'], 128)
net['pool2'] = _max_pool(net['conv2a'], (2, 2, 2))

# ----------------- 3rd layer group --------------
net['conv3a'] = _conv3x3x3(net['pool2'], 256)
net['conv3b'] = _conv3x3x3(net['conv3a'], 256)
net['pool3'] = _max_pool(net['conv3b'], (2, 2, 2))

# ----------------- 4th layer group --------------
net['conv4a'] = _conv3x3x3(net['pool3'], 512)
net['conv4b'] = _conv3x3x3(net['conv4a'], 512)
net['pool4'] = _max_pool(net['conv4b'], (2, 2, 2))

# ----------------- 5th layer group --------------
net['conv5a'] = _conv3x3x3(net['pool4'], 512)
net['conv5b'] = _conv3x3x3(net['conv5a'], 512)
# C3D pads only on the right before the last pooling step, which a theano
# pooling layer cannot express, so an explicit padding layer adds one element
# at the end of each of the last two axes.
net['pad'] = PadLayer(net['conv5b'], width=[(0, 1), (0, 1)], batch_ndim=3)
net['pool5'] = _max_pool(net['pad'], (2, 2, 2), pad=(0, 0, 0))

# ----------------- classifier --------------
# Two 4096-unit ReLU layers followed by the 2-class softmax output that
# replaced the original classifier during fine tuning.
net['fc6-1'] = DenseLayer(net['pool5'], num_units=4096, nonlinearity=_RELU)
net['fc7-1'] = DenseLayer(net['fc6-1'], num_units=4096, nonlinearity=_RELU)
net['output_layer'] = DenseLayer(
    net['fc7-1'],
    num_units=2,
    nonlinearity=lasagne.nonlinearities.softmax,
)

In [None]:
# Restore the fine-tuned weights. The archive holds one array per entry under
# the keys 'arr_0', 'arr_1', ...; they are read back in that numeric order and
# then copied into the layers feeding net['output_layer'].
with np.load('./best-models/c3d-0001/best_model.npz') as archive:
    array_keys = ['arr_%d' % index for index in range(len(archive.files))]
    param_values = [archive[key] for key in array_keys]
lasagne.layers.set_all_param_values(net['output_layer'], param_values)

In [None]:
# Build the symbolic output of the network for `input_var`, requesting the
# deterministic forward pass, and compile it into a callable.
# The outputs are given as a one-element list, so calling pred_func returns a
# list whose first item is the prediction array.
prediction = lasagne.layers.get_output(
    net['output_layer'],
    inputs=input_var,
    deterministic=True,
)
pred_func = theano.function(inputs=[input_var], outputs=[prediction])

In [None]:
# Open the test video; `success` records whether OpenCV could open it so the
# following cell can report a failure.
videopath = "./test-videos/person23_walking_d1_uncomp_mod.avi"
capture = cv2.VideoCapture(videopath)
success = capture.isOpened()

In [None]:
# Tell the user when the video could not be opened. Execution is not stopped
# here; only a message is printed.
if not success:
    print("Couldn't open video %s." % videopath)

In [None]:
import time
# Number of frames the capture reports for the opened video.
frame_count = int(capture.get(CV_CAP_PROP_FRAME_COUNT))

# Buffer for the whole video: one 3 x 112 x 112 slot per frame along axis 2,
# behind a leading axis of size 1. np.empty leaves the contents
# uninitialised; the classification loop fills one slot per frame read.
clip_shape = (1, 3, frame_count, 112, 112)
clip = np.empty(clip_shape, dtype=np.int16)

# Half-open bounds [start_frame, end_frame) of the first 16-frame window that
# is passed to the network.
start_frame, end_frame = 0, 16

In [None]:
# Classification section.
# Read the video frame by frame into `clip`. From the 16th frame onwards, the
# most recent 16-frame window is classified after every new frame and the
# predicted label is printed. The capture is always released afterwards, and
# the total wall-clock time is reported at the end.
toc = time.time()  # start timestamp (name kept from the original notebook)
try:
    for i in xrange(frame_count):
        frame_available, frame = capture.read()
        if not frame_available:
            print("Ran out of frames when reading %s" % videopath)
            break
        # Move the colour-channel axis to the front and subtract the mean
        # before storing the frame in its slot of the buffer.
        clip[:, :, i] = frame.transpose(2, 0, 1) - APPROXIMATE_MEAN
        if i < 15:
            # Fewer than 16 frames buffered so far: nothing to classify yet.
            continue
        probabilities = pred_func(clip[:, :, start_frame:end_frame])
        # Slide the window forward by one frame for the next iteration.
        start_frame += 1
        end_frame += 1
        # pred_func returns a one-element list; its array has one row per
        # clip in the batch, and the batch here holds a single clip.
        arr = probabilities[0]
        out = arr[0]
        if out[0] > out[1]:
            text = "running"
        else:
            text = "walking"
        print(text)
finally:
    # Free the video file/decoder handle even if reading or prediction fails;
    # the original code never released the capture.
    capture.release()
tic = time.time()  # end timestamp
print("Time Taken: %0.4f msec" % (1000 * (tic - toc)))