In [1]:
from skimage import io
from skimage.transform import resize, pyramid_gaussian
import numpy as np
import random
from sklearn import svm
from skimage.feature import hog
from sklearn.naive_bayes import GaussianNB
import os

In [20]:
class GestureRecognizer(object):

    """class to perform gesture recognition"""

    def __init__(self, data_directory):

        """
            data_directory : path like /home/sanket/mlproj/dataset/
            includes the dataset folder with '/'

            Initialize all your variables here
        """
        self.base_dir = data_directory
        # self.base_dir = os.path.abspath('.') + '/dataset/'
        self.win_size = 128
        self.clf_gesture = None
        self.clf_hnh = None

    def IOU(self, A, B):
        x_overlap = max(0, min(B[0],B[2]) - max(A[0],A[2]))
        y_overlap = max(0, min(B[1],B[3]) - max(A[1],A[3]))
        inter = x_overlap * y_overlap;
        
        A_area = (A[2] - A[0] + 1)*(A[3] - A[1] + 1)
        B_area = (B[2] - B[0] + 1)*(B[3] - B[1] + 1)
        
        union = (A_area + B_area - inter)*1.0
        inter = inter*1.0
        
        return inter/union
    
    def train(self, train_list):

        """
            train_list : list of users to use for training
            eg ["user_1", "user_2", "user_3"]

            The train function should train all your classifiers,
            both binary and multiclass on the given list of users
        """
        
        train_x_pos_ = []
        train_x_neg_ = []
        train_y = []
        
        for user in train_list:
            csv_file = self.base_dir + user + '/' + user + '_loc.csv'
            with open(csv_file,'r') as f:
                f.readline()
                for line in f:
                    data = line.strip().split(',')
                    file_name = data[0]
                    x1,y1,x2,y2 = map(int, data[1:])
                    
                    img = io.imread(self.base_dir + file_name,as_grey=True)
                    h,w = img.shape[:2]
                    imgg = img[y1:y2,x1:x2]
                    imgg = resize(imgg, (self.win_size, self.win_size))
                    imgg_hog = hog(imgg)
                    
                    label = ord(file_name.split('/')[1][0])
                    
                    train_x_pos_.append(imgg_hog)
                    train_y.append(label)
                    
                    count = 0
                    
                    A = [x1,y1,x2,y2]
                    
                    while True:
                        x1_r = random.randrange(0,w - self.win_size)
                        y1_r = random.randrange(0,h - self.win_size)
                        x2_r = x1_r + self.win_size
                        y2_r = y1_r + self.win_size
                        
                        if y2_r >= h or x2_r>=w:
                            continue
                        
                        B = [x1_r, y1_r, x2_r, y2_r]
                        
                        if self.IOU(A,B) < 0.1:
                            train_x_neg_.append(hog(img[y1_r:y2_r,x1_r:x2_r]))
                            count += 1
                        
                        if count >= 2:
                            break
        
        self.train_x_pos = np.asarray(train_x_pos_)
        del train_x_pos_
        self.train_x_neg = np.asarray(train_x_neg_)
        del train_x_neg_
        train_y = np.asarray(train_y)
        
        self.clf_gesture =  svm.LinearSVC()
        self.clf_gesture.fit(self.train_x_pos, train_y)
        score_gesture = self.clf_gesture.score(self.train_x_pos, train_y)
        print 'Training accuracy for gesture classifier : %f' %(score_gesture)
        
#         self.clf_hnh = svm.LinearSVC()
        self.clf_hnh = GaussianNB()
        train_x_hnh = np.concatenate((self.train_x_pos , self.train_x_neg))
        train_y_hnh = np.asarray( [1] * len(self.train_x_pos) + [0] * len(self.train_x_neg))
        self.clf_hnh.partial_fit(train_x_hnh, train_y_hnh, classes = np.asarray([0,1]))
        score_hnh = self.clf_hnh.score(train_x_hnh, train_y_hnh)
        print 'Training accuracy for Hand/Non-hand classifier : %f' %(score_hnh)
    
    
    def test(self, test_list):

        """
            train_list : list of users to use for training
            eg ["user_1", "user_2", "user_3"]

            The train function should train all your classifiers,
            both binary and multiclass on the given list of users
        """
        
        if self.clf_hnh is None or self.clf_gesture is None:
            print 'Classifiers not trained'
            return
        
        test_x_pos = []
        test_x_neg = []
        test_y = []
        
        for user in test_list:
            csv_file = self.base_dir + user + '/' + user + '_loc.csv'
            with open(csv_file,'r') as f:
                f.readline()
                for line in f:
                    data = line.strip().split(',')
                    file_name = data[0]
                    x1,y1,x2,y2 = map(int, data[1:])
                    
                    img = io.imread(self.base_dir + file_name,as_grey=True)
                    h,w = img.shape[:2]
                    imgg = img[y1:y2,x1:x2]
                    imgg = resize(imgg, (self.win_size, self.win_size))
                    imgg_hog = hog(imgg)
                    
                    label = ord(file_name.split('/')[1][0])
                    
                    test_x_pos.append(imgg_hog)
                    test_y.append(label)
                    
                    count = 0
                    
                    A = [x1,y1,x2,y2]
                    
                    while True:
                        x1_r = random.randrange(0,w - self.win_size)
                        y1_r = random.randrange(0,h - self.win_size)
                        x2_r = x1_r + self.win_size
                        y2_r = y1_r + self.win_size
                        
                        if y2_r >= h or x2_r>=w:
                            continue
                        
                        B = [x1_r, y1_r, x2_r, y2_r]
                        
                        if self.IOU(A,B) < 0.1:
                            test_x_neg.append(hog(img[y1_r:y2_r,x1_r:x2_r]))
                            count += 1
                        
                        if count >= 2:
                            break
        
        
        test_x_pos = np.asarray(test_x_pos)
        test_x_neg = np.asarray(test_x_neg)
        test_y = np.asarray(test_y)
        
        score_gesture = self.clf_gesture.score(test_x_pos, test_y)
        print 'Testing accuracy for gesture classifier : %f' %(score_gesture)
        
        
        test_x_hnh = np.concatenate((test_x_pos, test_x_neg))
        test_y_hnh = np.asarray( [1] * len(test_x_pos) + [0] * len(test_x_neg) )
        
        score_hnh = self.clf_hnh.score(test_x_hnh,test_y_hnh)
        print 'Testing accuracy for Hand/Non-hand classifier : %f' %(score_hnh)

    
    def hard_negative_mining(self, no_iter, threshold):
        
        if self.clf_hnh is None or self.clf_gesture is None:
            print 'Classifiers not trained'
            return
        
        for i in xrange(no_iter):
            count = 0
            FP = []
            
            for data in self.train_x_neg:
                if self.clf_hnh.predict([data])[0] == 1:
                    count+=1
                    FP.append(data)
            
            print count
            if count <= threshold:
                break
            
            self.clf_hnh.partial_fit(np.asarray(FP), np.asarray([0] * len(FP)))
        
        Y = np.asarray([1] * self.train_x_pos.shape[0] +  [0] * self.train_x_neg.shape[0])
        score_ = self.clf_hnh.score(np.concatenate((self.train_x_pos, self.train_x_neg)), Y)
        print 'Accuracy for Hand/Non-hand classifier after Hard Negative Mining : %f' %(score_)
        
    def sliding_window(self, img, clf):
        conf_map = np.zeros(img.shape)
        h,w = img.shape[:2]
        stride = 10
        win_size = 128
        for y in range(0,h-win_size+1,stride):
            for x in range(0,w-win_size+1,stride):
                imgg = img[y:y+win_size, x:x+win_size]
                hog_ = hog(imgg)
                class_ = clf.predict_proba(np.asarray([hog_]))[0]
                for i in range(128):
                    for j in range(128):
                        conf_map[y + i][x + j] = max(conf_map[y + i][x + j], class_[1])
        return conf_map
    
    
    def get_bbox(self, image):
        
        if self.clf_hnh is None or self.clf_gesture is None:
            print 'Classifiers not trained'
            return
        
        py = pyramid_gaussian(image, downscale=1.3)
        py_img  = [py.next(), py.next(), py.next()]
        conf_maps = map(lambda x : self.sliding_window(x, self.clf_hnh), py_img)
        max_val = max(map(lambda x: x.max(), conf_maps))
        
        for i in xrange(len(conf_maps)):
            if conf_maps[i].max() == max_val:
                cmap = resize(conf_maps[i], image.shape)
                a,b = np.where(cmap==cmap.max())
                return [a[0],b[0],a[-1], b[-1]]
        
        return []
        
    def recognize_gesture(self, image):

        """
            image : a 320x240 pixel RGB image in the form of a numpy array
            
            This function should locate the hand and classify the gesture.

            returns : (position, label)

            position : a tuple of (x1,y1,x2,y2) coordinates of bounding box
                       x1,y1 is top left corner, x2,y2 is bottom right

            label : a single character. eg 'A' or 'B'
        """
        position = self.get_bbox(image)
        imgg = image[position[0]:position[2], position[1] : position[3]]
        imgg = resize(imgg, (self.win_size, self.win_size))
        label = chr(self.clf_gesture.predict(np.asarray( [hog(imgg)]))[0])
        return position, label

    def translate_video(self, image_array):

        """
            image_array : a list of images as described above.
                          can be of arbitrary length

            This function classifies the video into a 5 character string

            returns : word (a string of 5 characters)
                    no two consecutive characters are identical
        """

        return word
    
    def test_labelled_images(self):
        
        if self.clf_hnh is None or self.clf_gesture is None:
            print 'Classifiers not trained'
            return

        test_list = [3,4,5,6,7,9,10,11,12,13,14,15,16,17,18,19]
        test_list = map(lambda x : 'user_' + str(x), test_list)
        
        count = 0
        total = 0
        for user in test_list:
            csv_file = self.base_dir + user + '/' + user + '_loc.csv'
            with open(csv_file,'r') as f:
                f.readline()
                for line in f:
                    data = line.strip().split(',')
                    file_name = data[0]
                    x1,y1,x2,y2 = map(int, data[1:])
                    
                    img = io.imread(self.base_dir + file_name,as_grey=True)
                    total+=1
                    h,w = img.shape[:2]
                    label = file_name.split('/')[1][0]
                    
                    pos, label_ = self.recognize_gesture(img)
                    
                    if label == label_:
                        count+=1
        
        score_gesture = (count*1.0)/(total*1.0)
        print 'Testing accuracy for gesture classifier class : %f' %(score_gesture)
    

In [22]:
G = GestureRecognizer(os.path.abspath('.') + '/dataset/')
print G.base_dir

/home/ayush/GPU_ML/ML/workspace/project/complete_data/dataset/


In [23]:
user = [3,4,5,6,7,9,10,11,12,13,14,15]
user = map(lambda x : 'user_' + str(x), user)
print user

user_test = [16,17,18,19]
user_test = map(lambda x : 'user_' + str(x), user_test)
print user_test

['user_3', 'user_4', 'user_5', 'user_6', 'user_7', 'user_9', 'user_10', 'user_11', 'user_12', 'user_13', 'user_14', 'user_15']
['user_16', 'user_17', 'user_18', 'user_19']


In [24]:
G.train(user)

Training accuracy for gesture classifier : 1.000000
Training accuracy for Hand/Non-hand classifier : 0.902778


In [25]:
G.hard_negative_mining(50,5)

711
533
446
401
360
326
312
288
279
249
227
212
196
180
179
161
159
152
150
138
133
133
122
113
106
104
100
95
87
85
79
75
72
69
64
70
65
58
61
61
54
53
50
47
45
38
40
42
37
34
Accuracy for Hand/Non-hand classifier after Hard Negative Mining : 0.965162


In [26]:
G.test(user_test)

Testing accuracy for gesture classifier : 0.764583
Testing accuracy for Hand/Non-hand classifier : 0.921875


In [27]:
from sklearn.externals import joblib

In [18]:
joblib.dump(G.clf_hnh, 'clf_hnh_good_957.pkl')

['clf_hnh_good_957.pkl']

In [28]:
G.test_labelled_images()

ValueError: Buffer not C contiguous.

In [29]:
del G