In [1]:
import tensorflow as tf
import cv2
import numpy as np
import pickle
# An InteractiveSession installs itself as the default session, which is what
# lets later cells call `tensor.eval(...)` / `op.run(...)` without passing a
# session explicitly.
sess = tf.InteractiveSession()

In [2]:
def bias(num_units):
    """Create a trainable bias vector of length ``num_units``, initialised to 0.1."""
    initial_value = tf.constant(0.1, shape=[num_units])
    return tf.Variable(initial_value)

def weights(shape):
    """Create a trainable weight tensor of the given ``shape``.

    Initial values are sampled from a truncated normal distribution with a
    standard deviation of 0.1.
    """
    initial_value = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial_value)

def conv2d(inp, b, w):
    """Convolve ``inp`` with filter ``w`` (stride 1, SAME padding), add ``b``, apply ReLU.

    Note the argument order: the bias comes before the filter weights.
    """
    pre_activation = tf.nn.conv2d(inp, w, strides=[1, 1, 1, 1], padding='SAME') + b
    return tf.nn.relu(pre_activation)

def pool3d(inp):
    """Max-pool a 5-D tensor with a 2x2x2 window and stride 2 (SAME padding).

    The first and last dimensions use a window and stride of 1, so only the
    three middle dimensions are reduced.
    """
    window = [1, 2, 2, 2, 1]
    stride = [1, 2, 2, 2, 1]
    return tf.nn.max_pool3d(inp, ksize=window, strides=stride, padding='SAME')

def layer(inp, num_units, filter_shape):
    """Build one conv + 3-D max-pool stage applied to every tensor in ``inp``.

    A single bias/filter pair is created and shared by all tensors in ``inp``.
    Each convolution output gets a trailing singleton dimension so that
    ``pool3d`` can pool over height, width *and* the feature dimension; the
    singleton dimension is squeezed away again afterwards.

    Args:
        inp: list of 4-D tensors to process with the shared filter.
        num_units: number of output feature maps of the convolution.
        filter_shape: list ``[height, width, in_channels]``; ``num_units`` is
            appended to form the full filter shape.

    Returns:
        Tuple ``(b, w, conv, i_pool, o_pool, out)`` where ``b``/``w`` are the
        shared variables and the remaining items are lists with one entry per
        input tensor: conv activations, pool inputs (5-D), pool outputs (4-D),
        and a copy of the pool outputs intended as the next stage's input.
    """
    b = bias(num_units)
    w = weights(filter_shape + [num_units])

    # Stage by stage (rather than input by input) so ops are created in the
    # same order as before.
    conv = []
    for tensor in inp:
        conv.append(conv2d(tensor, b, w))

    i_pool = []
    for activation in conv:
        i_pool.append(tf.expand_dims(activation, [4]))

    o_pool = []
    for expanded in i_pool:
        o_pool.append(tf.squeeze(pool3d(expanded), [4]))

    out = list(o_pool)
    return b, w, conv, i_pool, o_pool, out

In [3]:
# --- Inputs -----------------------------------------------------------------
# One placeholder per stream (x_h is fed the "hand" clips, x_m the "main"
# clips). Axis 1 has size 2 and is unpacked into two 64x64x32 tensors that
# share each stage's filters.
input_shape = [None, 2, 64, 64, 32]
x_h = tf.placeholder(tf.float32, shape=input_shape)
x_m = tf.placeholder(tf.float32, shape=input_shape)

inp_h = tf.unpack(x_h, axis=1)
inp_m = tf.unpack(x_m, axis=1)
y = tf.placeholder(tf.float32, shape=[None, 20])

# --- Convolutional tower for the x_h stream ---------------------------------
# pool3d halves height, width and the feature dimension, so the in_channels of
# each stage is half the num_units of the previous one (16 -> 8, 32 -> 16).
b_conv1_h, w_conv1_h, h_conv1_h, i_pool1_h, o_pool1_h, i_conv2_h = layer(inp_h, 16, [5, 5, 32])
b_conv2_h, w_conv2_h, h_conv2_h, i_pool2_h, o_pool2_h, i_conv3_h = layer(i_conv2_h, 32, [5, 5, 8])
b_conv3_h, w_conv3_h, h_conv3_h, i_pool3_h, o_pool3_h, o_h = layer(i_conv3_h, 48, [4, 4, 16])

# --- Convolutional tower for the x_m stream (same architecture, own weights) -
b_conv1_m, w_conv1_m, h_conv1_m, i_pool1_m, o_pool1_m, i_conv2_m = layer(inp_m, 16, [5, 5, 32])
b_conv2_m, w_conv2_m, h_conv2_m, i_pool2_m, o_pool2_m, i_conv3_m = layer(i_conv2_m, 32, [5, 5, 8])
b_conv3_m, w_conv3_m, h_conv3_m, i_pool3_m, o_pool3_m, o_m = layer(i_conv3_m, 48, [4, 4, 16])

# --- Fully connected head ---------------------------------------------------
# 4 tensors (2 per stream), each 8 x 8 x 24 after three pooling stages.
flat_size = 4 * 8 * 8 * 24
stacked = tf.pack(o_h + o_m, axis=1)
flat = tf.reshape(stacked, [-1, flat_size])
b_fc1 = bias(512)
w_fc1 = weights([flat_size, 512])
h_fc1 = tf.nn.relu(tf.matmul(flat, w_fc1) + b_fc1)
b_fc2 = bias(20)
w_fc2 = weights([512, 20])
# Unnormalised class scores; softmax is applied inside the loss.
h_fc2 = tf.matmul(h_fc1, w_fc2) + b_fc2

In [4]:
# Mean softmax cross-entropy between the network's class scores and the
# one-hot targets. The arguments are passed by keyword: the positional order
# of `logits`/`labels` differs between TensorFlow releases (and newer releases
# reject positional use outright), while the keyword names are stable.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=h_fc2, labels=y))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

# Fraction of examples in the fed batch whose arg-max prediction matches the
# arg-max of the one-hot target.
correct_prediction = tf.equal(tf.argmax(h_fc2, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [5]:
def read_video(sample_num, base_dir="/media/amey/76D076A5D0766B6F/chalap", split="train"):
    """Load every frame of a sample video (as grayscale) plus its pickled labels.

    Args:
        sample_num: integer sample id; formatted as ``Sample{:04d}``.
        base_dir: dataset root directory. Defaults to the original hard-coded
            location so existing callers are unaffected.
        split: dataset sub-directory under ``base_dir`` (default ``"train"``).

    Returns:
        Tuple ``(frames, labels)``: ``frames`` is a list of 2-D grayscale
        arrays in playback order (empty if the video cannot be read) and
        ``labels`` is whatever object was pickled in the matching ``.pkl``.

    Raises:
        IOError/OSError: if the ``.pkl`` file cannot be opened.
    """
    uber_path = "{}/{}/uber/Sample{:04d}".format(base_dir, split, sample_num)
    video_path = uber_path + ".mp4"
    labels_path = uber_path + ".pkl"

    vid = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = vid.read()
            if not ret:
                break
            # NOTE(review): VideoCapture.read() normally yields BGR frames, so
            # COLOR_BGR2GRAY may be the intended code. Kept as-is because it
            # makes no difference when all three channels are equal - confirm.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY))
    finally:
        # Always free the decoder handle, even if a frame conversion fails.
        vid.release()

    # SECURITY: pickle.load executes arbitrary code from the file; only use
    # this on label files you produced yourself.
    with open(labels_path, 'rb') as f:
        labels = pickle.load(f)

    return frames, labels

def split_frame(frame):
    """Cut a tiled frame into its four 64-row/64-column quadrants.

    Layout of ``frame`` (rows x columns):

        top-left     -> gray hand     top-right    -> depth hand
        bottom-left  -> gray main     bottom-right -> depth main

    Returns:
        Tuple ``(hand, main)``; each stacks its gray and depth quadrant along
        a new leading axis (index 0 = gray, index 1 = depth).
    """
    top_rows, bottom_rows = frame[:64], frame[64:]

    hand_pair = [top_rows[:, :64], top_rows[:, 64:]]
    main_pair = [bottom_rows[:, :64], bottom_rows[:, 64:]]

    return np.stack(hand_pair, axis=0), np.stack(main_pair, axis=0)

def pad_frames(vid):
    """Centre a list of frames inside a 32-frame window using all-zero frames.

    The missing frames are split between front and back; when the count is
    odd the extra zero frame goes at the end. Lists that already hold 32 or
    more frames come back unchanged (as a new list).

    Args:
        vid: list of frames; padding frames have shape ``(2, 64, 64)``.

    Returns:
        A new list: zero frames, then the original frames, then zero frames.
    """
    missing = 32 - len(vid)
    lead, remainder = divmod(missing, 2)
    trail = lead + remainder

    head = [np.zeros((2, 64, 64))] * lead
    tail = [np.zeros((2, 64, 64))] * trail
    return head + vid + tail

def logit(num, num_classes=20):
    """Return the one-hot vector for a 1-based class label.

    Despite the name, the result is a one-hot target vector, not a logit.

    Args:
        num: class label in ``1..num_classes``.
        num_classes: length of the returned vector (default 20).

    Returns:
        Float array of zeros with a single 1 at index ``num - 1``.

    Raises:
        IndexError: if ``num`` is outside ``1..num_classes``. Without this
            check a label of 0 (or a negative label) would wrap around via
            negative indexing and silently mark the wrong class.
    """
    if not 1 <= num <= num_classes:
        raise IndexError(
            "label {} is outside the valid range 1..{}".format(num, num_classes))
    out = np.zeros(num_classes)
    out[num - 1] = 1
    return out

def read_segments(sample_num, base_dir="/media/amey/76D076A5D0766B6F/chalap", split="train"):
    """Load a sample video and cut it into the labelled segments of its CSV.

    Each non-blank CSV line must hold three comma-separated integers
    ``label,start,end``; the segment covers ``frames[start:end + 1]``
    (``end`` inclusive). Every frame is split into its hand/main parts with
    ``split_frame`` and segments shorter than 32 frames are zero-padded with
    ``pad_frames``.

    Args:
        sample_num: integer sample id; formatted as ``Sample{:04d}``.
        base_dir: dataset root directory. Defaults to the original hard-coded
            location so existing callers are unaffected.
        split: dataset sub-directory under ``base_dir`` (default ``"train"``).

    Returns:
        List of ``(hand, main, label)`` tuples where ``hand`` and ``main`` are
        equally long lists of per-frame arrays and ``label`` is the integer
        read from the CSV.

    Raises:
        ValueError: if a CSV line is malformed, or if a segment's frame range
            selects no frames (e.g. the video is missing or shorter than the
            labels claim).
        IOError/OSError: if the CSV file cannot be opened.
    """
    video_path = "{}/{}/uber/Sample{:04d}.mp4".format(base_dir, split, sample_num)
    labels_path = "{}/{}/labels/Sample{:04d}.csv".format(base_dir, split, sample_num)

    vid = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = vid.read()
            if not ret:
                break
            # NOTE(review): VideoCapture.read() normally yields BGR frames, so
            # COLOR_BGR2GRAY may be the intended code. Kept as-is because it
            # makes no difference when all three channels are equal - confirm.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY))
    finally:
        # Always free the decoder handle, even if a frame conversion fails.
        vid.release()

    segments = []
    with open(labels_path, 'r') as labels_file:
        for line in labels_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            label, start, end = [int(n) for n in line.split(',')]

            clip = frames[start:end + 1]
            if not clip:
                raise ValueError(
                    "{}: segment {}..{} selects no frames ({} frames read from {})".format(
                        labels_path, start, end, len(frames), video_path))

            hand, main = zip(*[split_frame(clip_frame) for clip_frame in clip])
            hand, main = list(hand), list(main)
            if len(hand) < 32:
                hand = pad_frames(hand)
            if len(main) < 32:
                main = pad_frames(main)
            segments.append((hand, main, label))

    return segments

def get_intervals(segment):
    """Slide a 32-frame window (step 1) over a segment and build training examples.

    Args:
        segment: ``(hand, main, label)`` tuple. The number of windows is
            derived from ``len(hand)`` only.

    Returns:
        List of ``(hand_clip, main_clip, target)`` tuples, one per window
        position. The clips stack 32 consecutive frames along axis 3 and
        ``target`` is ``logit(label)``. The list is empty when ``hand`` has
        fewer than 32 frames.
    """
    hand, main, label = segment
    window_count = len(hand) - 31

    return [
        (
            np.stack(hand[first:first + 32], axis=3),
            np.stack(main[first:first + 32], axis=3),
            logit(label),
        )
        for first in range(window_count)
    ]

def get_batch(size, num_samples=470):
    """Endlessly yield training batches of exactly ``size`` examples.

    Samples ``1..num_samples`` are read in order with ``read_segments``; each
    segment is expanded with ``get_intervals`` and the resulting examples are
    packed into consecutive batches. Batches may span segment and sample
    boundaries. After the last sample the generator prints "Rolled over" and
    starts again from sample 1.

    Args:
        size: number of examples per batch; must be at least 1.
        num_samples: highest sample number to read before rolling over
            (default 470).

    Yields:
        ``list(zip(*batch))``: a list of three tuples
        ``[hand_clips, main_clips, targets]``, each of length ``size``.

    Raises:
        ValueError: if ``size`` is smaller than 1 (raised on first ``next()``).
        RuntimeError: if a full pass over all samples yields no segments, which
            would otherwise loop forever.
    """
    if size < 1:
        raise ValueError("size must be a positive integer, got {!r}".format(size))

    batch = []
    sample_num = 1
    empty_streak = 0  # consecutive samples that produced no segments

    while True:
        if sample_num > num_samples:
            sample_num = 1
            print("Rolled over")

        segments = read_segments(sample_num)
        sample_num += 1

        if not segments:
            # A sample without labelled segments is skipped instead of
            # crashing on segments[0].
            empty_streak += 1
            if empty_streak >= num_samples:
                raise RuntimeError(
                    "no segments found in any of the {} samples".format(num_samples))
            continue
        empty_streak = 0

        for segment in segments:
            for interval in get_intervals(segment):
                batch.append(interval)
                if len(batch) == size:
                    yield list(zip(*batch))
                    batch = []
        

In [6]:
# History of (step, training accuracy) pairs appended by the training loop.
# It lives in its own cell, so re-running only the training cell keeps the
# entries from earlier runs; re-run this cell to start a fresh history.
acc = []

In [7]:
# (Re)initialise every variable, then train for 20000 steps on batches of 20.
sess.run(tf.initialize_all_variables())
batcher = get_batch(20)
for i in range(20000):
    batch = next(batcher)
    # batch is [hand_clips, main_clips, targets]; the same feed is used for
    # the accuracy probe and for the optimisation step.
    feed = {x_h: batch[0], x_m: batch[1], y: batch[2]}
    if i % 100 == 0:
        # Accuracy is measured on the batch about to be trained on, before the
        # update is applied.
        train_accuracy = accuracy.eval(feed_dict=feed)
        acc.append((i, train_accuracy))
        print("step %d, training accuracy %g"%(i, train_accuracy))
    train_step.run(feed_dict=feed)

step 0, training accuracy 0
step 100, training accuracy 0
step 200, training accuracy 0
step 300, training accuracy 0
step 400, training accuracy 0.05
step 500, training accuracy 0
step 600, training accuracy 0.8
step 700, training accuracy 0
step 800, training accuracy 0.05
step 900, training accuracy 0.05
step 1000, training accuracy 0
step 1100, training accuracy 0.15
step 1200, training accuracy 0.15
step 1300, training accuracy 0
step 1400, training accuracy 0.1
step 1500, training accuracy 0.1
step 1600, training accuracy 0.1
step 1700, training accuracy 0.2
step 1800, training accuracy 0.35
step 1900, training accuracy 0
step 2000, training accuracy 0.2
step 2100, training accuracy 0.05
step 2200, training accuracy 0.05
step 2300, training accuracy 0
step 2400, training accuracy 0
step 2500, training accuracy 0.05
step 2600, training accuracy 0.1
step 2700, training accuracy 0.05
step 2800, training accuracy 0.1
step 2900, training accuracy 0
step 3000, training accuracy 0.05
st