In [31]:
%matplotlib inline

import math
import numpy as np
import cv2
import matplotlib.pyplot as plt
from matplotlib import rcParams
from collections import namedtuple
from typing import List, NamedTuple, Tuple
import subprocess as sp
import warnings
import pickle

In [72]:
# Lightweight record types shared by the helpers below.

# A pixel position.
Coords = NamedTuple('Coords', [('x', int), ('y', int)])

# The four skeleton joints that are kept for every frame.
Joints = NamedTuple('Joints', [('head', Coords), ('neck', Coords),
                               ('left', Coords), ('right', Coords)])

# A rectangle described by its minimum and maximum corner.
Bounds = NamedTuple('Bounds', [('min', Coords), ('max', Coords)])

# One frame of a sample: three images, the joints and the gesture label.
DataPoint = NamedTuple('DataPoint', [('gray', np.ndarray), ('depth', np.ndarray),
                                     ('user', np.ndarray), ('skel', Joints),
                                     ('label', int)])

def read_sample(sample_num, base_dir="/media/amey/76D076A5D0766B6F/chalap"):
    """Load every labelled frame of one training sample.

    The colour, depth and user videos plus the skeleton CSV are read in
    lock-step, one frame / one row at a time. Each row of the labels CSV is
    parsed as ``label,frame_start,frame_end``; frames before ``frame_start``
    get label 0 and frames up to and including ``frame_end`` get ``label``.
    Frames after the last labelled gesture are not read.

    Args:
        sample_num: Number of the sample (used as ``Sample{:04d}``).
        base_dir: Dataset root that contains the ``train`` directory.

    Returns:
        A list of ``DataPoint`` tuples, one per frame that could be read.
    """
    train_dir = base_dir + "/train"

    color_path = "{}/{}/Sample{:04d}.mp4".format(train_dir, "color", sample_num)
    depth_path = "{}/{}/Sample{:04d}.mp4".format(train_dir, "depth", sample_num)
    user_path = "{}/{}/Sample{:04d}.mp4".format(train_dir, "user", sample_num)
    labels_path = "{}/{}/Sample{:04d}.csv".format(train_dir, "labels", sample_num)
    skel_path = "{}/{}/Sample{:04d}.csv".format(train_dir, "skel", sample_num)

    data = []
    color_vid = cv2.VideoCapture(color_path)
    depth_vid = cv2.VideoCapture(depth_path)
    user_vid = cv2.VideoCapture(user_path)
    i = 0

    try:
        with open(labels_path) as labels_file, open(skel_path) as skels_file:
            for gesture_frames in labels_file:
                # A trailing blank line would otherwise crash int('').
                if not gesture_frames.strip():
                    continue
                label, frame_start, frame_end = [int(n) for n in gesture_frames.split(',')]

                # Frames between gestures are kept with the "no gesture" label 0.
                while i < frame_start:
                    point = next_data_point(color_vid, depth_vid, user_vid, skels_file, 0)
                    if point is not None:
                        data.append(point)
                    i += 1

                # Frames of the gesture itself (frame_end is inclusive).
                while i <= frame_end:
                    point = next_data_point(color_vid, depth_vid, user_vid, skels_file, label)
                    if point is not None:
                        data.append(point)
                    i += 1
    finally:
        # Free the decoder handles even when a file is missing or a row is malformed.
        color_vid.release()
        depth_vid.release()
        user_vid.release()

    return data

def read_and_convert(video):
    """Read the next frame from ``video`` and return it as a grayscale image.

    Args:
        video: An object with a ``read()`` method returning ``(ok, frame)``,
            such as ``cv2.VideoCapture``.

    Returns:
        The grayscale frame, or ``None`` when no frame could be read.
    """
    ret, frame = video.read()
    if not ret:
        return None
    # VideoCapture delivers frames in BGR channel order, so the BGR conversion
    # code is needed to apply the luminance weights to the right channels.
    return cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

def next_data_point(color_vid, depth_vid, user_vid, skels_file, label):
    """Read one frame from each video plus one skeleton row.

    Args:
        color_vid, depth_vid, user_vid: Video sources passed to
            ``read_and_convert``.
        skels_file: Open text file with one comma-separated skeleton row per
            frame.
        label: Gesture label to attach to the frame.

    Returns:
        A ``DataPoint`` for the frame, or ``None`` if any of the three videos
        failed to deliver a frame.
    """
    gray = read_and_convert(color_vid)
    depth = read_and_convert(depth_vid)
    user = read_and_convert(user_vid)

    # Always consume the skeleton row, even for a dropped frame; otherwise every
    # later frame would be paired with the row of its predecessor.
    skel_row = skels_file.readline()

    if gray is None or depth is None or user is None:
        return None

    return DataPoint(gray=gray, depth=depth, user=user,
                     skel=get_joint_locs(skel_row.split(',')),
                     label=label)

def get_joint_locs(joints: List[str]):
    """Extract the head, neck, left and right joint positions from a skeleton row.

    The row is treated as consecutive groups of 9 fields, one group per joint.
    Fields 7 and 8 of a group are used as the joint's x and y. Groups 2, 3, 7
    and 11 are read as neck, head, left and right respectively.

    Args:
        joints: The fields of one skeleton CSV row (strings or ints; each used
            value is passed through ``int``).

    Returns:
        A ``Joints`` tuple of ``Coords``.

    Raises:
        ValueError: If the row is too short to contain joint group 11, or a
            used field is not an integer.
    """
    fields_per_joint = 9
    neck_idx, head_idx, left_idx, right_idx = 2, 3, 7, 11

    required = (right_idx + 1) * fields_per_joint
    if len(joints) < required:
        raise ValueError(
            "skeleton row has {} fields, expected at least {}".format(len(joints), required))

    def position(joint_idx):
        # x and y are the last two fields of the joint's 9-field group.
        base = joint_idx * fields_per_joint
        return Coords(x=int(joints[base + 7]), y=int(joints[base + 8]))

    return Joints(head=position(head_idx),
                  neck=position(neck_idx),
                  left=position(left_idx),
                  right=position(right_idx))

def valid_coords(joint_locs: Joints):
    """Tell whether a skeleton carries any position information.

    Returns ``False`` only when the x and y of head, neck, left and right are
    all zero; any non-zero coordinate makes the skeleton count as valid.
    """
    tracked = (joint_locs.head, joint_locs.neck, joint_locs.left, joint_locs.right)
    return any(joint.x != 0 or joint.y != 0 for joint in tracked)

def get_hand_bounds(data: List[DataPoint]):
    """Compute the rectangle spanned by both hands over all gesture frames.

    Only frames with a non-zero label and a skeleton accepted by
    ``valid_coords`` contribute. When nothing contributes, the full 640x480
    rectangle is returned instead.

    Returns:
        ``Bounds`` holding the minimum and maximum corner.
    """
    # Every left/right hand position that takes part in the bounds.
    hands = [hand
             for point in data
             if (point.label != 0) and valid_coords(point.skel)
             for hand in (point.skel.left, point.skel.right)]

    # The leading element of each list is the starting value of the search.
    minX = min([640] + [hand.x for hand in hands])
    minY = min([480] + [hand.y for hand in hands])
    maxX = max([0] + [hand.x for hand in hands])
    maxY = max([0] + [hand.y for hand in hands])

    # Untouched starting values mean "no usable frame": fall back to the full frame.
    if (minX, minY, maxX, maxY) == (640, 480, 0, 0):
        minX, minY, maxX, maxY = 0, 0, 640, 480

    lower = Coords(x=minX, y=minY)
    upper = Coords(x=maxX, y=maxY)
    return Bounds(min=lower, max=upper)

def crop_frame(frame: np.ndarray, bounds: Bounds):
    """Crop ``frame`` to a region derived from ``bounds``.

    Rows always start at 0; ``bounds.min.y`` is not used. If the bounds are
    wider than ``bounds.max.y``, the crop is as tall as the bounds are wide.
    Otherwise the column range is widened (left edge clamped to 0, right edge
    clamped to 640) towards a width of ``bounds.max.y``.
    """
    left, right, bottom = bounds.min.x, bounds.max.x, bounds.max.y
    width = right - left

    if width > bottom:
        # Wider than tall: extend the row range to match the width.
        return frame[:width, left:right]

    # Taller than wide: shift the left edge out by half the difference, then
    # place the right edge `bottom` columns after it.
    left = max(0, left - ((bottom - width) // 2))
    right = min(640, (bottom + left))
    return frame[:bottom, left:right]

def write_video(frames: List[np.ndarray], name: str, fps: float = 20.0):
    """Write grayscale frames to a video file with OpenCV's ``VideoWriter``.

    Args:
        frames: Non-empty list of single-channel frames; the size of the first
            frame is used as the video size.
        name: Output file path.
        fps: Frame rate of the output video (defaults to the previous fixed 20.0).

    Raises:
        IndexError: If ``frames`` is empty.
    """
    frame_size = (frames[0].shape[1], frames[0].shape[0])  # (width, height)
    writer = cv2.VideoWriter(name, cv2.VideoWriter_fourcc(*'H264'), fps, frame_size)
    try:
        for frame in frames:
            # The writer is fed three-channel images.
            writer.write(cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB))
    finally:
        # Release the writer even if a frame fails to convert or write.
        writer.release()
    
def write_video_ffmpeg(frames: List[np.ndarray], name: str):
    """Write grayscale frames to a video file by piping raw RGB data to ffmpeg.

    Args:
        frames: Non-empty list of single-channel frames; the resolution of the
            first frame is passed to ffmpeg.
        name: Output file path (overwritten if it exists).
    """
    command = [ "ffmpeg",
        '-y', # (optional) overwrite output file if it exists
        '-vcodec', 'rawvideo',
        '-f', 'rawvideo',
        '-s', get_resolution(frames[0]), # size of one frame
        '-pix_fmt', 'rgb24',
        '-r', '20', # frames per second
        '-i', '-', # The input comes from a pipe
        '-an', # Tells FFMPEG not to expect any audio
        '-vcodec', 'mpeg4',
        name ]

    # stderr is discarded rather than piped: nothing reads it here, and an
    # unread pipe can fill up and stall ffmpeg while frames are still being written.
    with sp.Popen(command, stdin=sp.PIPE, stderr=sp.DEVNULL) as pipe:
        for f in frames:
            # tobytes() replaces tostring(), which was removed in NumPy 2.
            pipe.stdin.write(cv2.cvtColor(f, cv2.COLOR_GRAY2RGB).tobytes())
    
def get_resolution(frame: np.ndarray):
    """Return the frame size as a ``"<width>x<height>"`` string."""
    width = frame.shape[1]
    height = frame.shape[0]
    return "x".join((str(width), str(height)))

def crop_video(vid: List[np.ndarray], bounds: Bounds):
    """Apply ``crop_frame`` with the same ``bounds`` to every frame of ``vid``."""
    cropped = []
    for frame in vid:
        cropped.append(crop_frame(frame, bounds))
    return cropped

def get_hand(frame: np.ndarray, coords: Coords, shape: Tuple):
    """Cut a ``shape``-sized window around a joint out of ``frame``.

    ``shape`` is ``(rows, cols)``. The window starts three quarters of its
    size before the joint on each axis and is pushed back inside the frame
    when it would cross an edge.
    """
    def window(center, size, limit):
        # Start 3/4 of the window before the joint, clamped to the frame start.
        start = max(0, center - size * 3 // 4)
        stop = min(limit, start + size)
        if (stop - start) != size:
            if stop == limit:
                # Clipped at the far edge: slide the window back.
                start = stop - size
            else:
                stop = start + size
        return start, stop

    col_start, col_stop = window(coords.x, shape[1], frame.shape[1])
    row_start, row_stop = window(coords.y, shape[0], frame.shape[0])

    return frame[row_start:row_stop, col_start:col_stop]

def get_left_hand_vid(data: List[np.ndarray], joints: List[Joints], shape: Tuple):
    """Cut the window around the ``left`` joint out of every frame.

    ``data`` and ``joints`` are paired element by element; ``shape`` is
    forwarded to ``get_hand``.
    """
    crops = []
    for frame, skel in zip(data, joints):
        crops.append(get_hand(frame, skel.left, shape))
    return crops

def get_right_hand_vid(data: List[np.ndarray], joints: List[Joints], shape: Tuple):
    """Cut the window around the ``right`` joint out of every frame.

    ``data`` and ``joints`` are paired element by element; ``shape`` is
    forwarded to ``get_hand``.
    """
    crops = []
    for frame, skel in zip(data, joints):
        crops.append(get_hand(frame, skel.right, shape))
    return crops

def get_higher_hand_vid(left: List[np.ndarray], right: List[np.ndarray], whole: List[np.ndarray],
                        joints: List[Joints], labels: List[int]):
    """Pick, per run of equal labels, the hand that has the smaller y more often.

    The five inputs are walked in parallel and split into runs of consecutive
    frames sharing a label. Within a run each frame votes "left" when
    ``left.y < right.y`` and "right" otherwise. If left wins strictly, the
    run's left-hand crops and whole frames are emitted mirrored horizontally;
    otherwise the right-hand crops and whole frames are emitted unchanged.

    Returns:
        ``(higher_hand, main)``: the chosen hand crops and the matching whole
        frames, in input order.
    """
    higher_hand = []
    main = []

    def flush(run):
        # `run` holds (left_crop, right_crop, whole_frame, skeleton) per frame.
        left_votes = sum(1 for _, _, _, skel in run if skel.left.y < skel.right.y)
        right_votes = len(run) - left_votes
        if left_votes > right_votes:
            higher_hand.extend([np.fliplr(left_crop) for left_crop, _, _, _ in run])
            main.extend([np.fliplr(frame) for _, _, frame, _ in run])
        else:
            higher_hand.extend([right_crop for _, right_crop, _, _ in run])
            main.extend([frame for _, _, frame, _ in run])

    active_label = -1
    run = []
    for left_crop, right_crop, frame, skel, label in zip(left, right, whole, joints, labels):
        if label != active_label:
            # Label changed: emit the finished run and start a new one.
            flush(run)
            run = []
            active_label = label
        run.append((left_crop, right_crop, frame, skel))

    # Emit whatever is left after the last frame.
    flush(run)

    return higher_hand, main

def remove_background(vid, user_vid):
    """Paint every pixel that the user mask marks as background with 255.

    Args:
        vid: Sequence of frames (arrays).
        user_vid: Sequence of mask frames with the same shapes; a value of 0
            marks a background pixel.

    Returns:
        A list of new arrays, one per frame of ``vid``; the inputs are not
        modified.
    """
    modified_vid = []
    for i, frame in enumerate(vid):
        cleaned = np.array(frame, copy=True)
        # Plain boolean indexing replaces the former masked-array call, which
        # passed an unsupported `filled_value` keyword.
        cleaned[np.asarray(user_vid[i]) == 0] = 255
        modified_vid.append(cleaned)

    return modified_vid

def resize_video(vid, shape):
    """Resize every frame of ``vid`` to ``shape``.

    Args:
        vid: Sequence of frames.
        shape: Target size as ``(rows, cols)``, the same convention the other
            helpers in this notebook use for ``shape``.

    Returns:
        A list with the resized frames.
    """
    # cv2.resize expects its size argument as (width, height), i.e. (cols, rows).
    target_size = (shape[1], shape[0])
    return [cv2.resize(f, target_size) for f in vid]

def get_uber_video(data: List[DataPoint], shape: Tuple):
    """Build a 2x2 mosaic video from the frames of one sample.

    Each output frame is ``(2 * shape[0], 2 * shape[1])`` uint8 and holds:

    * top-left: gray crop of the hand chosen by ``get_higher_hand_vid``
    * top-right: depth crop of the hand chosen by ``get_higher_hand_vid``
    * bottom-left: gray frame cropped to the hand bounds and resized
    * bottom-right: background-removed depth frame, cropped and resized

    Args:
        data: Frames of the sample.
        shape: Tile size as ``(rows, cols)``.

    Returns:
        A list of mosaic frames, one per element of ``data``.
    """
    depth_vid = [d.depth for d in data]
    gray_vid = [d.gray for d in data]
    user_vid = [d.user for d in data]
    joints_list = [d.skel for d in data]
    labels_list = [d.label for d in data]
    bounds = get_hand_bounds(data)

    # Whole-scene tiles: crop to the hand bounds, then shrink to tile size.
    cropped_gray_vid = crop_video(gray_vid, bounds)
    modified_depth_vid = remove_background(depth_vid, user_vid)
    cropped_depth_vid = crop_video(modified_depth_vid, bounds)
    smaller_gray_vid = resize_video(cropped_gray_vid, shape)
    smaller_depth_vid = resize_video(cropped_depth_vid, shape)

    # Hand tiles: cut directly out of the unmodified gray / depth frames.
    left_gray_vid = get_left_hand_vid(gray_vid, joints_list, shape)
    right_gray_vid = get_right_hand_vid(gray_vid, joints_list, shape)
    left_depth_vid = get_left_hand_vid(depth_vid, joints_list, shape)
    right_depth_vid = get_right_hand_vid(depth_vid, joints_list, shape)

    higher_gray_vid, smaller_gray_vid = get_higher_hand_vid(
        left_gray_vid, right_gray_vid, smaller_gray_vid, joints_list, labels_list)
    higher_depth_vid, smaller_depth_vid = get_higher_hand_vid(
        left_depth_vid, right_depth_vid, smaller_depth_vid, joints_list, labels_list)

    rows, cols = shape[0], shape[1]
    uber_vid = []
    for i in range(len(depth_vid)):
        uber_frame = np.zeros((rows * 2, cols * 2), dtype='uint8')
        uber_frame[:rows, :cols] = higher_gray_vid[i]
        uber_frame[:rows, cols:2 * cols] = higher_depth_vid[i]
        uber_frame[rows:2 * rows, :cols] = smaller_gray_vid[i]
        # The row range uses `rows` on both ends (it used to end at 2 * shape[1]).
        uber_frame[rows:2 * rows, cols:2 * cols] = smaller_depth_vid[i]
        uber_vid.append(uber_frame)

    return uber_vid

In [52]:
# Load sample 35 and cut 64x64 windows around both hands in the gray frames.
data = read_sample(35)
gray_frames_35 = [d.gray for d in data]
skeletons_35 = [d.skel for d in data]
left = get_left_hand_vid(gray_frames_35, skeletons_35, (64, 64))
right = get_right_hand_vid(gray_frames_35, skeletons_35, (64, 64))

In [53]:
# Sanity check: list every left-hand crop that did not come out as 64x64.
wrong_shapes = [(index, crop.shape) for index, crop in enumerate(left) if crop.shape != (64, 64)]
for index, crop_shape in wrong_shapes:
    print(index, crop_shape)

In [269]:
# Blank out the background of the gray frames using the user mask and save the result.
gray_for_mask = [d.gray for d in data]
user_masks = [d.user for d in data]
modified = remove_background(gray_for_mask, user_masks)
write_video(modified, "/home/amey/tmp/mod.mp4")

In [274]:
# get_left_hand_vid needs the frames, the matching skeletons and the window
# shape; passing only `data` raises a TypeError.
left = get_left_hand_vid([d.gray for d in data], [d.skel for d in data], (64, 64))
write_video(left, "/home/amey/tmp/left.mp4")

In [28]:
# Build the 2x2 mosaic for the currently loaded sample and save it.
uber_vid = get_uber_video(data, shape=(64, 64))
write_video(frames=uber_vid, name="/home/amey/tmp/uber.mp4")

In [73]:
# Convert samples 417..470: write the mosaic video and pickle the per-frame labels.
base = "/media/amey/76D076A5D0766B6F/chalap/train"
for i in range(417,471):
    data = read_sample(i)
    uber = get_uber_video(data, (64, 64))

    video_path = "{}/uber/Sample{:04d}.mp4".format(base, i)
    write_video(uber, video_path)

    labels_path = "{}/uber/Sample{:04d}.pkl".format(base, i)
    frame_labels = [d.label for d in data]
    with open(labels_path, 'wb') as f:
        pickle.dump(frame_labels, f)

    print("Wrote {}".format(i))

Wrote 417
Wrote 418
Wrote 419
Wrote 420
Wrote 421
Wrote 422
Wrote 423
Wrote 424
Wrote 425
Wrote 426
Wrote 427
Wrote 428
Wrote 429
Wrote 430
Wrote 431
Wrote 432
Wrote 433
Wrote 434
Wrote 435
Wrote 436
Wrote 437
Wrote 438
Wrote 439
Wrote 440
Wrote 441
Wrote 442
Wrote 443
Wrote 444
Wrote 445
Wrote 446
Wrote 447
Wrote 448
Wrote 449
Wrote 450
Wrote 451
Wrote 452
Wrote 453
Wrote 454
Wrote 455
Wrote 456
Wrote 457
Wrote 458
Wrote 459
Wrote 460
Wrote 461
Wrote 462
Wrote 463
Wrote 464
Wrote 465
Wrote 466
Wrote 467
Wrote 468
Wrote 469
Wrote 470


In [12]:
def foveateFrame(frame, focusLoc, focusSize):
    """Blur ``frame`` progressively with distance from a focus disc.

    Pixels inside the disc of radius ``focusSize`` centred on ``focusLoc``
    are copied unchanged. Every other pixel is replaced by the mean of a
    square neighbourhood whose side grows with the pixel's distance from the
    disc (at least 3, clipped at the frame border).

    Args:
        frame: Input image. NOTE(review): presumably single-channel, since the
            mask built from ``frame.shape`` is passed to
            ``cv2.distanceTransform`` -- confirm.
        focusLoc: Centre of the focus disc, as passed to ``cv2.circle``.
        focusSize: Radius of the focus disc in pixels.

    Returns:
        A copy of ``frame`` with the blur applied.
    """
    height = frame.shape[0]
    width = frame.shape[1]

    # Mask is 255 everywhere except for a filled disc of zeros at the focus.
    mask = 255*np.ones(frame.shape, dtype='uint8')
    cv2.circle(mask, focusLoc, focusSize, 0, -1)

    # Distance of every pixel to the nearest zero pixel, i.e. to the focus disc.
    out = cv2.distanceTransform(mask, cv2.DIST_L2, 3)

    # Output starts as a copy so pixels inside the disc stay untouched.
    filtered = frame.copy()

    for y in range(height):
        for x in range(width):
            # Inside the focus disc: keep the original pixel.
            if out[y,x] == 0.0:
                continue

            # Neighbourhood size equals the distance, but never less than 3.
            mask_val = out[y,x]
            if mask_val <= 3:
                mask_val = 3
            half = int(mask_val/2)

            # Neighbourhood limits (inclusive), clipped to the frame.
            beginx = max(0, x - half)
            beginy = max(0, y - half)
            endx = min(width - 1, x + half)
            endy = min(height - 1, y + half)

            # A plain slice selects the same pixels the former meshgrid lookup did.
            pix = frame[beginy:endy+1, beginx:endx+1].ravel()

            # Calculate the average and set it to be the output
            filtered[y,x] = int(np.mean(pix))

    return filtered

In [25]:
# NOTE(review): `processFrame` and `skel` are not defined in any cell visible
# here, so this cell relies on kernel state from elsewhere -- confirm where they
# come from before re-running.
h,l,r = processFrame(data[1], skel[400])

(240, 347)
(244, 249)
(295, 74)
295 74 20
