In [3]:
import pandas as pd
import os
import ujson
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import sampler

from utils import calc_prf, calc_prf_hand

torch.manual_seed(1)

<torch._C.Generator at 0x7f2dd7892370>

# load/preprocess openpose data into train, val, test

In [80]:
SAMPLE_DF_PATH = 'new_gold_sample_big.json' #Change this
new_gold = pd.read_json(SAMPLE_DF_PATH)
new_gold.head()

Unnamed: 0,level_0,face_openpose,face_openpose_nose,face_present,frame,hand_openpose,hand_openpose_wrist,hand_present,index,vid_name,vid_path,face_keypoints,pose_keypoints,hand_left_keypoints,hand_right_keypoints
0,0,0,1,1,3515,0,,1,0,S_20141112_2426_03.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Samca...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0.471533, 0.0980919, 0.37534799999999996, 0....","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,1,0,0,0,4925,0,,1,1,S_20131127_1310_04.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Samca...,[],[],[],[]
10,10,1,1,1,8785,1,,1,10005,S_20141228_2611_08.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Samca...,"[[0.281036, 0.524218, 0.0931592, 0.281939, 0.5...","[[0.295931, 0.558211, 0.7980659999999999, 0.37...","[[0.319839, 0.901076, 0.47536000000000006, 0.3...","[[0.183019, 0.7725029999999999, 0.0520404, 0.1..."
100,100,0,0,1,14425,0,,0,10087,S_20130619_0802_03.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Samca...,[],[],[],[]
1000,1000,0,0,1,1470,0,,0,10898,S_20141115_2429_01.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Samca...,[],[],[],[]


Folded below: utility functions for face presence calculations

In [81]:
import ntpath

def get_op_xyconf(keypt_lists):
    x = []
    y = []
    conf = []
    for keypt in keypt_lists:
        x.append(keypt[0::3]) 
        y.append(keypt[1::3])
        conf.append(keypt[2::3])
    if x == [] or y == [] or conf == []:
        return [], [], []
    
    return x, y, conf

def get_pose_keypoints(vid_path, frame):
    vid_name = ntpath.basename(vid_path)[:-4]
    frame_num = str(frame).zfill(12)
    filename = f'{vid_name}_{frame_num}_keypoints.json'
    fp = os.path.join('/scratch/users/agrawalk/headcam-algo-output/gold_sample_openpose/', vid_name, filename)
    if not os.path.exists(fp):
        print('near start or end of video')
        return []
    keypts = ujson.load(open(fp, 'r'))
    return [person['pose_keypoints'] for person in keypts['people']]

def get_face_keypoints(vid_path, frame):
    vid_name = ntpath.basename(vid_path)[:-4]
    frame_num = str(frame).zfill(12)
    filename = f'{vid_name}_{frame_num}_keypoints.json'
    fp = os.path.join('/scratch/users/agrawalk/headcam-algo-output/gold_sample_openpose/', vid_name, filename)
    if not os.path.exists(fp):
        print('near start or end of video')
        return []
    keypts = ujson.load(open(fp, 'r'))
    return [person['face_keypoints'] for person in keypts['people']]

def get_hand_left_keypoints(vid_path, frame):
    vid_name = ntpath.basename(vid_path)[:-4]
    frame_num = str(frame).zfill(12)
    filename = f'{vid_name}_{frame_num}_keypoints.json'
    fp = os.path.join('/scratch/users/agrawalk/headcam-algo-output/gold_sample_openpose/', vid_name, filename)
    if not os.path.exists(fp):
        print('near start or end of video')
        return []
    keypts = ujson.load(open(fp, 'r'))
    return [person['hand_left_keypoints'] for person in keypts['people']]

def get_hand_right_keypoints(vid_path, frame):
    vid_name = ntpath.basename(vid_path)[:-4]
    frame_num = str(frame).zfill(12)
    filename = f'{vid_name}_{frame_num}_keypoints.json'
    fp = os.path.join('/scratch/users/agrawalk/headcam-algo-output/gold_sample_openpose/', vid_name, filename)
    if not os.path.exists(fp):
        print('near start or end of video')
        return []
    keypts = ujson.load(open(fp, 'r'))
    return [person['hand_right_keypoints'] for person in keypts['people']]

In [64]:
#Doesn't need to be run; already in the df
# new_gold['face_keypoints'] = new_gold.apply(lambda row: get_face_keypoints(row['vid_path'], row['frame']), axis=1)
# print('pose')
# new_gold['pose_keypoints'] = new_gold.apply(lambda row: get_pose_keypoints(row['vid_path'], row['frame']), axis=1)
# print('left')
# new_gold['hand_left_keypoints'] = new_gold.apply(lambda row: get_hand_left_keypoints(row['vid_path'], row['frame']), axis=1)
# print('right')
# new_gold['hand_right_keypoints'] = new_gold.apply(lambda row: get_hand_right_keypoints(row['vid_path'], row['frame']), axis=1)

pose
left
right


In [37]:
#Functions to be applied row-wise to dataframes to calculate columns

def face_openpose(row):
    return 1 if np.sum(row['face_keypoints']) != 0 else 0

def face_openpose_nose(row):
    nose_keypts = [person_pose[0*3+2] for person_pose in row['pose_keypoints']]
    return 1 if np.sum(nose_keypts) != 0 else 0

def hand_openpose(row):
    return 1 if np.sum(row['hand_left_keypoints']) != 0 or np.sum(row['hand_right_keypoints'])  != 0 else 0

def hand_openpose_wrist(row):
    #turns out to be the same as hand_openpose
    hand_keypts = [np.array(person_pose[[4*3+2, 7*3+2]]) for person_pose in row['pose_keypoints']]
    return 1 if np.sum(hand_keypts) != 0 else 0

"""Note: you need the files for this one; coming soon."""
# def get_keypts_tuple(row, keypt_type, tuple_size=5):
#     vid_name = row['vid_name'][:-4]
#     middle_frame = row['frame']
#     keypts_tuple = []
    
#     for frame in range(middle_frame - tuple_size//2, middle_frame + tuple_size//2 + 1):
#         frame = str(frame).zfill(12)
#         filename = f'{vid_name}_{frame}_keypoints.json'
#         fp = os.path.join('/scratch/users/agrawalk/headcam-algo-output/gold_sample_openpose/', vid_name, filename)
        
#         if not os.path.exists(fp):
#             if frame == middle_frame:
#                 return -1 #if the center frame doesn't exist, mark it for discarding
#             keypts_tuple.append([0]*70*3)
#             continue 
            
#         keypts = ujson.load(open(fp, 'r'))
#         keypts = [person[f'{keypt_type}_keypoints'] for person in keypts['people']]
#         keypts_tuple.append([0]*70*3 if len(keypts) == 0 else keypts[0])
    
#     return keypts_tuple

In [38]:
# new_gold['face_openpose'] = new_gold.apply(face_openpose, axis=1)
# new_gold['face_openpose_nose'] = new_gold.apply(face_openpose_nose, axis=1)
# new_gold['hand_openpose'] = new_gold.apply(hand_openpose, axis=1)
# new_gold['hand_openpose_wrist'] = new_gold.apply(hand_openpose_wrist, axis=1)

In [68]:
print('Face PRF Scores: Raw')
prf = calc_prf(new_gold['face_openpose'], new_gold['face_present'])
print(f'face_openpose: {prf}')
prf = calc_prf(new_gold['face_openpose_nose'], new_gold['face_present'])
print(f'face_openpose_nose: {prf}')
print()

print('Hand PRF Scores: Raw')
prf = calc_prf(new_gold['hand_openpose'], new_gold['hand_present'])
print(f'hand_openpose: {prf}')
# prf = calc_prf(new_gold['hand_openpose_wrist'], new_gold['hand_present'])
# print(f'hand_openpose_wrist: {prf}')

Face PRF Scores: Raw
4407 5792 3204
face_openpose: (0.7270251872021783, 0.5531767955801105, 0.6282968918521423)
6579 5792 4073
face_openpose_nose: (0.6190910472716218, 0.7032113259668509, 0.6584754668175572)

Hand PRF Scores: Raw
5426 10261 4033
hand_openpose: (0.7432731293770733, 0.3930416138777897, 0.5141837190029961)


In [8]:
"""Code coming soon for this"""
# print('Getting face tuples...')
# new_gold['face_tuple'] = new_gold.apply(lambda row: get_keypts_tuple(row, 'face'), axis=1)
# print('Getting pose tuples...')
# new_gold['pose_tuple'] = new_gold.apply(lambda row: get_keypts_tuple(row, 'pose'), axis=1)

Getting face tuples...
Getting pose tuples...


In [79]:
#Next up: (maybe later: tuple of xy+conf, xy+conf) tuple of conf, conf
# X = new_gold['face_tuple'].values #NOTE: need the tuples data for this one-- too big for Github, coming soon.
X = new_gold['face_keypoints'].values 
#Making it a consistent shape
X = np.array([[0] * 210 if x == [] else x[0] for x in X]) #NOTE: Right now this selects only the first person.
#TODO: Change to sum all people's confidences

y = new_gold['face_present'].values
X = np.array([np.array(x) for x in X])
y = np.array([np.array(x) for x in y])

# X = X[:, :, 2::3] #only keypoint confidences, not x/y's (N, seq_len, 210) => (N, seq_len, 70); comment out to keep it
print(X.shape)
X = X[:, 2::3] #only keypoint confidences, not x/y's (N, 210) => (N, 70); comment out to keep it
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(24000, 210)
(24000, 70)
(15360, 70) (15360,) (3840, 70) (3840,) (4800, 70) (4800,)


# create classifiers

In [58]:
mlp = MLPClassifier(hidden_layer_sizes=(100, 100), activation='relu', solver='adam', alpha=0.0001, 
                    batch_size='auto', learning_rate='constant', learning_rate_init=0.001, 
                    power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, 
                    verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, 
                    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, 
                    epsilon=1e-08, n_iter_no_change=10)

In [15]:
logreg = LogisticRegression()

In [85]:
class OpenposeLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super(OpenposeLSTM, self).__init__()
        self.hidden_dim = hidden_dim

#         self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim*2*2, tagset_size)

    def init_hidden(self):
        # This is what we'll initialise our hidden state as
        return (torch.zeros(1, BATCH_SIZE, self.hidden_dim),
                torch.zeros(1, BATCH_SIZE, self.hidden_dim))
    
    def forward(self, keypts):
        keypts = torch.Tensor(keypts)
#         embeds = self.word_embeddings(sentence)
#         print(keypts.shape)
        lstm_out, _ = self.lstm(keypts)
#         print(lstm_out[:, 0, :].shape)

        # concatenating the first and last sequence element outputs 
        # (the ends of the reverse and forward chains, respectively)
        lstm_out = torch.cat((lstm_out[:, 0], lstm_out[:, -1]), dim=1)
#         print(lstm_out.shape)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_space

In [86]:
EMBEDDING_DIM = 70
HIDDEN_DIM = 64
lstm = OpenposeLSTM(EMBEDDING_DIM, HIDDEN_DIM, 2)

# train classifiers on openpose data

In [21]:
def middle_frame(X):
    return X[:, X.shape[1]//2, :]

In [61]:
logreg.fit(middle_frame(X_train), y_train)
mlp.fit(middle_frame(X_train), y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [62]:
print('Logistic regression:')
y_pred = logreg.predict(middle_frame(X_val))
print(classification_report(y_val, y_pred))

print('MLP:')
y_pred = mlp.predict(middle_frame(X_val))
print(classification_report(y_val, y_pred))

#THe F-scores for the positive (1) detections are in the second row.

Logistic regression:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90      1499
           1       0.93      0.19      0.32       402

   micro avg       0.83      0.83      0.83      1901
   macro avg       0.87      0.59      0.61      1901
weighted avg       0.84      0.83      0.78      1901

MLP:
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      1499
           1       0.72      0.37      0.49       402

   micro avg       0.84      0.84      0.84      1901
   macro avg       0.79      0.67      0.70      1901
weighted avg       0.82      0.84      0.82      1901



In [83]:
def train_part34(model, optimizer, epochs=1):
    """
    Train a model on CIFAR-10 using the PyTorch Module API.
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    
    Returns: Nothing, but prints model accuracies during training.
    """
#     loss_fn = nn.MSELoss(size_average=False)
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        print(f'Epoch: {e}')
#             t, x, y = e, X_train, y_train
        for t, (x, y) in enumerate(loader_train):
#             print(x, y)
#             if i == 1:
#                 break
#             else:
#                 i+=1
            model.train()  # put model to training mode
            # Clear stored gradient
            model.zero_grad()

            # Initialise hidden state
            # Don't do this if you want your LSTM to be stateful
            model.hidden = model.init_hidden()
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)
            print(x.shape)
            print(y.shape)

            scores = model(x)
#             loss_fn = nn.LLoss()
            print(scores.shape)
#             loss = loss_fn(scores, y)
#             loss = F.cross_entropy(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each  parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()

            if t % 10 == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
#                 print(f'Val acc: {(model(X_val).max(1)[1] == torch.Tensor(y_val).to(dtype=torch.long)).sum() / len(y_val)}')
                check_accuracy_part34(loader_val, model)
                print()

In [69]:
class RandomDataset(Dataset):
    def __init__(self):
        self.train = True
        X_r = np.random.random((10000, 5, 210))
        y_r = np.random.random((10000,))
        self.len = len(X_r)
        self.x_data = X_r
        self.y_data = y_r
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len
    
class OpenposeTrainDataset(Dataset):
    def __init__(self):
        self.train = True
        self.len = len(X_train)
        self.x_data = X_train
        self.y_data = y_train
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len

class OpenposeValDataset(Dataset):
    def __init__(self):
        self.train = True
        self.len = len(X_val)
        self.x_data = X_val
        self.y_data = y_val
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len

class OpenposeTestDataset(Dataset):
    def __init__(self):
        self.train = False
        self.len = len(X_test)
        self.x_data = X_test
        self.y_data = y_test
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len

In [73]:
BATCH_SIZE = 100
train_data = OpenposeTrainDataset()
test_data = OpenposeTestDataset()
val_data = OpenposeValDataset()
loader_train = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=sampler.SubsetRandomSampler(range(len(train_data))))
loader_test = DataLoader(test_data, batch_size=BATCH_SIZE, sampler=sampler.SubsetRandomSampler(range(len(test_data))))
loader_val = DataLoader(val_data, batch_size=BATCH_SIZE, sampler=sampler.SubsetRandomSampler(range(len(val_data))))

In [80]:
USE_GPU = True

print_every = 100
dtype = torch.float32 # we will be using float throughout this tutorial

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [81]:
learning_rate = 1e-3

betas = (0.9, 0.999)

# optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=betas)
optimizer = optim.SGD(lstm.parameters(), lr=learning_rate)

In [None]:
train_part34(lstm, optimizer, epochs=10)

In [59]:
#Now try on the sequence of frames
logreg.fit((X_train.reshape(X_train.shape[0], -1)), y_train)
mlp.fit((X_train.reshape(X_train.shape[0], -1)), y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [60]:
print('Logistic regression:')
y_pred = logreg.predict((X_val.reshape(X_val.shape[0], -1)))
print(classification_report(y_val, y_pred))

print('MLP:')
y_pred = mlp.predict((X_val.reshape(X_val.shape[0], -1)))
print(classification_report(y_val, y_pred))

#THe F-scores for the positive (1) detections are in the second column.

Logistic regression:
              precision    recall  f1-score   support

           0       0.83      1.00      0.91      1499
           1       0.94      0.26      0.40       402

   micro avg       0.84      0.84      0.84      1901
   macro avg       0.89      0.63      0.66      1901
weighted avg       0.86      0.84      0.80      1901

MLP:
              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1499
           1       0.78      0.42      0.55       402

   micro avg       0.85      0.85      0.85      1901
   macro avg       0.82      0.69      0.73      1901
weighted avg       0.84      0.85      0.83      1901



In [30]:
df.to_json('/scratch/users/agrawalk/headcam-algo-output/alice_sample.json')

In [29]:
df.head()

Unnamed: 0,vid_name,vid_path,frame,face_present,hand_present,face_openpose,face_openpose_nose,hand_openpose,hand_openpose_wrist,face_tuple,pose_tuple
0,A_20141124_2611_01.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Alice...,53210,1,1,0,0,0,1,"[[0.025443499999999997, 0.348607, 0.0002138550...","[[0.0408498, 0.403014, 0.27369099999999996, 0...."
1,A_20140115_1602_01.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Alice...,4655,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
10,A_20130617_0904_01.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Alice...,2190,1,1,1,1,1,1,"[[0.6433479999999999, 0.0514446, 0.000182589, ...","[[0.687879, 0.00672222, 0.706591, 0.6757369999..."
100,A_20150425_3112_01.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Alice...,2070,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1000,A_20130925_1212_01.mp4,/scratch/groups/mcfrank/Home_Headcam_new/Alice...,24145,1,1,1,1,1,1,"[[0.8232600000000001, 0.293527, 0.000214731000...","[[0.765446, 0.24773099999999998, 0.628132, 0.8..."


In [None]:
#Sklearn LSTM

In [42]:
#Sanity checking to see if there's even any extra non-zero info to be gained from looking at surrounding frames (esp. in FN cases)
def extra_info(row):
    return 1 if np.sum(np.array(row['face_tuple'])[[0,1,3,4], :]) != 0 else 0

In [53]:
alice_fp = df.query('face_present == 0 and face_openpose == 1')
len(alice_fp)

698

In [54]:
extra = alice_fp.apply(extra_info, axis=1).values
extra.sum()/len(extra)

0.7091690544412608

In [56]:
alice_fn = df.query('face_present == 1 and face_openpose == 0')
len(alice_fn)

1181

In [57]:
#OK, so there's certainly some to be gained on the FN frames. Why aren't the classifiers picking up on it, then?
extra = alice_fn.apply(extra_info, axis=1).values
extra.sum()/len(extra)

0.2726502963590178