In [1]:
import numpy as np
import pandas as pd
import os
import re
import ujson
from sklearn import metrics
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

# Data handling

In [2]:
# label the sentences as one of the three types: polar, wh, statement
def label_sentence_type(s):
    if re.search("\?", s):
        if re.search("\?[a-zA-Z]", s):
            return "statement"
        elif re.search("[wW](ho|hat|hen|here|hy|hich)|[hH]ow", s):
            return "wh"
        else:
            return "polar"
    else:
        return "statement"

# columns for the y-coord of each eyebrow keypoint (left: 17-21, right: 22-26), subtracted by top nose (27)
# one col for SENTENCE_NAME
keypoints_range = (3 * pd.DataFrame.from_dict(list(range(17,28))) + 1)[0]

# read the keypoint data for a given sentence
def extract_keypoints(sentence_name):
    folder = 'openpose_output/json/' + sentence_name + '/'
    json_files = map(lambda file: folder + file, os.listdir(folder))
    extractor = lambda file: pd.DataFrame(pd.read_json(file, encoding='unicode_escape', encoding_errors='replace')['people'].iloc[0]['face_keypoints_2d']).transpose()[keypoints_range]
    kp = pd.concat(map(extractor, json_files), sort=False, ignore_index=True)
    kp.columns = list(range(17,28))
    kp['SENTENCE_NAME'] = sentence_name
    return kp

In [3]:
# read the combined csv file to load all of the sentence data
csvname = 'how2sign_realigned_combined.csv'
with open(csvname, 'r', encoding='utf8') as csvfile:
    # construct the sentence data frame
    sentence_data = pd.read_csv(csvfile, sep='\t')
    # label the sentences
    sentence_data['SENTENCE_TYPE'] = sentence_data['SENTENCE'].apply(label_sentence_type)

# list all of the json files (frame keypoint data)
json_files = pd.DataFrame(data={'FOLDER_NAME': os.listdir('openpose_output/json/')})

# remove any entry whose SENTENCE_NAME that doesn't have matching video data
sentence_data = pd.merge(sentence_data, json_files['FOLDER_NAME'], how='inner', left_on='SENTENCE_NAME', right_on='FOLDER_NAME').drop(columns=['FOLDER_NAME'])

In [4]:
N = 450
split_frac = 0.8
seed = 42069

# form working dataset: take 450 random statements and all of the non-statements (polar and wh)
wd = pd.concat([sentence_data[sentence_data['SENTENCE_TYPE'] == 'statement'].sample(n=N, random_state=seed),
                sentence_data[sentence_data['SENTENCE_TYPE'] != 'statement']], ignore_index=True)
wd['SENTENCE_TYPE'].value_counts()

polar        456
statement    450
wh           435
Name: SENTENCE_TYPE, dtype: int64

In [5]:
wd

Unnamed: 0,VIDEO_ID,VIDEO_NAME,SENTENCE_ID,SENTENCE_NAME,START_REALIGNED,END_REALIGNED,SENTENCE,SENTENCE_TYPE
0,BfS0S717ykQ,BfS0S717ykQ-2-rgb_front,BfS0S717ykQ_8,BfS0S717ykQ_8-2-rgb_front,44.19,47.96,It will also help you grow hair by stimulating...,statement
1,1KKlZKlWdTM,1KKlZKlWdTM-5-rgb_front,1KKlZKlWdTM_15,1KKlZKlWdTM_15-5-rgb_front,102.71,107.43,"And lastly your lip gloss, that the most impor...",statement
2,1bjHvgrcZu4,1bjHvgrcZu4-8-rgb_front,1bjHvgrcZu4_2,1bjHvgrcZu4_2-8-rgb_front,21.29,36.66,"Basically, take some kind of wasp, bee killer ...",statement
3,15HbVwQP1Qw,15HbVwQP1Qw-5-rgb_front,15HbVwQP1Qw_16,15HbVwQP1Qw_16-5-rgb_front,104.52,120.34,"You want to, you want to be careful with acne ...",statement
4,0yeKUlT9LnU,0yeKUlT9LnU-5-rgb_front,0yeKUlT9LnU_1,0yeKUlT9LnU_1-5-rgb_front,11.73,17.05,"In soccer, it's very important for you to warm...",statement
...,...,...,...,...,...,...,...,...
1336,cw5evdziBB4,cw5evdziBB4-8-rgb_front,cw5evdziBB4_3,cw5evdziBB4_3-8-rgb_front,19.58,23.02,So Molinia do you care to demonstrate with me?,polar
1337,cw5evdziBB4,cw5evdziBB4-8-rgb_front,cw5evdziBB4_6,cw5evdziBB4_6-8-rgb_front,33.7,35.26,Can you swing your legs around?,polar
1338,eLv9Uhs89IQ,eLv9Uhs89IQ-8-rgb_front,eLv9Uhs89IQ_0_1,eLv9Uhs89IQ_0_1-8-rgb_front,1.87,8.16,"Hello, have you ever wondered how to prepare f...",wh
1339,eahjYz2685g,eahjYz2685g-8-rgb_front,eahjYz2685g_14,eahjYz2685g_14-8-rgb_front,85.09,86.4,"Ok, so what?",wh


In [6]:
# extract keypoints for the working dataset
kp = pd.concat(map(extract_keypoints, wd['SENTENCE_NAME']), sort=False, ignore_index=True)
kp

Unnamed: 0,17,18,19,20,21,22,23,24,25,26,27,SENTENCE_NAME
0,296.711,294.382,294.382,295.935,299.429,296.711,292.052,289.723,290.111,293.605,311.077,BfS0S717ykQ_8-2-rgb_front
1,272.412,267.796,265.950,267.334,269.181,269.181,265.488,265.488,268.719,273.335,283.952,BfS0S717ykQ_8-2-rgb_front
2,281.313,276.691,274.590,275.430,278.371,276.271,272.489,271.229,274.590,280.472,293.497,BfS0S717ykQ_8-2-rgb_front
3,286.144,281.063,278.098,278.522,281.486,278.098,274.287,273.440,274.711,280.639,295.461,BfS0S717ykQ_8-2-rgb_front
4,284.804,279.462,278.229,278.229,280.695,277.818,273.297,272.475,275.352,280.284,295.079,BfS0S717ykQ_8-2-rgb_front
...,...,...,...,...,...,...,...,...,...,...,...,...
159034,262.350,256.261,255.754,258.291,261.843,258.291,252.202,249.158,246.621,250.680,271.991,fE6xxSbjVV8_7-8-rgb_front
159035,262.701,256.232,256.232,259.218,263.199,261.706,255.237,251.257,250.261,253.247,275.141,fE6xxSbjVV8_7-8-rgb_front
159036,262.953,257.183,257.183,260.549,263.914,261.030,255.740,251.413,249.490,252.855,275.454,fE6xxSbjVV8_7-8-rgb_front
159037,262.318,256.284,255.781,258.295,261.815,258.295,252.261,248.741,246.730,250.753,271.871,fE6xxSbjVV8_7-8-rgb_front


# Simple max and average models

In [7]:
# brow distances
def brow_dist(kp):
    pts = kp[['SENTENCE_NAME', 'SENTENCE_TYPE']].copy()
    pts['d_in'] = ((kp[27] - kp[21]) + (kp[27] - kp[22])) / 2
    pts['d_out'] = ((kp[27] - kp[18]) + (kp[27] - kp[25])) / 2
    pts['d_avg'] = (pts['d_in'] + pts['d_out']) / 2
    return pts

# helper function to help determine the sentence type
def min_label(pt):
    if pt['dpo'] < pt['dwh']:
        if pt['dpo'] < pt['dst']:
            return 'polar'
    else:
        if pt['dwh'] < pt['dst']:
            return 'wh'
    return 'statement'

# p is the test set keypoints
def predictor(p, model, pts):
    # apply the model to p
    pts_p = model(p)

    # calculate the distance between pts_p and polar question
    pts_p['dpo'] = (pts_p['d_avg'] - pts.loc['polar', 'd_avg']).abs()
    # calculate the distance between pts_p and wh question
    pts_p['dwh'] = (pts_p['d_avg'] - pts.loc['wh', 'd_avg']).abs()
    # calculate the distance between pts_p and statement
    pts_p['dst'] = (pts_p['d_avg'] - pts.loc['statement', 'd_avg']).abs()
    
    # apply min_label to the 3 columns row-by-row (axis=1)
    pts_p['PREDICT_SENTENCE_TYPE'] = pts_p[['dpo', 'dwh', 'dst']].apply(min_label, axis=1)
    
    return pts_p

def simple_max(kp):
    # calculate the ptserence score for every frame and take the max across the frames of each sentence
    pts = brow_dist(kp)[['SENTENCE_NAME', 'SENTENCE_TYPE', 'd_avg']].groupby('SENTENCE_NAME').max()
    # return 3 scores corresponding to the 3 sentence types
    return pts

def simple_average(kp):
    # calculate the ptserence score for every frame and take the average across the frames of each sentence
    pts = brow_dist(kp)[['SENTENCE_NAME', 'SENTENCE_TYPE', 'd_avg']].groupby('SENTENCE_NAME').mean(numeric_only=True)
    # reinsert the sentence_type column
    mapping = kp[['SENTENCE_NAME', 'SENTENCE_TYPE']].groupby('SENTENCE_NAME').apply(lambda x: x.iloc[0])
    pts = pts.merge(mapping['SENTENCE_TYPE'], on='SENTENCE_NAME')
    # return 3 scores corresponding to the 3 sentence types
    return pts


# Logistic regression max and average models

In [8]:
# subtract keypoint 27 (top of nose) from all other eyebrow points
def sub_all(kp):
    pts = kp.copy()
    for i in range(17,27):
        pts[i] = pts[27] - pts[i]
    return pts

def max_label(prob):
    if prob['polar'] > prob['wh']:
        if prob['polar'] > prob['statement']:
            return 'polar'
    else:
        if prob['wh'] > prob['statement']:
            return 'wh'
    return 'statement'

# train logistic regression models
def train_lr(kp):
    pts = sub_all(kp)
    lr = LogisticRegression(max_iter=1000).fit(pts[range(17,27)], pts['SENTENCE_TYPE'])
    return lr

# prediction function for model 3 - logistic max
def predict_logit_max(p, lr):
    pts = sub_all(p)
    sentence_types = lr.classes_
    probs = lr.predict_proba(pts[range(17,27)])
    probs = pts.merge(pd.DataFrame(data=probs, columns=sentence_types), left_index=True, right_index=True)
    probs = probs[['SENTENCE_NAME', 'polar', 'statement', 'wh']].groupby('SENTENCE_NAME').max()
    probs['PREDICT_SENTENCE_TYPE'] = probs.apply(max_label, axis=1)
    return probs

# prediction function for model 4 - logistic average
def predict_logit_average(p, lr):
    pts = sub_all(p)
    sentence_types = lr.classes_
    probs = lr.predict_proba(pts[range(17,27)])
    probs = pts.merge(pd.DataFrame(data=probs, columns=sentence_types), left_index=True, right_index=True)
    probs = probs[['SENTENCE_NAME', 'polar', 'statement', 'wh']].groupby('SENTENCE_NAME').mean(numeric_only=True)
    probs['PREDICT_SENTENCE_TYPE'] = probs.apply(max_label, axis=1)
    return probs


# Train all models

In [9]:
# use StratifiedShuffleSplit to split each of the types into training and test sets
sss = StratifiedShuffleSplit(n_splits=10, test_size=1-split_frac, train_size=split_frac, random_state=seed)
splitted = sss.split(wd, wd['SENTENCE_TYPE'])
acc = pd.DataFrame()
predictions = pd.DataFrame()

for i, (train, test) in enumerate(splitted):
    # label the working data set with the train/test splits
    # 'SET' indicates whether it's in train or test
    wd.loc[train,'SET'] = 'train'
    wd.loc[test,'SET'] = 'test'
    
    # join keypoint data to the train set
    kp_train = kp.merge(wd[wd['SET'] == 'train'][['SET', 'SENTENCE_NAME', 'SENTENCE_TYPE']], on='SENTENCE_NAME')
    # join keypoint data to the test set
    kp_test = kp.merge(wd[wd['SET'] == 'test'][['SET', 'SENTENCE_NAME', 'SENTENCE_TYPE']], on='SENTENCE_NAME')

    # 0. Basline model that maxes keypoint 17 over a sentence (just to group the data by sentence_name)
    # but then ignores that keypoint entirely and always guesses statement
    results0 = kp_test.copy()[[17, 'SENTENCE_NAME', 'SENTENCE_TYPE']].groupby('SENTENCE_NAME').max()
    # null hypthesis: always guess 'statement'
    results0['PREDICT_SENTENCE_TYPE'] = 'statement'
    # accuracy
    acc.loc[i, 'MODEL0'] = metrics.accuracy_score(results0['PREDICT_SENTENCE_TYPE'],results0['SENTENCE_TYPE'])
    
    # 1. Simple max
    # train
    predictor1 = lambda p: predictor(p, simple_max, simple_max(kp_train).groupby('SENTENCE_TYPE').mean())
    # test
    results1 = predictor1(kp_test)
    # accuracy
    acc.loc[i, 'simple_max'] = metrics.accuracy_score(results1['PREDICT_SENTENCE_TYPE'],results1['SENTENCE_TYPE'])
    
    # 2. Simple average
    # train
    predictor2 = lambda p: predictor(p, simple_average, simple_average(kp_train).groupby('SENTENCE_TYPE').mean(numeric_only=True))
    # test 
    results2 = predictor2(kp_test)
    # accuracy
    acc.loc[i, 'simple_average'] = metrics.accuracy_score(results2['PREDICT_SENTENCE_TYPE'],results2['SENTENCE_TYPE'])
    
    # train logistic regression models
    lr = train_lr(kp_train)
    
    # 3. Logistic max
    # test
    results3 = wd[wd['SET'] == 'test'][['SENTENCE_NAME', 'SENTENCE_TYPE']].merge(predict_logit_max(kp_test, lr), on='SENTENCE_NAME')
    # accuracy
    acc.loc[i, 'logit_max'] = metrics.accuracy_score(results3['PREDICT_SENTENCE_TYPE'],results3['SENTENCE_TYPE'])
    
    # 4. Logistic average
    # test
    results4 = wd[wd['SET'] == 'test'][['SENTENCE_NAME', 'SENTENCE_TYPE']].merge(predict_logit_average(kp_test, lr), on='SENTENCE_NAME')
    # accuracy
    acc.loc[i, 'logit_average'] = metrics.accuracy_score(results4['PREDICT_SENTENCE_TYPE'],results4['SENTENCE_TYPE'])

acc

Unnamed: 0,MODEL0,simple_max,simple_average,logit_max,logit_average
0,0.334572,0.401487,0.442379,0.360595,0.36803
1,0.334572,0.386617,0.427509,0.33829,0.345725
2,0.334572,0.371747,0.438662,0.342007,0.35316
3,0.334572,0.39777,0.427509,0.356877,0.349442
4,0.334572,0.35316,0.408922,0.33829,0.35316
5,0.334572,0.405204,0.431227,0.33829,0.33829
6,0.334572,0.33829,0.36803,0.345725,0.356877
7,0.334572,0.334572,0.442379,0.33829,0.342007
8,0.334572,0.408922,0.438662,0.349442,0.356877
9,0.334572,0.375465,0.453532,0.345725,0.35316


In [10]:
acc.mean()

MODEL0            0.334572
simple_max        0.377323
simple_average    0.427881
logit_max         0.345353
logit_average     0.351673
dtype: float64