In [2]:
# Setup
import pandas as pd
import numpy as np
import git
# Resolve the project root by searching upward from the current directory for
# the enclosing git repository, so data paths can be built relative to it.
repo = git.Repo(path="./", search_parent_directories=True)
homedir = repo.working_dir

### Load OpenPose Data

In [26]:
# Load the saved OpenPose keypoints from <repo root>/data.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so this
# file must come from a trusted source.
keypoints_path = "{}/data/all_keypoints.npy".format(homedir)
keypoints = np.load(keypoints_path, allow_pickle=True)
keypoints[0]

array(['ESzlIzyO',
       array([[3.32047e+02, 3.98396e+02, 8.52781e-01, ..., 3.91341e+02,
        1.03356e+03, 8.47541e-01],
       [3.32040e+02, 3.98392e+02, 8.51686e-01, ..., 3.91345e+02,
        1.03357e+03, 8.46827e-01],
       [3.32055e+02, 3.98462e+02, 8.60609e-01, ..., 3.91352e+02,
        1.03357e+03, 8.41133e-01],
       ...,
       [3.21681e+02, 4.05430e+02, 9.21052e-01, ..., 3.98331e+02,
        1.03712e+03, 8.62005e-01],
       [3.21715e+02, 4.05454e+02, 9.22338e-01, ..., 3.98333e+02,
        1.03704e+03, 8.61394e-01],
       [3.28646e+02, 3.98296e+02, 8.48421e-01, ..., 3.87965e+02,
        1.03360e+03, 8.37185e-01]])], dtype=object)

In [27]:
# Feature matrix: one row per subject. Column 0 holds the subject id; the
# remaining columns hold the flattened keypoint values of the sampled frames.
# The row count follows the loaded data instead of a hard-coded 383, so the
# fill loop cannot overrun the array or leave unused all-zero rows behind.
FRAMES_PER_CLIP = 60   # frames kept per video
VALUES_PER_FRAME = 75  # presumably 25 keypoints x (x, y, confidence) -- TODO confirm
features_kp = np.zeros((len(keypoints), FRAMES_PER_CLIP * VALUES_PER_FRAME + 1), dtype=object)

In [32]:
# Downsample the keypoints to a fixed-length window per video.
# Each video has framerate = 30, so frames 30-89 are a 2-second window that
# starts 1 second into the clip (i.e. the first second is skipped).
FRAME_START, FRAME_STOP = 30, 90  # half-open slice bounds -> 60 frames
for i, subject in enumerate(keypoints):
    # Each entry pairs a subject id with that subject's per-frame keypoint array.
    subjectId, kp = subject[0], subject[1]

    features_kp[i, 0] = subjectId
    features_kp[i, 1:] = kp[FRAME_START:FRAME_STOP].flatten()

In [44]:
# Wrap the feature matrix in a DataFrame and label column 0 as the subject id
# so it can serve as the join key later on.
df_kp = pd.DataFrame(features_kp).rename(columns={0: 'subjectid'})
df_kp

Unnamed: 0,subjectid,1,2,3,4,5,6,7,8,9,...,4491,4492,4493,4494,4495,4496,4497,4498,4499,4500
0,ESzlIzyO,335.63,405.249,0.895572,290.132,503.095,0.849572,262.146,513.397,0.86198,...,0.376569,509.991,1058.1,0.757028,485.627,1071.88,0.835966,401.921,1030.12,0.89212
1,zN0O4jLu,230.898,248.255,0.856856,318.156,353.015,0.853513,258.749,353.053,0.774952,...,0.668692,206.277,1155.67,0.775272,206.374,1138.38,0.647594,321.438,1113.89,0.63126
2,nM7tQcBS,436.93,84.2963,0.849167,381.013,182.025,0.892465,318.176,168.019,0.817223,...,0.805224,373.935,1030.18,0.754506,339.108,1026.63,0.706043,300.716,988.272,0.812182
3,OuYG4U64,541.525,321.646,0.933987,461.399,457.626,0.900076,408.853,454.199,0.844952,...,0.879541,440.289,1075.43,0.846009,408.952,1075.43,0.874136,363.458,1023.09,0.787195
4,tFSyXitq,607.894,267.412,0.94842,524.122,440.729,0.888483,388.191,440.546,0.783813,...,0.651009,524.325,1833.72,0.543139,471.893,1839,0.54514,419.543,1750.08,0.608088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,bqTsXpKl,276.187,471.733,0.84257,332.062,548.464,0.86469,272.7,548.481,0.871379,...,0.717557,206.472,1047.57,0.862025,206.464,1037.14,0.770676,297.223,1005.69,0.775751
379,UDXlpEgF,468.19,94.8551,0.782617,426.347,230.757,0.822717,373.862,206.547,0.822391,...,0.830474,387.972,908.085,0.88934,366.96,908.177,0.866333,332.044,876.437,0.79478
380,b632WDpF,644.923,141.884,0.908126,560.965,382.903,0.869544,429.859,382.861,0.741821,...,0.799335,602.817,1718.56,0.735241,560.778,1718.44,0.740828,492.83,1650.39,0.68736
381,ybGzWMPA,398.35,206.403,0.923946,352.968,297.207,0.897122,293.692,290.32,0.907,...,0.846899,349.699,918.397,0.831726,328.545,925.398,0.806425,293.832,904.335,0.782418


### Load Precalculated Features

In [49]:
# Load the precalculated features from the original paper as a DataFrame.
# Keep only the first 537 columns (later columns are survey data) and drop the
# 'X', 'orientation' and 'framerate' columns; 'subjectid' stays as the join key.
df_calc = (
    pd.read_csv(r'https://raw.githubusercontent.com/amandaliusa/cs231n/main/data/dataClean_text.csv')
    .iloc[:, :537]
    # Non-inplace drop: avoids mutating a slice of the freshly read frame.
    .drop(columns=['X', 'orientation', 'framerate'])
)
df_calc

Unnamed: 0,subjectid,n,time,time_diff,speed,time_sd,speed_sd,n_sit2stand,time_sit2stand,time_diff_sit2stand,...,neck_max_y_acc_stand2sit,ank_to_hip_dist_sit,ank_to_hip_dist_stand,knee_to_hip_dist_sit,knee_to_hip_dist_stand,height,lknee_angle_first_sit,rknee_angle_first_sit,lknee_angle_first_stand,rknee_angle_first_stand
0,kGhQ1FQc,5,18.752067,-0.275275,0.27,0.631539,0.058177,5,5.972633,-1.618283,...,3.505124,1.081186,1.006474,1.221493,0.869235,0.974188,104.281728,130.765245,174.111505,170.157767
1,KyINSk8e,5,13.346667,-0.216883,0.37,0.395926,0.071269,5,4.337667,-1.101100,...,3.654509,1.057510,1.018101,0.979709,0.735422,0.949425,113.416249,122.046417,183.473800,170.403354
2,ybGzWMPA,5,9.175833,0.108442,0.54,0.152175,0.044839,5,4.304300,0.008342,...,4.730823,1.324459,1.407150,1.272274,1.064869,0.963154,110.360354,116.014708,154.888698,154.736858
3,b632WDpF,4,11.033333,0.133333,0.36,0.234965,0.031573,4,3.833333,-0.700000,...,3.772340,1.523048,1.506615,1.525126,1.145206,0.948291,359.999821,132.874089,179.434883,172.483812
4,UDXlpEgF,5,11.444767,-0.108442,0.44,0.225910,0.045701,5,4.471133,-0.600600,...,5.261429,1.017858,0.947927,0.879847,0.623896,0.993858,113.339615,115.098166,178.141457,171.161810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,tFSyXitq,5,10.066667,0.225000,0.50,0.344867,0.069775,5,4.800000,0.075000,...,4.927668,0.864066,0.852848,1.118739,0.830468,0.966577,94.484956,115.572113,176.582448,175.372981
401,OuYG4U64,5,10.477133,0.116783,0.48,0.178317,0.044853,5,4.771433,-0.150150,...,4.448819,1.151292,1.100129,1.012293,0.837304,0.965469,109.993607,106.238842,182.016620,166.714119
402,nM7tQcBS,5,5.366667,0.058333,0.93,0.082731,0.076364,5,2.633333,0.008333,...,6.866623,1.374656,1.467833,1.375480,1.147340,0.970511,92.538523,108.185109,167.608827,164.889848
403,zN0O4jLu,5,24.290933,0.075075,0.21,0.377854,0.016347,5,12.412400,0.241908,...,2.242874,0.871977,1.034409,0.565355,0.657769,0.962403,106.005854,96.882321,160.433369,152.676643


### Load Survey Data (labels)

In [50]:
# Read the survey results (one OA_check label per subject id) straight from GitHub.
survey_url = r'https://raw.githubusercontent.com/amandaliusa/cs231n/main/data/survey_data.csv'
df_survey = pd.read_csv(survey_url)
df_survey

Unnamed: 0,subjectid,OA_check
0,kGhQ1FQc,1
1,KyINSk8e,0
2,ybGzWMPA,0
3,b632WDpF,0
4,UDXlpEgF,0
...,...,...
400,tFSyXitq,1
401,OuYG4U64,0
402,nM7tQcBS,0
403,zN0O4jLu,0


### Create train/val/test sets

In [73]:
# Combine keypoints, precalculated features and survey labels by subject id.
# DataFrame.join keeps the left frame's rows by default, so every subject in
# the keypoint frame is retained; the label column (OA_check) ends up last.
kp_by_subject = df_kp.set_index('subjectid')
calc_by_subject = df_calc.set_index('subjectid')
survey_by_subject = df_survey.set_index('subjectid')

df_join = kp_by_subject.join(calc_by_subject)
# Cast everything to float32 once all three sources are merged.
df_join = df_join.join(survey_by_subject).astype('float32')
df_join

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,ank_to_hip_dist_sit,ank_to_hip_dist_stand,knee_to_hip_dist_sit,knee_to_hip_dist_stand,height,lknee_angle_first_sit,rknee_angle_first_sit,lknee_angle_first_stand,rknee_angle_first_stand,OA_check
subjectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ESzlIzyO,335.630005,405.248993,0.895572,290.131989,503.095001,0.849572,262.145996,513.396973,0.861980,346.191986,...,0.831638,1.153598,0.657760,0.648148,0.962856,97.261581,99.249542,185.223724,181.791748,0.0
zN0O4jLu,230.897995,248.255005,0.856856,318.156006,353.015015,0.853513,258.748993,353.053009,0.774952,178.574997,...,0.871977,1.034409,0.565355,0.657769,0.962403,106.005852,96.882324,160.433365,152.676636,0.0
nM7tQcBS,436.929993,84.296303,0.849167,381.013000,182.024994,0.892465,318.175995,168.018997,0.817223,426.196991,...,1.374656,1.467833,1.375480,1.147340,0.970511,92.538521,108.185112,167.608826,164.889847,0.0
OuYG4U64,541.525024,321.645996,0.933987,461.398987,457.626007,0.900076,408.852997,454.199005,0.844952,440.260986,...,1.151292,1.100129,1.012293,0.837304,0.965469,109.993607,106.238846,182.016617,166.714127,0.0
tFSyXitq,607.893982,267.411987,0.948420,524.122009,440.729004,0.888483,388.191010,440.545990,0.783813,524.356995,...,0.864066,0.852848,1.118739,0.830468,0.966577,94.484955,115.572113,176.582443,175.372986,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bqTsXpKl,276.187012,471.733002,0.842570,332.062012,548.463989,0.864690,272.700012,548.481018,0.871379,248.317001,...,1.430404,1.485496,1.395664,1.191145,0.943775,100.105484,120.529755,174.666214,174.914749,0.0
UDXlpEgF,468.190002,94.855103,0.782617,426.346985,230.757004,0.822717,373.862000,206.546997,0.822391,370.494995,...,1.017858,0.947927,0.879847,0.623896,0.993858,113.339615,115.098167,178.141464,171.161804,0.0
b632WDpF,644.922974,141.884003,0.908126,560.965027,382.903015,0.869544,429.859009,382.860992,0.741821,555.565002,...,1.523048,1.506615,1.525126,1.145206,0.948291,359.999817,132.874084,179.434875,172.483810,0.0
ybGzWMPA,398.350006,206.403000,0.923946,352.967987,297.207001,0.897122,293.691986,290.320007,0.907000,345.894012,...,1.324459,1.407150,1.272274,1.064869,0.963154,110.360352,116.014709,154.888702,154.736862,0.0


In [71]:
from torch.utils.data import Dataset, DataLoader, sampler

# reference: https://discuss.pytorch.org/t/dataset-from-pandas-without-folder-structure/146816/4
class CustomDataset(Dataset):
    """Map-style dataset backed by a pandas DataFrame.

    Every row is one example: all columns except the last are the features,
    and the last column is the label.
    """

    def __init__(self, dataframe):
        """Store the DataFrame whose rows will be served as examples."""
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows (examples) in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the row at positional ``index``."""
        values = self.dataframe.iloc[index].to_numpy()
        return values[:-1], values[-1]

In [72]:
# 383 examples, split 80/10/10 -> 307 train / 38 val / 38 test
NUM_TRAIN = 307
NUM_VAL = 38

# TODO: apply any transforms (e.g. normalization to the input data)

# Train and validation wrap the same leading rows of df_join; their samplers
# draw from disjoint index ranges. The remaining rows form the test set.
split_point = NUM_TRAIN + NUM_VAL
train_indices = range(NUM_TRAIN)
val_indices = range(NUM_TRAIN, split_point)

train = CustomDataset(dataframe=df_join.iloc[:split_point])
loader_train = DataLoader(
    train,
    batch_size=5,
    sampler=sampler.SubsetRandomSampler(train_indices),
)

val = CustomDataset(dataframe=df_join.iloc[:split_point])
loader_val = DataLoader(
    val,
    batch_size=5,
    sampler=sampler.SubsetRandomSampler(val_indices),
)

# The test loader uses no sampler, so it iterates its rows in order.
test = CustomDataset(dataframe=df_join.iloc[split_point:])
loader_test = DataLoader(test, batch_size=5)

In [74]:
# Sanity check: pull a single batch from the training loader and show its
# shapes and contents. No batch index is needed, so iterate the loader directly.
for data, labels in loader_train:
    print(data.shape, labels.shape)
    print(data, labels)
    break

torch.Size([5, 5033]) torch.Size([5])
tensor([[457.6740, 314.6770,   0.9170,  ..., 129.8364, 172.0761, 168.7090],
        [144.9140, 135.6550,   0.8574,  ..., 131.1964, 182.2669, 176.0522],
        [723.3540, 634.1990,   0.8782,  ..., 111.4833, 191.9402, 175.8809],
        [342.4530, 363.4340,   0.8605,  ..., 120.4656, 176.9057, 174.2657],
        [377.3570, 304.1000,   0.9018,  ..., 133.8226, 175.9757, 178.4486]],
       dtype=torch.float64) tensor([0., 0., 0., 0., 0.], dtype=torch.float64)


### BareBones Model

In [None]:
# Placeholder for barebones model

In [None]:
# Placeholder for model visualization