In [9]:
import pickle
import pandas as pd
import numpy as np

SEED = 0

In [2]:
data = pickle.load(open('../data/all_data_processed.pkl', 'rb'))

### Let's test if ADLs can be predicted using single images first...

In [3]:
# create data df
df = pd.DataFrame(columns=['classes', 'active', 'adl', 'image'])

for id, dets in data.items():
    label = id.split('_', 1)[0]
    image = id.split('_', 1)[1]
    classes = dets['detic_data']['classes']
    active = dets['detic_data']['active']

    row = {'classes': classes, 'active': active, 'adl': label, 'image': image}

    df.loc[len(df)] = row

# one hot encode the classes but add counts for each class

# create a list of all classes
all_classes = []
for classes in df['classes']:
    all_classes.extend(classes)
    
# remove duplicates
all_classes = list(set(all_classes))

# create a column for each class
for c in all_classes:
    df[c] = 0

# loop through rows and set the value of the class column to the 
# number of times it appears in the classes column
for i, row in df.iterrows():
    for c in row['classes']:
        df.at[i, c] += 1

df.head()

Unnamed: 0,classes,active,adl,image,0,1,2,3,4,5,...,19,20,21,22,23,24,25,26,27,28
0,"[17, 15, 11, 11, 25, 5, 17, 21, 21, 24, 25, 26...","[True, False, False, False, False, False, Fals...",communication-mgmt,SCI06-7--11_frame49,0,0,0,0,0,1,...,0,0,3,0,0,3,2,1,0,0
1,"[17, 11, 11, 12, 26, 12, 10, 26, 9, 15, 26, 27...","[True, False, False, False, False, False, Fals...",communication-mgmt,SCI06-7--11_frame98,0,0,0,0,0,0,...,0,0,0,0,0,0,3,3,1,0
2,"[17, 12, 11, 11, 12, 16, 26, 10, 12, 9, 13, 26...","[True, False, False, False, False, False, Fals...",communication-mgmt,SCI06-7--11_frame196,0,0,0,0,0,0,...,0,0,0,0,0,1,0,3,0,0
3,"[17, 11, 12, 12, 26, 15, 11, 10, 26, 10, 25, 9...","[True, False, False, False, False, False, Fals...",communication-mgmt,SCI06-7--11_frame147,0,0,0,0,0,1,...,0,0,2,0,0,0,1,3,0,0
4,"[17, 11, 10, 11, 12, 26, 26, 12, 26, 25, 11, 7...","[True, False, False, False, False, False, Fals...",communication-mgmt,SCI06-7--11_frame294,0,0,0,0,0,0,...,0,0,1,0,0,0,1,4,1,0


### Naive classififcation without active objects

In [4]:
from sklearn.model_selection import train_test_split

X = df.drop(['classes', 'active', 'adl'], axis=1)
y = df['adl']

# split into train and test sets stratified by y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)

In [5]:
# train a model
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, random_state=SEED)
clf.fit(X_train.drop(columns=['image']), y_train)

In [6]:
# evaluate model
from egoviz.models.evaluation import evaluate_model, evaluate_k_fold

report, cm, preds_df = evaluate_model(clf, X_test.drop(columns=['image']), y_test)
print(report); print(cm)

                      precision    recall  f1-score   support

  communication-mgmt       0.68      0.78      0.73        41
 functional-mobility       0.53      0.35      0.42        49
grooming-health-mgmt       0.63      0.63      0.63        46
     home-management       0.57      0.51      0.54        63
       leisure-other       0.78      0.70      0.74        20
   meal-prep-cleanup       0.67      0.86      0.75        79
        self-feeding       0.82      0.77      0.80        53

            accuracy                           0.66       351
           macro avg       0.67      0.66      0.66       351
        weighted avg       0.66      0.66      0.65       351

[[32  3  0  0  2  1  3]
 [ 3 17  6  9  0 11  3]
 [ 4  1 29  9  1  2  0]
 [ 6  7  4 32  0 13  1]
 [ 0  3  0  3 14  0  0]
 [ 0  1  6  2  0 68  2]
 [ 2  0  1  1  1  7 41]]


In [7]:
# use k-fold cross validation to train a new model using f1 score

clf2 = RandomForestClassifier(n_estimators=100, random_state=SEED)
evaluate_k_fold(clf2, X, y, k=5)

f1_macro: 0.6881551428971046 +/- 0.03305029163875453


Unnamed: 0,fit_time,score_time,test_f1_macro,test_precision_macro,test_recall_macro,model,mean_f1_macro,std_f1_macro
0,0.502062,0.029336,0.731395,0.752493,0.718922,RandomForestClassifier,0.69,0.04
1,0.320218,0.031514,0.666953,0.68219,0.6794,RandomForestClassifier,0.69,0.04
2,0.550992,0.0276,0.700584,0.715248,0.69399,RandomForestClassifier,0.69,0.04
3,0.333658,0.032472,0.636325,0.648722,0.631454,RandomForestClassifier,0.69,0.04
4,0.313042,0.026697,0.705519,0.71916,0.699249,RandomForestClassifier,0.69,0.04


### Lets apply a weighting to active objects to see if it improves performance

In [9]:
# create data df
df2 = pd.DataFrame(columns=['classes', 'active', 'adl', 'image'])

for id, dets in data.items():
    label = id.split('_', 1)[0]
    image = id.split('_', 1)[1]
    classes = dets['detic_data']['classes']
    active = dets['detic_data']['active']

    row = {'classes': classes, 'active': active, 'adl': label, 'image': image}

    df2.loc[len(df2)] = row

# one hot encode the classes but add counts for each class

# create a list of all classes
all_classes = []
for classes in df2['classes']:
    all_classes.extend(classes)
    
# remove duplicates
all_classes = list(set(all_classes))

# create a column for each class
for c in all_classes:
    df2[c] = 0

# loop through rows and set the value of the class column to the
# number of times it appears in the classes column

WEIGHT = 2

for i, row in df2.iterrows():
    for c in row['classes']:
        if c in row['active']:
            df2.at[i, c] += WEIGHT
        else:
            df2.at[i, c] += 1

In [10]:
from sklearn.model_selection import train_test_split

X2 = df2.drop(['classes', 'active', 'adl'], axis=1)
y2 = df2['adl']

# split into train and test sets stratified by y
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, stratify=y2, random_state=SEED)

In [11]:
# train a model
from sklearn.ensemble import RandomForestClassifier

clf3 = RandomForestClassifier(n_estimators=100, random_state=SEED)
clf3.fit(X_train2.drop(columns=['image']), y_train2)

# evaluate model
report2, cm2, preds_df2 = evaluate_model(clf3, X_test2.drop(columns=['image']), y_test2)
print(report2); print(cm2)

                      precision    recall  f1-score   support

  communication-mgmt       0.71      0.83      0.76        41
 functional-mobility       0.54      0.39      0.45        49
grooming-health-mgmt       0.72      0.67      0.70        46
     home-management       0.56      0.54      0.55        63
       leisure-other       0.89      0.80      0.84        20
   meal-prep-cleanup       0.66      0.84      0.74        79
        self-feeding       0.85      0.74      0.79        53

            accuracy                           0.68       351
           macro avg       0.70      0.69      0.69       351
        weighted avg       0.68      0.68      0.68       351

[[34  2  0  1  0  1  3]
 [ 2 19  6 10  0 11  1]
 [ 3  2 31  8  1  1  0]
 [ 7  8  1 34  0 12  1]
 [ 0  2  0  2 16  0  0]
 [ 0  2  4  5  0 66  2]
 [ 2  0  1  1  1  9 39]]


In [12]:
clf4 = RandomForestClassifier(n_estimators=100, random_state=SEED)
evaluate_k_fold(clf4, X2, y2, k=5)

f1_macro: 0.6857261266415506 +/- 0.03746267964070704


Unnamed: 0,fit_time,score_time,test_f1_macro,test_precision_macro,test_recall_macro,model,mean_f1_macro,std_f1_macro
0,0.315834,0.023537,0.71703,0.737455,0.7078,RandomForestClassifier,0.69,0.04
1,0.267901,0.024533,0.631483,0.630496,0.646763,RandomForestClassifier,0.69,0.04
2,0.466937,0.042251,0.714253,0.730498,0.705585,RandomForestClassifier,0.69,0.04
3,0.278329,0.024435,0.649275,0.654524,0.648063,RandomForestClassifier,0.69,0.04
4,0.266001,0.024449,0.71659,0.729742,0.708541,RandomForestClassifier,0.69,0.04
