# Ablation Study

This notebook is an ablation study over the model's input feature sets. We train the same set of classifiers on each feature configuration and compare the results. The configurations are:
- [Counts](#counts)
- [Binary](#binary)
- [Active](#active)
- [Counts + Binary](#counts-binary)
- [Counts + Active](#counts-active)
- [Binary + Active](#binary-active)
- [Counts + Binary + Active](#counts-binary-active)

In [1]:
import os

import pandas as pd
import polars as pl

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

import egoviz.models.evaluation as ev
import egoviz.models.processing as pr
import egoviz.visualize as viz

# Seed passed as random_state to every estimator built in this notebook.
SEED = 42

# Kernel working directory; data paths are resolved relative to it.
cwd = os.getcwd()

# Allow up to 300 rows to be rendered when a DataFrame is displayed.
pd.set_option("display.max_rows", 300)

## Load and Prepare Data

In [2]:
# Load the two pickled prediction dumps; paths are resolved against the kernel's working directory.
binary_pickle_path = os.path.join(cwd, '../data/home_data_all_preds_df.pkl')
counts_pickle_path = os.path.join(cwd, '../data/home_data_all_preds.pkl')
raw_binary = pr.load_pickle(binary_pickle_path)
raw_counts = pr.load_pickle(counts_pickle_path)

# Counts frame: raw predictions -> intermediate frame -> counts frame.
preds_df = pr.generate_df_from_preds(raw_counts)
df_counts_active = pr.generate_counts_df(preds_df)

# Counts-only variant: remove every column whose name contains 'active'.
active_cols_in_counts = [name for name in df_counts_active.columns if 'active' in name]
df_counts = df_counts_active.drop(columns=active_cols_in_counts)

# Binary-presence frame, plus the variant without the 'active' columns.
df_binary_active = pr.generate_binary_presence_df(raw_binary)
active_cols_in_binary = [name for name in df_binary_active.columns if 'active' in name]
df_binary = df_binary_active.drop(columns=active_cols_in_binary)


In [3]:
# Build the combined feature sets by inner-joining frames on the shared 'video'
# key with polars, then converting back to pandas.

# Counts + Binary: clashing column names from the right-hand (binary) frame get
# the '_binary' suffix; the duplicated bookkeeping columns are dropped again.
df_counts_pl = pl.from_pandas(df_counts)
df_binary_pl = pl.from_pandas(df_binary)

df_counts_binary = df_counts_pl.join(df_binary_pl, on='video', how='inner', suffix='_binary').drop(['video_binary', 'adl_binary']).to_pandas()

# Counts + Binary + Active: same join, using the frames that still carry the
# 'active' columns.
df_counts_active_pl = pl.from_pandas(df_counts_active)
df_binary_active_pl = pl.from_pandas(df_binary_active)

df_counts_active_binary = df_counts_active_pl.join(df_binary_active_pl, on='video', how='inner', suffix='_binary').drop(['video_binary', 'adl_binary']).to_pandas()

# Active only: strip every 'count_' column from each source frame, then join
# what is left.
df_active_from_binary = pl.from_pandas(df_binary_active.drop(columns=[col for col in df_binary_active.columns if 'count_' in col]))
# The drop list must be derived from df_counts_active itself: labels that are
# absent from the frame make DataFrame.drop raise KeyError, and 'count_' columns
# that exist only in this frame would otherwise survive into the "active" set.
df_active_from_counts = pl.from_pandas(df_counts_active.drop(columns=[col for col in df_counts_active.columns if 'count_' in col]))

df_active = df_active_from_binary.join(df_active_from_counts, on='video', how='inner', suffix='_counts').drop(['video_counts', 'adl_counts']).to_pandas()

In [4]:
# Apply row-wise min-max scaling to every feature-set frame. Each name is
# rebound to its scaled version; the targets and the inputs are listed in the
# same order.
(
    df_counts,
    df_counts_active,
    df_binary,
    df_binary_active,
    df_counts_binary,
    df_counts_active_binary,
    df_active,
) = [
    pr.row_wise_min_max_scaling(frame)
    for frame in (
        df_counts,
        df_counts_active,
        df_binary,
        df_binary_active,
        df_counts_binary,
        df_counts_active_binary,
        df_active,
    )
]

In [5]:
def get_models():
    """Build a fresh list of (display name, unfitted estimator) pairs.

    New estimator instances are constructed on every call, each with
    ``random_state=SEED``, so every evaluation starts from unfitted models.

    Returns:
        list: Five ``(name, estimator)`` tuples, in the order Gradient
        Boosting, Logistic Regression, MLP, Random Forest, XGBoost.
    """
    gradient_boosting = GradientBoostingClassifier(random_state=SEED)
    logistic_regression = LogisticRegression(
        max_iter=1000,
        random_state=SEED,
        class_weight='balanced',
    )
    mlp = MLPClassifier(
        random_state=SEED,
        learning_rate='adaptive',
        max_iter=1000,
        early_stopping=True,
    )
    random_forest = RandomForestClassifier(random_state=SEED, class_weight='balanced')
    xgboost = XGBClassifier(random_state=SEED)

    return [
        ('Gradient Boosting', gradient_boosting),
        ('Logistic Regression', logistic_regression),
        ('MLP', mlp),
        ('Random Forest', random_forest),
        ('XGBoost', xgboost),
    ]

In [6]:
# One LabelEncoder instance, passed to every ev.evaluate_models call below.
label_encoder = LabelEncoder()

<a id="counts"></a>
## Counts

In [7]:
# Evaluate every model from get_models() on the counts-only features.
results_counts, results_counts_df = ev.evaluate_models(
    get_models(),
    df_counts,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
counts_table = ev.display_pct_table(results_counts_df)
counts_table["AUC"] = [round(model_result.auc, 2) for model_result in results_counts]

counts_table

2024-03-08 12:27:02,548 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-03-08 12:27:02,975 - root - INFO - LOGOCV complete for LogisticRegression
2024-03-08 12:27:06,129 - root - INFO - LOGOCV complete for MLPClassifier
2024-03-08 12:27:15,835 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-03-08 12:27:22,151 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.725213,0.662819,0.233104,0.81,0.91
1,LogisticRegression,0.739945,0.701867,0.140061,0.88,0.9
2,MLPClassifier,0.680514,0.654284,0.204407,0.81,0.88
3,RandomForestClassifier,0.733428,0.653191,0.219334,0.75,0.91
4,XGBClassifier,0.71396,0.669272,0.216977,0.81,0.91


<a id="binary"></a>
## Binary

In [8]:
# Evaluate every model from get_models() on the binary-only features.
results_binary, results_binary_df = ev.evaluate_models(
    get_models(),
    df_binary,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
binary_table = ev.display_pct_table(results_binary_df)
binary_table["AUC"] = [round(model_result.auc, 2) for model_result in results_binary]

binary_table

2024-03-08 12:28:14,980 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-03-08 12:28:15,497 - root - INFO - LOGOCV complete for LogisticRegression
2024-03-08 12:28:17,744 - root - INFO - LOGOCV complete for MLPClassifier
2024-03-08 12:28:23,675 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-03-08 12:28:28,028 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.762649,0.697337,0.1818,0.81,0.91
1,LogisticRegression,0.762675,0.726209,0.151507,0.94,0.93
2,MLPClassifier,0.685295,0.647769,0.237041,0.81,0.91
3,RandomForestClassifier,0.709992,0.642909,0.240522,0.81,0.91
4,XGBClassifier,0.727006,0.691439,0.175654,0.88,0.91


<a id="active"></a>
## Active

In [9]:
# Evaluate every model from get_models() on the active-only features.
# This is the only configuration evaluated on a dropna() copy: rows that
# contain NaN are excluded before the models see the data.
df_active_complete = df_active.dropna()
results_active, results_active_df = ev.evaluate_models(
    get_models(),
    df_active_complete,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
active_table = ev.display_pct_table(results_active_df)
active_table["AUC"] = [round(model_result.auc, 2) for model_result in results_active]

active_table

2024-03-08 12:28:50,888 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-03-08 12:28:51,354 - root - INFO - LOGOCV complete for LogisticRegression
2024-03-08 12:28:54,394 - root - INFO - LOGOCV complete for MLPClassifier
2024-03-08 12:28:58,390 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-03-08 12:29:03,185 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.597022,0.585769,0.168044,0.88,0.86
1,LogisticRegression,0.65132,0.646201,0.154845,0.88,0.9
2,MLPClassifier,0.673496,0.594715,0.204839,0.75,0.88
3,RandomForestClassifier,0.629956,0.595136,0.178504,0.75,0.83
4,XGBClassifier,0.625893,0.594528,0.167676,0.81,0.86


<a id="counts-binary"></a>
## Counts + Binary

In [10]:
# Evaluate every model from get_models() on the joined counts + binary features.
results_counts_binary, results_counts_binary_df = ev.evaluate_models(
    get_models(),
    df_counts_binary,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
counts_binary_table = ev.display_pct_table(results_counts_binary_df)
counts_binary_table["AUC"] = [round(model_result.auc, 2) for model_result in results_counts_binary]

counts_binary_table

2024-03-08 12:33:03,319 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-03-08 12:33:03,920 - root - INFO - LOGOCV complete for LogisticRegression
2024-03-08 12:33:09,131 - root - INFO - LOGOCV complete for MLPClassifier
2024-03-08 12:33:22,119 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-03-08 12:33:31,329 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.748624,0.692142,0.205406,0.81,0.91
1,LogisticRegression,0.75632,0.724335,0.127021,0.94,0.91
2,MLPClassifier,0.722598,0.682213,0.182587,0.88,0.9
3,RandomForestClassifier,0.761328,0.686863,0.227894,0.81,0.9
4,XGBClassifier,0.702253,0.680697,0.171301,0.81,0.91


<a id="counts-active"></a>
## Counts + Active

In [11]:
# Evaluate every model from get_models() on the counts + active features.
results_counts_active, results_counts_active_df = ev.evaluate_models(
    get_models(),
    df_counts_active,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
counts_active_table = ev.display_pct_table(results_counts_active_df)
counts_active_table["AUC"] = [round(model_result.auc, 2) for model_result in results_counts_active]

counts_active_table

2024-03-08 12:35:59,917 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-03-08 12:36:00,516 - root - INFO - LOGOCV complete for LogisticRegression
2024-03-08 12:36:05,685 - root - INFO - LOGOCV complete for MLPClassifier
2024-03-08 12:36:14,525 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-03-08 12:36:22,594 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.759874,0.704178,0.185865,0.88,0.92
1,LogisticRegression,0.765218,0.732107,0.13237,0.94,0.91
2,MLPClassifier,0.755866,0.699582,0.17852,0.88,0.91
3,RandomForestClassifier,0.718028,0.671801,0.22611,0.81,0.92
4,XGBClassifier,0.738499,0.700221,0.195188,0.88,0.92


<a id="binary-active"></a>
## Binary + Active

In [12]:
# Evaluate every model from get_models() on the binary + active features.
results_binary_active, results_binary_active_df = ev.evaluate_models(
    get_models(),
    df_binary_active,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
binary_active_table = ev.display_pct_table(results_binary_active_df)
binary_active_table["AUC"] = [round(model_result.auc, 2) for model_result in results_binary_active]

binary_active_table

2024-03-08 12:37:25,459 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-03-08 12:37:26,125 - root - INFO - LOGOCV complete for LogisticRegression
2024-03-08 12:37:29,903 - root - INFO - LOGOCV complete for MLPClassifier
2024-03-08 12:37:35,636 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-03-08 12:37:40,775 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.774937,0.72248,0.190025,0.88,0.92
1,LogisticRegression,0.811589,0.784882,0.119884,1.0,0.94
2,MLPClassifier,0.788354,0.728143,0.220448,0.81,0.94
3,RandomForestClassifier,0.770547,0.68067,0.236352,0.81,0.92
4,XGBClassifier,0.799526,0.744593,0.17177,0.88,0.93


<a id="counts-binary-active"></a>
## Counts + Binary + Active

In [13]:
# Evaluate every model from get_models() on the joined counts + binary + active features.
results_counts_active_binary, results_counts_active_binary_df = ev.evaluate_models(
    get_models(),
    df_counts_active_binary,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
counts_active_binary_table = ev.display_pct_table(results_counts_active_binary_df)
counts_active_binary_table["AUC"] = [
    round(model_result.auc, 2) for model_result in results_counts_active_binary
]

counts_active_binary_table

2024-03-08 12:42:20,836 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-03-08 12:42:22,140 - root - INFO - LOGOCV complete for LogisticRegression
2024-03-08 12:42:29,110 - root - INFO - LOGOCV complete for MLPClassifier
2024-03-08 12:42:41,254 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-03-08 12:42:53,057 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.784352,0.715409,0.193198,0.81,0.93
1,LogisticRegression,0.800787,0.770628,0.126363,0.94,0.92
2,MLPClassifier,0.721186,0.705462,0.159167,0.94,0.91
3,RandomForestClassifier,0.731326,0.674775,0.221774,0.81,0.92
4,XGBClassifier,0.711012,0.70203,0.168771,0.88,0.93
