# Ablation Study

This notebook presents an ablation study over the model's input features. We train the same set of classifiers on each feature configuration and compare the results. The configurations are:
- [Counts](#counts)
- [Binary](#binary)
- [Active](#active)
- [Counts + Binary](#counts-binary)
- [Counts + Active](#counts-active)
- [Binary + Active](#binary-active)
- [Counts + Binary + Active](#counts-binary-active)

In [1]:
import os

import pandas as pd
import polars as pl

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

import egoviz.models.evaluation as ev
import egoviz.models.processing as pr
import egoviz.visualize as viz

# Random seed passed to every estimator built in this notebook.
SEED = 42

# Working directory the kernel was started in; relative data paths are resolved against it.
cwd = os.getcwd()

# Raise the pandas row-display limit for the tables rendered in this notebook.
pd.set_option("display.max_rows", 300)

## Load and Prepare Data

In [2]:
# Load the two pickled prediction artifacts from the data directory next to the notebook.
binary_preds_path = os.path.join(cwd, '../data/home_data_all_preds_df.pkl')
counts_preds_path = os.path.join(cwd, '../data/home_data_all_preds.pkl')

raw_binary = pr.load_pickle(binary_preds_path)
raw_counts = pr.load_pickle(counts_preds_path)

# Counts features: build the full frame first, then derive a variant without
# any column whose name contains 'active'.
preds_df = pr.generate_df_from_preds(raw_counts)
df_counts_active = pr.generate_counts_df(preds_df)
counts_active_cols = [name for name in df_counts_active.columns if 'active' in name]
df_counts = df_counts_active.drop(columns=counts_active_cols)

# Binary-presence features: same two variants (with / without 'active' columns).
df_binary_active = pr.generate_binary_presence_df(raw_binary)
binary_active_cols = [name for name in df_binary_active.columns if 'active' in name]
df_binary = df_binary_active.drop(columns=binary_active_cols)



In [3]:
def _drop_if_present(frame, names):
    """Return the polars ``frame`` without those of ``names`` it actually contains.

    Depending on the polars version, a join may not emit a suffixed copy of the
    join key, and ``drop`` may raise when asked to remove a missing column.
    Filtering the names first keeps the cell working in either case.
    """
    present = [name for name in names if name in frame.columns]
    return frame.drop(present) if present else frame


# --- Counts + Binary -------------------------------------------------------
df_counts_pl = pl.from_pandas(df_counts)
df_binary_pl = pl.from_pandas(df_binary)

# Colliding right-hand columns receive the '_binary' suffix; the duplicated
# identifier/label copies are removed afterwards.
df_counts_binary = _drop_if_present(
    df_counts_pl.join(df_binary_pl, on='video', how='inner', suffix='_binary'),
    ['video_binary', 'adl_binary'],
).to_pandas()

# --- Counts + Binary + Active ----------------------------------------------
df_counts_active_pl = pl.from_pandas(df_counts_active)
df_binary_active_pl = pl.from_pandas(df_binary_active)

df_counts_active_binary = _drop_if_present(
    df_counts_active_pl.join(df_binary_active_pl, on='video', how='inner', suffix='_binary'),
    ['video_binary', 'adl_binary'],
).to_pandas()

# --- Active only -----------------------------------------------------------
# Remove the 'count_' columns from each source frame. The column list must be
# derived from the frame being dropped from: the previous version built the list
# for df_counts_active from df_binary_active.columns, which raises KeyError or
# leaves count columns behind whenever the two schemas differ.
# NOTE(review): if the schemas do differ, this changes the "Active" feature set,
# so the recorded results in the Active section should be regenerated.
df_active_from_binary = pl.from_pandas(
    df_binary_active.drop(columns=[col for col in df_binary_active.columns if 'count_' in col])
)
df_active_from_counts = pl.from_pandas(
    df_counts_active.drop(columns=[col for col in df_counts_active.columns if 'count_' in col])
)

df_active = _drop_if_present(
    df_active_from_binary.join(df_active_from_counts, on='video', how='inner', suffix='_counts'),
    ['video_counts', 'adl_counts'],
).to_pandas()


In [4]:
# Apply the project's row-wise min-max scaling to every feature configuration.
# Each name is rebound to the scaled result, in the same order as before.
(
    df_counts,
    df_counts_active,
    df_binary,
    df_binary_active,
    df_counts_binary,
    df_counts_active_binary,
    df_active,
) = [
    pr.row_wise_min_max_scaling(unscaled)
    for unscaled in (
        df_counts,
        df_counts_active,
        df_binary,
        df_binary_active,
        df_counts_binary,
        df_counts_active_binary,
        df_active,
    )
]


In [5]:
def get_models():
    """Build a fresh list of ``(display name, classifier)`` pairs.

    New estimator objects are constructed on every call, so each evaluation
    receives its own instances. Every estimator is seeded with ``SEED``.

    Returns:
        list[tuple[str, estimator]]: the five candidate classifiers, in a fixed order.
    """
    gradient_boosting = GradientBoostingClassifier(random_state=SEED)
    logistic_regression = LogisticRegression(
        max_iter=1000, random_state=SEED, class_weight='balanced'
    )
    # NOTE(review): scikit-learn documents `learning_rate` as only used when
    # solver='sgd'; no solver is set here, so confirm 'adaptive' has the intended effect.
    mlp = MLPClassifier(
        random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True
    )
    random_forest = RandomForestClassifier(random_state=SEED, class_weight='balanced')
    xgb_clf = XGBClassifier(random_state=SEED)

    return [
        ('Gradient Boosting', gradient_boosting),
        ('Logistic Regression', logistic_regression),
        ('MLP', mlp),
        ('Random Forest', random_forest),
        ('XGBoost', xgb_clf),
    ]

In [6]:
# Single LabelEncoder instance that is passed to every ev.evaluate_models call below.
label_encoder = LabelEncoder()

<a id="counts"></a>
## Counts

In [7]:
# Evaluate every candidate model on the counts-only features.
results_counts, results_counts_df = ev.evaluate_models(
    get_models(),
    df_counts,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
counts_table = ev.display_pct_table(results_counts_df)
counts_auc = [round(model_result.auc, 2) for model_result in results_counts]
counts_table["AUC"] = counts_auc

counts_table

2024-11-25 03:13:16,242 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-11-25 03:13:16,947 - root - INFO - LOGOCV complete for LogisticRegression
2024-11-25 03:13:25,791 - root - INFO - LOGOCV complete for MLPClassifier
2024-11-25 03:13:30,294 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-11-25 03:13:35,315 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.722577,0.662022,0.232936,0.81,0.91
1,LogisticRegression,0.739945,0.701867,0.140061,0.88,0.9
2,MLPClassifier,0.680514,0.654284,0.204407,0.81,0.88
3,RandomForestClassifier,0.733428,0.653191,0.219334,0.75,0.91
4,XGBClassifier,0.71396,0.669272,0.216977,0.81,0.91


<a id="binary"></a>
## Binary

In [8]:
# Evaluate every candidate model on the binary-presence features.
results_binary, results_binary_df = ev.evaluate_models(
    get_models(),
    df_binary,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
binary_table = ev.display_pct_table(results_binary_df)
binary_auc = [round(model_result.auc, 2) for model_result in results_binary]
binary_table["AUC"] = binary_auc

binary_table

2024-11-25 03:14:12,448 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-11-25 03:14:13,268 - root - INFO - LOGOCV complete for LogisticRegression
2024-11-25 03:14:18,388 - root - INFO - LOGOCV complete for MLPClassifier
2024-11-25 03:14:21,614 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-11-25 03:14:24,786 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.762649,0.697154,0.182118,0.81,0.91
1,LogisticRegression,0.762675,0.726209,0.151507,0.94,0.93
2,MLPClassifier,0.685295,0.647769,0.237041,0.81,0.91
3,RandomForestClassifier,0.709992,0.642909,0.240522,0.81,0.91
4,XGBClassifier,0.727006,0.691439,0.175654,0.88,0.91


<a id="active"></a>
## Active

In [9]:
# Evaluate every candidate model on the active-only features.
# Rows containing missing values are removed before evaluation.
df_active_complete = df_active.dropna()
results_active, results_active_df = ev.evaluate_models(
    get_models(),
    df_active_complete,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
active_table = ev.display_pct_table(results_active_df)
active_auc = [round(model_result.auc, 2) for model_result in results_active]
active_table["AUC"] = active_auc

active_table

2024-11-25 03:14:39,225 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-11-25 03:14:39,934 - root - INFO - LOGOCV complete for LogisticRegression
2024-11-25 03:14:43,400 - root - INFO - LOGOCV complete for MLPClassifier
2024-11-25 03:14:45,361 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-11-25 03:14:48,565 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.599107,0.579397,0.167414,0.81,0.86
1,LogisticRegression,0.65132,0.646201,0.154845,0.88,0.9
2,MLPClassifier,0.695214,0.63037,0.173636,0.81,0.88
3,RandomForestClassifier,0.633087,0.591964,0.175033,0.75,0.83
4,XGBClassifier,0.625893,0.594528,0.167676,0.81,0.86


<a id="counts-binary"></a>
## Counts + Binary

In [10]:
# Evaluate every candidate model on the combined counts + binary features.
results_counts_binary, results_counts_binary_df = ev.evaluate_models(
    get_models(),
    df_counts_binary,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
counts_binary_table = ev.display_pct_table(results_counts_binary_df)
counts_binary_auc = [round(model_result.auc, 2) for model_result in results_counts_binary]
counts_binary_table["AUC"] = counts_binary_auc

counts_binary_table

2024-11-25 03:16:56,609 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-11-25 03:16:57,742 - root - INFO - LOGOCV complete for LogisticRegression
2024-11-25 03:17:04,347 - root - INFO - LOGOCV complete for MLPClassifier
2024-11-25 03:17:10,132 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-11-25 03:17:18,524 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.748624,0.692531,0.205618,0.81,0.91
1,LogisticRegression,0.75632,0.724335,0.127021,0.94,0.91
2,MLPClassifier,0.722598,0.682213,0.182587,0.88,0.9
3,RandomForestClassifier,0.761328,0.686863,0.227894,0.81,0.9
4,XGBClassifier,0.702253,0.680697,0.171301,0.81,0.91


<a id="counts-active"></a>
## Counts + Active

In [11]:
# Evaluate every candidate model on the combined counts + active features.
results_counts_active, results_counts_active_df = ev.evaluate_models(
    get_models(),
    df_counts_active,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
counts_active_table = ev.display_pct_table(results_counts_active_df)
counts_active_auc = [round(model_result.auc, 2) for model_result in results_counts_active]
counts_active_table["AUC"] = counts_active_auc

counts_active_table

2024-11-25 03:18:39,685 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-11-25 03:18:40,404 - root - INFO - LOGOCV complete for LogisticRegression
2024-11-25 03:18:45,795 - root - INFO - LOGOCV complete for MLPClassifier
2024-11-25 03:18:49,834 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-11-25 03:18:56,156 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.750971,0.700636,0.195756,0.88,0.92
1,LogisticRegression,0.765218,0.732107,0.13237,0.94,0.91
2,MLPClassifier,0.746283,0.695789,0.175844,0.81,0.91
3,RandomForestClassifier,0.746982,0.679232,0.226991,0.81,0.92
4,XGBClassifier,0.738499,0.700303,0.195252,0.88,0.92


<a id="binary-active"></a>
## Binary + Active

In [12]:
# Evaluate every candidate model on the combined binary + active features.
results_binary_active, results_binary_active_df = ev.evaluate_models(
    get_models(),
    df_binary_active,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
binary_active_table = ev.display_pct_table(results_binary_active_df)
binary_active_auc = [round(model_result.auc, 2) for model_result in results_binary_active]
binary_active_table["AUC"] = binary_active_auc

binary_active_table

2024-11-25 03:19:40,128 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-11-25 03:19:41,623 - root - INFO - LOGOCV complete for LogisticRegression
2024-11-25 03:19:49,465 - root - INFO - LOGOCV complete for MLPClassifier
2024-11-25 03:19:52,634 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-11-25 03:19:56,573 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.764761,0.731548,0.182853,0.88,0.92
1,LogisticRegression,0.811589,0.784882,0.119884,1.0,0.94
2,MLPClassifier,0.768238,0.724324,0.199891,0.81,0.94
3,RandomForestClassifier,0.760144,0.684203,0.25108,0.81,0.93
4,XGBClassifier,0.799526,0.7462,0.17299,0.88,0.93


<a id="counts-binary-active"></a>
## Counts + Binary + Active

In [13]:
# Evaluate every candidate model on the full counts + binary + active features.
results_counts_active_binary, results_counts_active_binary_df = ev.evaluate_models(
    get_models(),
    df_counts_active_binary,
    label_encoder,
)

# Summary table, extended with each model's AUC rounded to two decimals.
counts_active_binary_table = ev.display_pct_table(results_counts_active_binary_df)
counts_active_binary_auc = [
    round(model_result.auc, 2) for model_result in results_counts_active_binary
]
counts_active_binary_table["AUC"] = counts_active_binary_auc

counts_active_binary_table

2024-11-25 03:22:24,516 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2024-11-25 03:22:27,290 - root - INFO - LOGOCV complete for LogisticRegression
2024-11-25 03:22:32,960 - root - INFO - LOGOCV complete for MLPClassifier
2024-11-25 03:22:38,176 - root - INFO - LOGOCV complete for RandomForestClassifier
2024-11-25 03:22:47,995 - root - INFO - LOGOCV complete for XGBClassifier


Unnamed: 0,model,median_f1,mean_f1,std_f1,pct_above_0.5,AUC
0,GradientBoostingClassifier,0.77712,0.714364,0.192927,0.81,0.93
1,LogisticRegression,0.800787,0.770628,0.126363,0.94,0.92
2,MLPClassifier,0.742926,0.704349,0.168386,0.88,0.91
3,RandomForestClassifier,0.74733,0.687557,0.231979,0.81,0.92
4,XGBClassifier,0.711012,0.702041,0.168767,0.88,0.93
