# Ablation Study

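This notebook is an ablation study over the input feature sets. We train the same set of classifiers (Logistic Regression, Random Forest, Gradient Boosting, XGBoost, and an MLP) on each feature configuration and compare the results. The configurations are: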
- [Counts](#counts)
- [Binary](#binary)
- [Active](#active)
- [Counts + Binary](#counts-binary)
- [Counts + Active](#counts-active)
- [Binary + Active](#binary-active)
- [Counts + Binary + Active](#counts-binary-active)

In [2]:
import os

import numpy as np
import pandas as pd
import polars as pl

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

import egoviz.models.evaluation as ev
import egoviz.models.processing as pr
import egoviz.visualize as viz

# Seed handed to every estimator's random_state in get_models().
SEED = 42

# Directory the kernel was started in; the data paths below are built from it.
cwd = os.getcwd()

# Let pandas print up to 300 rows before truncating a displayed frame.
pd.set_option("display.max_rows", 300)

In [14]:
# load data
# NOTE(review): pr.load_pickle presumably unpickles these files — only point it
# at trusted data.
binary_pkl_path = os.path.join(cwd, '../data/home_data_all_preds_df.pkl')
counts_pkl_path = os.path.join(cwd, '../data/home_data_all_preds.pkl')
raw_binary = pr.load_pickle(binary_pkl_path)
raw_counts = pr.load_pickle(counts_pkl_path)

# process data
# Counts table first; the counts-only variant is the same frame minus every
# column whose name contains 'active'.
preds_df = pr.generate_df_from_preds(raw_counts)
df_counts_active = pr.generate_counts_df(preds_df)
counts_active_cols = [name for name in df_counts_active.columns if 'active' in name]
df_counts = df_counts_active.drop(columns=counts_active_cols)

# Same treatment for the binary-presence table.
df_binary_active = pr.generate_binary_presence_df(raw_binary)
binary_active_cols = [name for name in df_binary_active.columns if 'active' in name]
df_binary = df_binary_active.drop(columns=binary_active_cols)


In [38]:
# Build the combined feature tables. Each combination is an inner join of two
# per-video frames on 'video' (done in polars, converted back to pandas);
# columns that exist on both sides get a suffix on the right-hand copy, and the
# duplicated identifier/label columns are dropped afterwards.
# NOTE(review): 'video' is the join key, so a suffixed 'video_binary' /
# 'video_counts' column may not exist after the join — confirm that drop()
# tolerates a missing column on the installed polars version.

# Counts + Binary
df_counts_pl = pl.from_pandas(df_counts)
df_binary_pl = pl.from_pandas(df_binary)

df_counts_binary = df_counts_pl.join(df_binary_pl, on='video', how='inner', suffix='_binary').drop(['video_binary', 'adl_binary']).to_pandas()

# Counts + Binary + Active
df_counts_active_pl = pl.from_pandas(df_counts_active)
df_binary_active_pl = pl.from_pandas(df_binary_active)

df_counts_active_binary = df_counts_active_pl.join(df_binary_active_pl, on='video', how='inner', suffix='_binary').drop(['video_binary', 'adl_binary']).to_pandas()

# Active only: remove every 'count_' column from each *_active frame.
# Each drop list is built from the frame it is applied to. Previously the list
# for df_counts_active was built from df_binary_active.columns, which is only
# correct when both frames use identical 'count_' column names.
df_active_from_binary = pl.from_pandas(df_binary_active.drop(columns=[col for col in df_binary_active.columns if 'count_' in col]))
df_active_from_counts = pl.from_pandas(df_counts_active.drop(columns=[col for col in df_counts_active.columns if 'count_' in col]))

df_active = df_active_from_binary.join(df_active_from_counts, on='video', how='inner', suffix='_counts').drop(['video_counts', 'adl_counts']).to_pandas()

In [None]:
# scale data
# Every feature table goes through pr.row_wise_min_max_scaling and is rebound
# to its own name, so from here on the unscaled versions are no longer
# reachable under these names.
(
    df_counts,
    df_counts_active,
    df_binary,
    df_binary_active,
    df_counts_binary,
    df_counts_active_binary,
    df_active,
) = [
    pr.row_wise_min_max_scaling(frame)
    for frame in (
        df_counts,
        df_counts_active,
        df_binary,
        df_binary_active,
        df_counts_binary,
        df_counts_active_binary,
        df_active,
    )
]

In [None]:
def get_models():
    """Build the classifiers compared in every ablation configuration.

    New estimator objects are constructed on each call, so each configuration
    can be evaluated with its own instances.

    Returns:
        list[tuple[str, estimator]]: (display name, estimator) pairs, in the
        order Logistic Regression, Random Forest, Gradient Boosting, XGBoost,
        MLP. Every estimator is seeded with SEED.
    """
    # Only these two are given class_weight='balanced'.
    logistic_regression = LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced')
    random_forest = RandomForestClassifier(random_state=SEED, class_weight='balanced')

    gradient_boosting = GradientBoostingClassifier(random_state=SEED)
    xgboost_clf = XGBClassifier(random_state=SEED)

    mlp = MLPClassifier(random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True)

    named_models = [
        ('Logistic Regression', logistic_regression),
        ('Random Forest', random_forest),
        ('Gradient Boosting', gradient_boosting),
        ('XGBoost', xgboost_clf),
        ('MLP', mlp),
    ]
    return named_models

In [None]:
# One LabelEncoder instance, passed to each ev.evaluate_models call below.
# NOTE(review): whether evaluate_models refits it on every call is not visible
# from this notebook — confirm in egoviz.models.evaluation.
label_encoder = LabelEncoder()

<a id="counts"></a>
## Counts

In [None]:
# Evaluate a fresh set of models on the counts-only feature table.
results_counts, results_counts_df = ev.evaluate_models(
    get_models(),
    df_counts,
    label_encoder,
)

In [None]:
# Show the Counts results table (presumably with metrics formatted as
# percentages — confirm in egoviz.models.evaluation).
ev.display_pct_table(results_counts_df)

<a id="binary"></a>
## Binary

In [None]:
# Evaluate a fresh set of models on the binary-presence feature table.
results_binary, results_binary_df = ev.evaluate_models(
    get_models(),
    df_binary,
    label_encoder,
)

<a id="active"></a>
## Active

<a id="counts-binary"></a>
## Counts + Binary

<a id="counts-active"></a>
## Counts + Active

<a id="binary-active"></a>
## Binary + Active

<a id="counts-binary-active"></a>
## Counts + Binary + Active