# Effect of Active Objects

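Here we explore whether including active-object features makes a difference to model performance. Each classifier is evaluated twice, once on the full feature set and once with every column whose name contains `active` removed, and the resulting metrics are compared.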

In [1]:
import os

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

import egoviz.models.evaluation as ev
import egoviz.models.processing as pr

# Single seed passed as random_state to every estimator in this notebook.
SEED = 42

# Raise the pandas row limit so long tables are shown in full (up to 300 rows).
pd.options.display.max_rows = 300

In [2]:
# Load the pickled predictions and build the two feature sets being compared.
cwd = os.getcwd()
data_path = os.path.join(cwd, '../data/home_data_all_preds_df.pkl')
df = pr.load_pickle(data_path)

# Full feature table as produced by generate_counts_df.
df_active = pr.generate_counts_df(df)

# Same table with every column whose name contains 'active' removed.
# NOTE(review): this is a substring match, so a column named e.g. 'inactive_*'
# would be dropped too -- confirm against the columns generate_counts_df emits.
active_columns = [name for name in df_active.columns if 'active' in name]
df_inactive = df_active.drop(columns=active_columns)

# Both feature sets go through the same row-wise min-max scaling.
df_active_scaled = pr.row_wise_min_max_scaling(df_active)
df_inactive_scaled = pr.row_wise_min_max_scaling(df_inactive)

In [3]:
# Encoder instance handed to ev.evaluate_models for both the active and the
# inactive evaluation runs.
# NOTE(review): presumably used there to encode the target labels -- confirm
# in egoviz.models.evaluation.
label_encoder = LabelEncoder()

In [4]:
# Keyword arguments shared by the estimators: every model is seeded, and the
# ones configured with class weighting use 'balanced'.
seeded = {'random_state': SEED}
balanced = {**seeded, 'class_weight': 'balanced'}

# (display name, estimator) pairs for the run that includes active objects.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, **balanced)),
    ('Random Forest', RandomForestClassifier(**balanced)),
    ('Gradient Boosting', GradientBoostingClassifier(**seeded)),
    ('XGBoost', XGBClassifier(**seeded)),
    ('SVM', SVC(**balanced)),
    ('MLP', MLPClassifier(learning_rate='adaptive', max_iter=1000, early_stopping=True, **seeded)),
]

# Evaluate every model on the scaled feature set that keeps the active columns.
results_active = ev.evaluate_models(models, df_active_scaled, label_encoder)

2023-12-04 00:08:23,782 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 00:08:32,714 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 00:11:32,245 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 00:11:41,105 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 00:11:42,673 - root - INFO - LOGOCV complete for SVC
2023-12-04 00:11:48,460 - root - INFO - LOGOCV complete for MLPClassifier


In [5]:
# Keyword arguments shared by the estimators: every model is seeded, and the
# ones configured with class weighting use 'balanced'.
seeded = {'random_state': SEED}
balanced = {**seeded, 'class_weight': 'balanced'}

# New estimator instances, configured as in the active run, for the run
# without active-object columns.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, **balanced)),
    ('Random Forest', RandomForestClassifier(**balanced)),
    ('Gradient Boosting', GradientBoostingClassifier(**seeded)),
    ('XGBoost', XGBClassifier(**seeded)),
    ('SVM', SVC(**balanced)),
    ('MLP', MLPClassifier(learning_rate='adaptive', max_iter=1000, early_stopping=True, **seeded)),
]

# Evaluate every model on the scaled feature set with the active columns dropped.
results_inactive = ev.evaluate_models(models, df_inactive_scaled, label_encoder)

2023-12-04 00:11:48,903 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 00:12:02,211 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 00:14:09,875 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 00:14:16,686 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 00:14:17,812 - root - INFO - LOGOCV complete for SVC
2023-12-04 00:14:20,951 - root - INFO - LOGOCV complete for MLPClassifier


In [6]:
# Per-model median metrics for the run with active objects. Element 1 of the
# evaluate_models result is the frame holding the median_* and model columns;
# the first value in each model's group is shown.
median_columns = ['median_accuracy', 'median_precision', 'median_recall', 'median_f1']
active_scores = results_active[1]
active_scores[median_columns + ['model']].groupby('model').first().reset_index()

Unnamed: 0,model,median_accuracy,median_precision,median_recall,median_f1
0,GradientBoostingClassifier,0.721436,0.490829,0.667125,0.525849
1,LogisticRegression,0.723444,0.507118,0.659284,0.452529
2,MLPClassifier,0.709701,0.564599,0.640061,0.460071
3,RandomForestClassifier,0.729113,0.57222,0.622652,0.456153
4,SVC,0.671358,0.485208,0.628113,0.4665
5,XGBClassifier,0.689936,0.48495,0.634075,0.542522


In [7]:
# F1 summary per model for the run with active objects.
active_f1 = results_active[1][['f1', 'median_f1', 'model']]
active_f1.groupby('model').agg(
    # first median F1 value in the model's group
    median_f1=pd.NamedAgg(column='median_f1', aggfunc='first'),
    # share of F1 scores strictly above 0.5, rounded to two decimals
    # (a fraction in [0, 1], despite the column name)
    percentage_above_05=pd.NamedAgg(
        column='f1',
        aggfunc=lambda scores: round(int((scores > 0.5).sum()) / len(scores), 2),
    ),
)

Unnamed: 0_level_0,median_f1,percentage_above_05
model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoostingClassifier,0.525849,0.56
LogisticRegression,0.452529,0.31
MLPClassifier,0.460071,0.44
RandomForestClassifier,0.456153,0.44
SVC,0.4665,0.38
XGBClassifier,0.542522,0.56


In [8]:
# Per-model median metrics for the run without active objects. Element 1 of the
# evaluate_models result is the frame holding the median_* and model columns;
# the first value in each model's group is shown.
median_columns = ['median_accuracy', 'median_precision', 'median_recall', 'median_f1']
inactive_scores = results_inactive[1]
inactive_scores[median_columns + ['model']].groupby('model').first().reset_index()

Unnamed: 0,model,median_accuracy,median_precision,median_recall,median_f1
0,GradientBoostingClassifier,0.685478,0.464561,0.678255,0.500595
1,LogisticRegression,0.656225,0.495522,0.642054,0.420096
2,MLPClassifier,0.636875,0.529189,0.580142,0.45315
3,RandomForestClassifier,0.6971,0.594783,0.562562,0.429738
4,SVC,0.591837,0.464831,0.627255,0.45934
5,XGBClassifier,0.644771,0.500169,0.63984,0.483414


In [9]:
# F1 summary per model for the run without active objects.
inactive_f1 = results_inactive[1][['f1', 'median_f1', 'model']]
inactive_f1.groupby('model').agg(
    # first median F1 value in the model's group
    median_f1=pd.NamedAgg(column='median_f1', aggfunc='first'),
    # share of F1 scores strictly above 0.5, rounded to two decimals
    # (a fraction in [0, 1], despite the column name)
    percentage_above_05=pd.NamedAgg(
        column='f1',
        aggfunc=lambda scores: round(int((scores > 0.5).sum()) / len(scores), 2),
    ),
)

Unnamed: 0_level_0,median_f1,percentage_above_05
model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoostingClassifier,0.500595,0.5
LogisticRegression,0.420096,0.38
MLPClassifier,0.45315,0.38
RandomForestClassifier,0.429738,0.38
SVC,0.45934,0.25
XGBClassifier,0.483414,0.44


### Conclusions

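All six models achieve higher median accuracy and higher median F1 when active objects are included. The improvement is not uniform across every metric, however: median precision is slightly lower for Random Forest and XGBoost, median recall is slightly lower for Gradient Boosting and XGBoost, and Logistic Regression has a smaller share of F1 scores above 0.5 (0.31 vs. 0.38) with active objects. Overall, including active objects improves performance, most consistently in accuracy and F1.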