# Active Weights

Explore the difference between computing active object weights vs. separate columns for active object counts.

In [1]:
import os

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

import egoviz.models.evaluation as ev
import egoviz.models.processing as pr

# Seed passed as random_state to every estimator built in this notebook.
SEED = 42

# Upper bound on the number of DataFrame rows pandas renders in cell output.
DISPLAY_MAX_ROWS = 300
pd.set_option("display.max_rows", DISPLAY_MAX_ROWS)

### Load Data and Generate Dataframes

In [2]:
# Locate the pickled predictions relative to the notebook's working directory.
# NOTE(review): pr.load_pickle presumably unpickles this file -- only point it at trusted data.
cwd = os.getcwd()
predictions_path = os.path.join(cwd, '../data/home_data_all_preds_df.pkl')
df = pr.load_pickle(predictions_path)

# Baseline feature table built with the default (unweighted) settings.
df_counts = pr.generate_counts_df(df)

# Weighted variants, generated in the order weight=2, 5, 10.
# Presumably `weight` up-weights active objects -- confirm in pr.generate_counts_df.
df_weight_2, df_weight_5, df_weight_10 = (
    pr.generate_counts_df(df, weighted=True, weight=active_weight)
    for active_weight in (2, 5, 10)
)

# Row-wise min-max scaling; only the baseline and the weight=10 tables are scaled in this cell.
df_counts_scaled, df_weight_10_scaled = map(
    pr.row_wise_min_max_scaling, (df_counts, df_weight_10)
)

### Evaluate Models

In [3]:
# Create the label encoder that is handed to ev.evaluate_models in the cells below;
# the same instance is passed to both the counts run and the weight-10 run.
label_encoder = LabelEncoder()

In [4]:
# Keyword arguments shared by the estimators that are built with class re-weighting.
balanced_kwargs = {'random_state': SEED, 'class_weight': 'balanced'}

# (display name, estimator) pairs handed to ev.evaluate_models.
# Gradient Boosting, XGBoost and the MLP are constructed without class_weight.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, **balanced_kwargs)),
    ('Random Forest', RandomForestClassifier(**balanced_kwargs)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=SEED)),
    ('XGBoost', XGBClassifier(random_state=SEED)),
    ('SVM', SVC(**balanced_kwargs)),
    ('MLP', MLPClassifier(random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True)),
]

# Baseline run on the scaled, unweighted counts table.
results_counts = ev.evaluate_models(models, df_counts_scaled, label_encoder)

2023-12-04 00:07:51,859 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 00:08:00,423 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 00:10:51,572 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 00:10:59,712 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 00:11:01,336 - root - INFO - LOGOCV complete for SVC
2023-12-04 00:11:07,808 - root - INFO - LOGOCV complete for MLPClassifier


In [5]:
# Build a fresh set of estimator instances for this run, so no estimator object
# is shared with the list used for the counts evaluation.
model_names = ['Logistic Regression', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'SVM', 'MLP']
estimators = [
    LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced'),
    RandomForestClassifier(random_state=SEED, class_weight='balanced'),
    GradientBoostingClassifier(random_state=SEED),
    XGBClassifier(random_state=SEED),
    SVC(random_state=SEED, class_weight='balanced'),
    MLPClassifier(random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True),
]

# Same (display name, estimator) pair layout as expected by ev.evaluate_models.
models = list(zip(model_names, estimators))

# Second run on the scaled weight=10 table, using the same label encoder.
results_weight_10 = ev.evaluate_models(models, df_weight_10_scaled, label_encoder)

2023-12-04 00:11:08,382 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 00:11:23,296 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 00:13:51,411 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 00:13:58,497 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 00:13:59,569 - root - INFO - LOGOCV complete for SVC
2023-12-04 00:14:02,451 - root - INFO - LOGOCV complete for MLPClassifier


### Compare Models

In [6]:
# Per-model median precision / recall / F1 for the counts run.
# Element 1 of the evaluate_models result is used as the score table; the first
# row per model is taken (presumably the median_* values repeat within a model -- TODO confirm).
summary_columns = ['median_precision', 'median_recall', 'median_f1', 'model']
counts_scores = results_counts[1]
(
    counts_scores[summary_columns]
    .groupby('model')
    .first()
    .reset_index()
)

Unnamed: 0,model,median_precision,median_recall,median_f1
0,GradientBoostingClassifier,0.490829,0.667125,0.524714
1,LogisticRegression,0.507118,0.659284,0.452529
2,MLPClassifier,0.560491,0.641847,0.452407
3,RandomForestClassifier,0.575353,0.617427,0.528003
4,SVC,0.485208,0.628113,0.4665
5,XGBClassifier,0.48495,0.634075,0.542522


In [7]:
def share_above_half(f1_scores):
    """Return the fraction (0-1) of scores strictly greater than 0.5, rounded to 2 decimals."""
    n_above = int((f1_scores > 0.5).sum())
    return round(n_above / len(f1_scores), 2)


# Per model: the first median F1 value, and the share of individual F1 scores above 0.5.
# The output column keeps its name `percentage_above_05`, although the value is a fraction.
results_counts[1][['f1', 'median_f1', 'model']].groupby('model').agg(
    median_f1=('median_f1', 'first'),
    percentage_above_05=('f1', share_above_half),
)

Unnamed: 0_level_0,median_f1,percentage_above_05
model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoostingClassifier,0.524714,0.56
LogisticRegression,0.452529,0.31
MLPClassifier,0.452407,0.38
RandomForestClassifier,0.528003,0.56
SVC,0.4665,0.38
XGBClassifier,0.542522,0.56


In [8]:
# Per-model median precision / recall / F1 for the weight=10 run.
# Element 1 of the evaluate_models result is used as the score table; the first
# row per model is taken (presumably the median_* values repeat within a model -- TODO confirm).
weighted_scores = results_weight_10[1]
weighted_by_model = weighted_scores.loc[
    :, ['median_precision', 'median_recall', 'median_f1', 'model']
].groupby('model')
weighted_by_model.first().reset_index()

Unnamed: 0,model,median_precision,median_recall,median_f1
0,GradientBoostingClassifier,0.48422,0.656718,0.559557
1,LogisticRegression,0.494314,0.698342,0.524217
2,MLPClassifier,0.537698,0.672185,0.503334
3,RandomForestClassifier,0.515892,0.628193,0.503513
4,SVC,0.494899,0.697204,0.452942
5,XGBClassifier,0.476995,0.668981,0.487353


In [9]:
# Group the weight=10 score table (element 1 of the evaluate_models result) by model.
f1_columns = ['f1', 'median_f1', 'model']
weighted_f1_by_model = results_weight_10[1][f1_columns].groupby('model')

weighted_f1_by_model.agg(
    # first median F1 value recorded for the model
    median_f1=pd.NamedAgg(column='median_f1', aggfunc='first'),
    # fraction (0-1, rounded to 2 decimals) of F1 scores strictly above 0.5;
    # the column keeps its name `percentage_above_05`
    percentage_above_05=pd.NamedAgg(
        column='f1',
        aggfunc=lambda scores: round(int((scores > 0.5).sum()) / len(scores), 2),
    ),
)

Unnamed: 0_level_0,median_f1,percentage_above_05
model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoostingClassifier,0.559557,0.56
LogisticRegression,0.524217,0.62
MLPClassifier,0.503334,0.5
RandomForestClassifier,0.503513,0.56
SVC,0.452942,0.44
XGBClassifier,0.487353,0.5


### Conclusions

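For the most part, the models perform similarly regardless of whether we use active weights or active counts, with the results slightly favoring the active weights: median F1 improves for Gradient Boosting, Logistic Regression, and MLP, while Random Forest and SVM dip slightly. The exception is the XGBoost model, which performs noticeably better without active weights (median F1 of 0.543 vs. 0.487). Note that these differences have not been tested for statistical significance.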