In [1]:
import os

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

import egoviz.models.evaluation as ev
import egoviz.models.processing as pr

SEED = 42

pd.set_option("display.max_rows", 300)

In [2]:
# load and process data
cwd = os.getcwd()
df = pr.load_pickle(os.path.join(cwd, '../data/home_data_all_preds_df.pkl'))

df_weight_10 = pr.generate_counts_df(df, weighted=True, weight=10)
df_weight_25 = pr.generate_counts_df(df, weighted=True, weight=25)
df_weight_50 = pr.generate_counts_df(df, weighted=True, weight=50)
df_weight_100 = pr.generate_counts_df(df, weighted=True, weight=100)


# scale data
df_weight_10_scaled = pr.row_wise_min_max_scaling(df_weight_10)
df_weight_25_scaled = pr.row_wise_min_max_scaling(df_weight_25)
df_weight_50_scaled = pr.row_wise_min_max_scaling(df_weight_50)
df_weight_100_scaled = pr.row_wise_min_max_scaling(df_weight_100)

In [3]:
# prepare data for evaluation
label_encoder = LabelEncoder()

In [4]:
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced')),
    ('Random Forest', RandomForestClassifier(random_state=SEED, class_weight='balanced')),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=SEED)),
    ('XGBoost', XGBClassifier(random_state=SEED)),
    ('MLP', MLPClassifier(random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True))
]

results_weight_10 = ev.evaluate_models(models, df_weight_50_scaled, label_encoder)

2023-12-04 08:17:44,271 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 08:17:54,783 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 08:20:27,192 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 08:20:34,261 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 08:20:36,983 - root - INFO - LOGOCV complete for MLPClassifier


In [5]:
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced')),
    ('Random Forest', RandomForestClassifier(random_state=SEED, class_weight='balanced')),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=SEED)),
    ('XGBoost', XGBClassifier(random_state=SEED)),
    ('MLP', MLPClassifier(random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True))
]

results_weight_25 = ev.evaluate_models(models, df_weight_50_scaled, label_encoder)

2023-12-04 08:20:37,368 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 08:20:47,863 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 08:23:20,579 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 08:23:27,564 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 08:23:30,300 - root - INFO - LOGOCV complete for MLPClassifier


In [6]:
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced')),
    ('Random Forest', RandomForestClassifier(random_state=SEED, class_weight='balanced')),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=SEED)),
    ('XGBoost', XGBClassifier(random_state=SEED)),
    ('MLP', MLPClassifier(random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True))
]

results_weight_50 = ev.evaluate_models(models, df_weight_50_scaled, label_encoder)

2023-12-04 08:23:30,701 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 08:23:41,081 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 08:26:13,091 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 08:26:19,786 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 08:26:22,473 - root - INFO - LOGOCV complete for MLPClassifier


In [7]:
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced')),
    ('Random Forest', RandomForestClassifier(random_state=SEED, class_weight='balanced')),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=SEED)),
    ('XGBoost', XGBClassifier(random_state=SEED)),
    ('MLP', MLPClassifier(random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True))
]

results_weight_100 = ev.evaluate_models(models, df_weight_100_scaled, label_encoder)

2023-12-04 08:26:22,865 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 08:26:33,436 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 08:29:09,979 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 08:29:16,964 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 08:29:19,456 - root - INFO - LOGOCV complete for MLPClassifier


In [8]:
results_weight_10[1][['f1', 'median_f1', 'model']].groupby('model').agg(
    # get the first median f1 score
    median_f1=('median_f1', 'first'),
    # get the percentage of f1 scores that are above 0.5
    percentage_above_05=('f1', lambda x: round(len(x[x > 0.5]) / len(x), 2)),
)

Unnamed: 0_level_0,median_f1,percentage_above_05
model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoostingClassifier,0.502764,0.5
LogisticRegression,0.482811,0.5
MLPClassifier,0.434088,0.44
RandomForestClassifier,0.480295,0.5
XGBClassifier,0.458956,0.44


In [9]:
results_weight_25[1][['f1', 'median_f1', 'model']].groupby('model').agg(
    # get the first median f1 score
    median_f1=('median_f1', 'first'),
    # get the percentage of f1 scores that are above 0.5
    percentage_above_05=('f1', lambda x: round(len(x[x > 0.5]) / len(x), 2)),
)

Unnamed: 0_level_0,median_f1,percentage_above_05
model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoostingClassifier,0.502764,0.5
LogisticRegression,0.482811,0.5
MLPClassifier,0.434088,0.44
RandomForestClassifier,0.480295,0.5
XGBClassifier,0.458956,0.44


In [10]:
results_weight_50[1][['f1', 'median_f1', 'model']].groupby('model').agg(
    # get the first median f1 score
    median_f1=('median_f1', 'first'),
    # get the percentage of f1 scores that are above 0.5
    percentage_above_05=('f1', lambda x: round(len(x[x > 0.5]) / len(x), 2)),
)

Unnamed: 0_level_0,median_f1,percentage_above_05
model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoostingClassifier,0.502764,0.5
LogisticRegression,0.482811,0.5
MLPClassifier,0.434088,0.44
RandomForestClassifier,0.480295,0.5
XGBClassifier,0.458956,0.44


In [11]:
results_weight_100[1][['f1', 'median_f1', 'model']].groupby('model').agg(
    # get the first median f1 score
    median_f1=('median_f1', 'first'),
    # get the percentage of f1 scores that are above 0.5
    percentage_above_05=('f1', lambda x: round(len(x[x > 0.5]) / len(x), 2)),
)

Unnamed: 0_level_0,median_f1,percentage_above_05
model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoostingClassifier,0.528476,0.56
LogisticRegression,0.477762,0.38
MLPClassifier,0.434802,0.38
RandomForestClassifier,0.510054,0.5
XGBClassifier,0.470796,0.44
