In [1]:
import os

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

import egoviz.models.evaluation as ev
import egoviz.models.processing as pr

# Single random seed handed to every estimator constructed in this notebook.
SEED = 42

# Let pandas render up to 300 rows before truncating a displayed frame.
pd.options.display.max_rows = 300

In [4]:
# Load the pickled input through pr.load_pickle; the path is resolved
# relative to the current working directory.
cwd = os.getcwd()
df = pr.load_pickle(os.path.join(cwd, '../data/home_data_all_preds_df.pkl'))

# Build one weighted counts frame per weight setting (10, 25, 50, 100),
# in that order.
df_weight_10, df_weight_25, df_weight_50, df_weight_100 = (
    pr.generate_counts_df(df, weighted=True, weight=w)
    for w in (10, 25, 50, 100)
)

# Apply pr.row_wise_min_max_scaling to each counts frame, again in weight order.
(
    df_weight_10_scaled,
    df_weight_25_scaled,
    df_weight_50_scaled,
    df_weight_100_scaled,
) = (
    pr.row_wise_min_max_scaling(counts)
    for counts in (df_weight_10, df_weight_25, df_weight_50, df_weight_100)
)

In [3]:
# prepare data for evaluation
# A single LabelEncoder instance is created here and passed to every
# ev.evaluate_models call below.
# NOTE(review): presumably evaluate_models fits it on the target labels
# itself — confirm against egoviz.models.evaluation.
label_encoder = LabelEncoder()

In [6]:
# Candidate classifiers as (display name, unfitted estimator) pairs.
# Every estimator is seeded with SEED; Logistic Regression and Random Forest
# additionally use class_weight='balanced'.
# NOTE(review): per the scikit-learn docs, MLPClassifier only uses
# learning_rate when solver='sgd'; with the default solver 'adaptive' has no
# effect — confirm this is intended.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced')),
    ('Random Forest', RandomForestClassifier(random_state=SEED, class_weight='balanced')),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=SEED)),
    ('XGBoost', XGBClassifier(random_state=SEED)),
    ('MLP', MLPClassifier(random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True))
]

# Evaluate on the weight=10 scaled frame so that results_weight_10 actually
# holds the weight=10 experiment.
results_weight_10 = ev.evaluate_models(models, df_weight_10_scaled, label_encoder)

2023-12-04 01:32:00,307 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 01:32:19,037 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 01:36:09,797 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 01:37:02,554 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 01:37:15,809 - root - INFO - LOGOCV complete for MLPClassifier


In [7]:
# Fresh (display name, unfitted estimator) pairs for the weight=25 run.
# Every estimator is seeded with SEED; Logistic Regression and Random Forest
# additionally use class_weight='balanced'.
# NOTE(review): per the scikit-learn docs, MLPClassifier only uses
# learning_rate when solver='sgd'; with the default solver 'adaptive' has no
# effect — confirm this is intended.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced')),
    ('Random Forest', RandomForestClassifier(random_state=SEED, class_weight='balanced')),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=SEED)),
    ('XGBoost', XGBClassifier(random_state=SEED)),
    ('MLP', MLPClassifier(random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True))
]

# Evaluate on the weight=25 scaled frame so that results_weight_25 actually
# holds the weight=25 experiment.
results_weight_25 = ev.evaluate_models(models, df_weight_25_scaled, label_encoder)

2023-12-04 01:37:17,717 - root - INFO - LOGOCV complete for LogisticRegression
2023-12-04 01:37:34,386 - root - INFO - LOGOCV complete for RandomForestClassifier
2023-12-04 04:42:14,966 - root - INFO - LOGOCV complete for GradientBoostingClassifier
2023-12-04 07:12:28,142 - root - INFO - LOGOCV complete for XGBClassifier
2023-12-04 07:12:45,856 - root - INFO - LOGOCV complete for MLPClassifier


In [None]:
# Model line-up for the weight=50 run: display names zipped with freshly
# constructed, unfitted estimators into a list of (name, estimator) tuples.
# NOTE(review): per the scikit-learn docs, MLPClassifier only uses
# learning_rate when solver='sgd' — confirm 'adaptive' is intended here.
models = list(zip(
    ('Logistic Regression', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'MLP'),
    (
        LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced'),
        RandomForestClassifier(random_state=SEED, class_weight='balanced'),
        GradientBoostingClassifier(random_state=SEED),
        XGBClassifier(random_state=SEED),
        MLPClassifier(
            random_state=SEED,
            learning_rate='adaptive',
            max_iter=1000,
            early_stopping=True,
        ),
    ),
))

# Run the evaluation on the weight=50 scaled frame.
results_weight_50 = ev.evaluate_models(models, df_weight_50_scaled, label_encoder)

In [None]:
# Model line-up for the weight=100 run. The dict keeps insertion order, so
# list(...items()) yields the same (name, estimator) tuples in the same order.
# NOTE(review): per the scikit-learn docs, MLPClassifier only uses
# learning_rate when solver='sgd' — confirm 'adaptive' is intended here.
models = list({
    'Logistic Regression': LogisticRegression(
        max_iter=1000, random_state=SEED, class_weight='balanced'
    ),
    'Random Forest': RandomForestClassifier(random_state=SEED, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=SEED),
    'XGBoost': XGBClassifier(random_state=SEED),
    'MLP': MLPClassifier(
        random_state=SEED, learning_rate='adaptive', max_iter=1000, early_stopping=True
    ),
}.items())

# Run the evaluation on the weight=100 scaled frame.
results_weight_100 = ev.evaluate_models(models, df_weight_100_scaled, label_encoder)

In [None]:
# Per-model summary of the results_weight_10 run, built from element [1] of
# the evaluate_models result (presumably a per-fold scores frame — TODO confirm).
(
    results_weight_10[1][['f1', 'median_f1', 'model']]
    .groupby('model')
    .agg(
        # first median_f1 value found in each model's group
        median_f1=('median_f1', 'first'),
        # fraction (0-1, rounded to 2 decimals) of the group's f1 scores above 0.5
        percentage_above_05=(
            'f1',
            lambda scores: round(int((scores > 0.5).sum()) / len(scores), 2),
        ),
    )
)

In [None]:
# Per-model summary of the results_weight_25 run, built from element [1] of
# the evaluate_models result (presumably a per-fold scores frame — TODO confirm).
(
    results_weight_25[1][['f1', 'median_f1', 'model']]
    .groupby('model')
    .agg(
        # first median_f1 value found in each model's group
        median_f1=('median_f1', 'first'),
        # fraction (0-1, rounded to 2 decimals) of the group's f1 scores above 0.5
        percentage_above_05=(
            'f1',
            lambda scores: round(int((scores > 0.5).sum()) / len(scores), 2),
        ),
    )
)

In [None]:
# Per-model summary of the results_weight_50 run, built from element [1] of
# the evaluate_models result (presumably a per-fold scores frame — TODO confirm).
(
    results_weight_50[1][['f1', 'median_f1', 'model']]
    .groupby('model')
    .agg(
        # first median_f1 value found in each model's group
        median_f1=('median_f1', 'first'),
        # fraction (0-1, rounded to 2 decimals) of the group's f1 scores above 0.5
        percentage_above_05=(
            'f1',
            lambda scores: round(int((scores > 0.5).sum()) / len(scores), 2),
        ),
    )
)

In [None]:
# Per-model summary of the results_weight_100 run, built from element [1] of
# the evaluate_models result (presumably a per-fold scores frame — TODO confirm).
(
    results_weight_100[1][['f1', 'median_f1', 'model']]
    .groupby('model')
    .agg(
        # first median_f1 value found in each model's group
        median_f1=('median_f1', 'first'),
        # fraction (0-1, rounded to 2 decimals) of the group's f1 scores above 0.5
        percentage_above_05=(
            'f1',
            lambda scores: round(int((scores > 0.5).sum()) / len(scores), 2),
        ),
    )
)