# 1. Imports and Data Loading

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score

from basic_functions import data_load, evaluate_model, save_df

In [2]:
# Read the four pre-split CSV files (train/test features and the clustered
# train/test targets) in a fixed order. In the recorded run each data_load call
# printed "Data loaded!"; what it returns is not visible here -- presumably a
# pandas DataFrame, TODO confirm against basic_functions.
X_train, y_train, X_test, y_test = (
    data_load(csv_path)
    for csv_path in (
        "data/X_train.csv",
        "data/y_train_clustered.csv",
        "data/X_test.csv",
        "data/y_test_clustered.csv",
    )
)


Data loaded!
Data loaded!
Data loaded!
Data loaded!


In [3]:
# Derive "job_major" as the first character of the stringified "job" value.
# NOTE(review): the column is added to X_train only (X_test is left without it)
# and no later cell visible in this notebook reads it -- confirm it is needed.
X_train["job_major"] = X_train["job"].astype(str).str.get(0)

In [4]:
# Empty container for evaluation results. Despite the "df_" prefix this is a
# plain dict, and no later cell visible in this notebook reads or fills it.
df_evaluation = dict()

# 2. Random Guesser

In [5]:
# --- Random Guess (uniform) ---
# Chance-level baseline. DummyClassifier ignores the feature values: with
# strategy="uniform" it draws every predicted label uniformly at random from the
# classes seen in y_train. random_state=42 pins those draws so the reported
# scores are reproducible on a fresh kernel.
# DummyClassifier comes from the notebook's single import cell at the top.
random_clf = DummyClassifier(strategy="uniform", random_state=42)
random_clf.fit(X_train, y_train)

y_pred_random = random_clf.predict(X_test)
# For the uniform strategy every class receives the same probability
# (1 / n_classes), which is why the recorded log-loss equals ln(7) ~= 1.9459
# for the seven classes in the report.
y_probs_random = random_clf.predict_proba(X_test)

# evaluate_model prints the metrics and classification report shown in the
# output; its return value is concatenated with pd.concat in the comparison
# section, so it is presumably a one-row DataFrame -- TODO confirm.
random_df = evaluate_model(
    y_pred=y_pred_random,
    y_test=y_test,
    model_name="Random_Guess",
    y_proba=y_probs_random,
)

Random_Guess:
Accuracy: 0.13199910051720262
Macro-F1: 0.12105423894465372
Weighted-F1: 0.14265139338547475
Log-Loss: 1.945910149055314
              precision    recall  f1-score   support

           0       0.10      0.13      0.11       498
           1       0.27      0.14      0.18      1243
           2       0.14      0.13      0.14       641
           3       0.20      0.14      0.16       940
           4       0.09      0.12      0.10       458
           5       0.11      0.14      0.12       498
           6       0.02      0.07      0.03       169

    accuracy                           0.13      4447
   macro avg       0.13      0.12      0.12      4447
weighted avg       0.17      0.13      0.14      4447



# 3. Majority Guesser

In [6]:
# --- Majority Class ---
# Majority-class baseline. DummyClassifier(strategy="most_frequent") ignores the
# feature values and always predicts the most frequent label in y_train (class 1
# in the recorded run, recall 1.00). DummyClassifier is already in scope from an
# earlier import in this notebook, so it is not re-imported here.
majority_clf = DummyClassifier(strategy="most_frequent")
majority_clf.fit(X_train, y_train)

y_pred_majority = majority_clf.predict(X_test)
# For "most_frequent", predict_proba is a one-hot vector on the majority class,
# so every non-majority sample is scored as a fully confident miss -- hence the
# very large log-loss (~25.97) in the recorded output.
y_probs_majority = majority_clf.predict_proba(X_test)

# evaluate_model prints the metrics and classification report shown in the
# output; its return value is concatenated with pd.concat in the comparison
# section, so it is presumably a one-row DataFrame -- TODO confirm.
majority_df = evaluate_model(
    y_pred=y_pred_majority,
    y_test=y_test,
    model_name="Majority_Guess",
    y_proba=y_probs_majority,
)

Majority_Guess:
Accuracy: 0.2795142792894086
Macro-F1: 0.06241526487572181
Weighted-F1: 0.12212170444876445
Log-Loss: 25.968937589100822
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       498
           1       0.28      1.00      0.44      1243
           2       0.00      0.00      0.00       641
           3       0.00      0.00      0.00       940
           4       0.00      0.00      0.00       458
           5       0.00      0.00      0.00       498
           6       0.00      0.00      0.00       169

    accuracy                           0.28      4447
   macro avg       0.04      0.14      0.06      4447
weighted avg       0.08      0.28      0.12      4447



# 4. Comparison

In [7]:
# Stack the two baseline result frames row-wise into one comparison table.
# ignore_index=True renumbers the rows 0..n-1 instead of keeping each frame's
# own index (the recorded output shows rows 0 and 1, one per model).
df_eval = pd.concat(
    objs=[random_df, majority_df],
    axis=0,
    ignore_index=True,
)

print(df_eval)

            Model  Accuracy  Macro-F1  Weighted-F1   Log-Loss
0    Random_Guess  0.131999  0.121054     0.142651   1.945910
1  Majority_Guess  0.279514  0.062415     0.122122  25.968938


In [8]:
# Persist the baseline comparison table so it can be reused outside this
# notebook. save_df is a project helper (it printed "Data saved!" in the
# recorded run); its exact write options are not visible here.
save_df(
    df_eval,
    "data/model_comparison.csv",
)

Data saved!
