### Import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 

from xgboost import XGBClassifier

# import own modules
# Make the parent directory importable so the project-local `scripts` package resolves.
sys.path.append("..")  # Adds higher directory to python modules path.
from scripts import features as ft
from scripts import preprocessing as pp
from scripts import evaluate_models as em

# Plot styling: the stylesheet is referenced by URL; the dark variant is kept
# commented out as an alternative to the light one that is active.
#plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle')

import pickle

---
## Preparations

### Load the CSV into a Dataframe

- load csv (or calculate again if not in data folder)
- update index=id
- drop useless columns
- find numerical & object columns

In [None]:
# location of the cached feature table
path_df = os.path.join("..", "data", "df_deepgaze2e.csv")

# reuse the cached CSV unless it is missing or a rebuild is requested
recalculate_df = False
if recalculate_df or not os.path.isfile(path_df):
    df = ft.get_features()
    df.to_csv(path_df, index=False)
else:
    df = pd.read_csv(path_df)

# the "id" column becomes the index
df = df.set_index("id", drop=True)

# first round of column removal: "img", "sp_idx" and every column whose
# name contains "_obj"
obj_like = [name for name in df.columns if "_obj" in name]
df = df.drop(columns=["img", "sp_idx", *obj_like])

# split the remaining columns by dtype: object -> categorical, rest -> numerical
is_object = df.dtypes == "object"
num_cols = df.columns[~is_object]
cat_cols = df.columns[is_object]

# short summary of the resulting frame
n_rows, n_columns = df.shape
print(f" -> dataframe has {n_rows} instances and {n_columns} columns")
print(f" -> there are {len(num_cols)} numerical columns")
print(f" -> there are {len(cat_cols)} categoricals columns")

### Checking for highly correlated columns
After running these lines, consider which columns should additionally be dropped.

In [None]:
# Computing the correlation matrix
corr_matrix = df[num_cols].corr()

# Find pairs with |correlation| >= 0.8.
# Only the strict upper triangle (k=1) is inspected, so
#   * the diagonal (a column with itself) is excluded by position instead of
#     by comparing floats against exactly 1,
#   * perfectly correlated *different* columns (correlation == 1) are no
#     longer filtered out by accident,
#   * every unordered pair shows up exactly once, so no bookkeeping of
#     already-seen pairs is needed.
abs_corr = corr_matrix.abs().to_numpy()
upper_triangle = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
high_corr_pairs = np.column_stack(np.where((abs_corr >= 0.8) & upper_triangle))

# Extracting and printing the pairs; `high_corr_cols` collects both column
# names of every reported pair (a column may appear more than once).
high_corr_cols = []
for i, j in high_corr_pairs:
    col1, col2 = corr_matrix.columns[i], corr_matrix.columns[j]
    print(
        f"Correlation between {col1} and {col2} is {round(corr_matrix.iloc[i, j], 3)}"
    )
    high_corr_cols.append(col1)
    high_corr_cols.append(col2)

### further processing

... drop more columns, create new ones, handle highly correlating columns.....

### Split into train & test sets

As soon as the dataset is in its final form, perform the train-test split with our own split function, so that our 30-image set is always used as the test set.

In [None]:
# prepare features and target
# X is built as a new frame instead of popping the target from an alias of
# `df`: with `X = df`, `X.pop("asd")` also removed the column from `df`, so
# re-running this cell (or any earlier cell that still expects "asd" in `df`)
# raised a KeyError.
X = df.drop(columns="asd")
y = df["asd"]

# train-test-split (project-specific split helper)
X_train, X_test, y_train, y_test = pp.split(X, y)

# print info
print(f"train-set has '{len(y_train)}' samples & '{X.shape[1]}' features")
print(f"test-set has '{len(y_test)}' samples - out of '{df.shape[0]}'")
print(f"  ~ {len(y_test) / df.shape[0] * 100:.2f}% of full dataset")

### Set variables

- define `metric`
- behavior for saving models as pickles
- defaults for model-objects

In [None]:
# metric
# F-beta scorer with beta=2; it is passed as `scoring` to the GridSearchCV
# objects in the modeling section.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# shared defaults
RSEED = 42  # random seed intended for the model objects
cv, n_jobs, verbose = 10, -1, 1  # values handed to GridSearchCV (cv / n_jobs / verbose)

---
## Model - Preparations

### Column Transformers for Data Preprocessing
These are only used for certain models, which we expect to perform better with preprocessed data.

In [None]:
# `num_cols` / `cat_cols` were collected while the target column "asd" was
# still part of the dataframe. Restrict them to the columns that are really
# present in the feature matrix `X`; otherwise the ColumnTransformer is asked
# to transform a column that does not exist in the data it is fitted on.
num_features = [col for col in num_cols if col in X.columns]
cat_features = [col for col in cat_cols if col in X.columns]

# add other transformations at the end if needed
transformer = [
    ("scaler", MinMaxScaler(), num_features),          # scale numerical features
    ("ohe", OneHotEncoder(drop="first"), cat_features),  # one-hot encode categoricals
]

# columns not named above are passed through unchanged
preprocessing = ColumnTransformer(transformer, remainder="passthrough")

###  Pipelines for each Model

In [None]:
# All classifiers receive `random_state=RSEED` so that repeated runs of the
# notebook give the same models; the seed was defined with the other defaults
# but had not been applied to any estimator.

# Random Forest: no scaling / no encoding
rf_pipeline = Pipeline([
    ("classifier", RandomForestClassifier(random_state=RSEED))
])

# XGBoost: apply scaling / encoding
xgb_pipeline = Pipeline([
    ("preprocessor", preprocessing),
    ("classifier", XGBClassifier(random_state=RSEED))
])

# Support Vector Classifier: apply scaling / encoding
svc_pipeline = Pipeline([
    ("preprocessor", preprocessing),
    ("classifier", SVC(random_state=RSEED))
])

# Logistic Regression: apply scaling / encoding
# (max_iter raised to 1000 to give the solver more iterations)
log_pipeline = Pipeline([
    ("preprocessor", preprocessing),
    ("classifier", LogisticRegression(max_iter=1000, random_state=RSEED))
])

### Parameter Grids

In [None]:
# Random Forest: hyper-parameter candidates, keyed for the pipeline's
# "classifier" step
param_grid_rf = {
    f"classifier__{name}": candidates
    for name, candidates in (
        ("n_estimators", [100, 200, 300]),
        ("max_depth", [None, 10, 20]),
        ("min_samples_split", [2, 5, 10]),
        ("min_samples_leaf", [1, 2, 4]),
    )
}

# XGBoost: hyper-parameter candidates, keyed for the pipeline's
# "classifier" step
param_grid_xgb = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[3, 5, 7, None],
    classifier__learning_rate=[0.1, 0.01, 0.001],
)

# Support Vector Classifier
# The grid is split by kernel so that only parameters a kernel actually uses
# are varied: `degree` matters for "poly" only and `gamma` is ignored by the
# "linear" kernel. The full cross product contained many candidates that
# differ only in an ignored parameter (96 candidates instead of 44), which
# multiplied the search time without changing any model.
svc_c_values = [0.1, 1, 10, 100]      # Regularization parameter values
svc_gamma_values = ['scale', 'auto']  # Kernel coefficient for rbf / poly / sigmoid
param_grid_svc = [
    {
        'classifier__kernel': ['linear'],
        'classifier__C': svc_c_values,
    },
    {
        'classifier__kernel': ['rbf', 'sigmoid'],
        'classifier__C': svc_c_values,
        'classifier__gamma': svc_gamma_values,
    },
    {
        'classifier__kernel': ['poly'],
        'classifier__C': svc_c_values,
        'classifier__gamma': svc_gamma_values,
        'classifier__degree': [2, 3, 4],  # Degree of the polynomial kernel
    },
]

# Logistic Regression
# The solver is pinned to "liblinear" because it supports both the l1 and the
# l2 penalty. With LogisticRegression's default solver ("lbfgs") every l1
# candidate fails to fit, so half of the grid was never really evaluated.
param_grid_log = {
    'classifier__penalty': ['l1', 'l2'],  # Penalty type: l1 (Lasso) or l2 (Ridge)
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Regularization strength
    'classifier__solver': ['liblinear'],  # solver compatible with both penalties
}

---
## Modeling

### Random Forest

In [None]:
# Search settings for the Random Forest pipeline
rf_search_settings = {
    "param_grid": param_grid_rf,
    "cv": cv,
    "scoring": ftwo_scorer,
    "n_jobs": n_jobs,
    "verbose": verbose,
}

# Build the search object and fit it on the training split
grid_search_rf = GridSearchCV(rf_pipeline, **rf_search_settings)
grid_search_rf.fit(X_train, y_train)

# Keep the winning parameter combination and the corresponding estimator
best_params_rf, best_est_rf = (
    grid_search_rf.best_params_,
    grid_search_rf.best_estimator_,
)
print("Best params for RF are:", best_params_rf)
print("Best est for RF are:", best_est_rf)

In [None]:
# class predictions and class probabilities from the fitted RF search,
# for the training split ...
pred_train, proba_train = (
    grid_search_rf.predict(X_train),
    grid_search_rf.predict_proba(X_train),
)

# ... and for the held-out test split
pred_test, proba_test = (
    grid_search_rf.predict(X_test),
    grid_search_rf.predict_proba(X_test),
)

In [None]:
# evaluate model: hand labels, predictions and probabilities of both splits
# to the project's report helper
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# learning curves - for one model - f2 score
# Calls the project's learning-curve helper with the best RF pipeline and the
# training split only.
# NOTE(review): the f2 metric is presumably configured inside `em.learning` — confirm there.
em.learning(best_est_rf, X_train, y_train)

### XGBoost

In [None]:
# Create GridSearchCV object
# Uses the same shared defaults as the Random Forest search (`n_jobs`,
# `verbose`), which were previously not passed here, so this search ran
# without parallel fits and without progress output.
grid_search_xgb = GridSearchCV(
    xgb_pipeline,
    param_grid=param_grid_xgb,
    cv=cv,
    scoring=ftwo_scorer,
    n_jobs=n_jobs,
    verbose=verbose,
)
grid_search_xgb.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_xgb = grid_search_xgb.best_params_
best_est_xgb = grid_search_xgb.best_estimator_
print("Best params for XGB are:", best_params_xgb)
print("Best est for XGB are:", best_est_xgb)

### Logistic Regression Pipeline - Best Params/Est

In [None]:
# Create GridSearchCV object
# Uses the same shared defaults as the Random Forest search (`n_jobs`,
# `verbose`), which were previously not passed here, so this search ran
# without parallel fits and without progress output.
grid_search_log = GridSearchCV(
    log_pipeline,
    param_grid=param_grid_log,
    cv=cv,
    scoring=ftwo_scorer,
    n_jobs=n_jobs,
    verbose=verbose,
)
grid_search_log.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_log = grid_search_log.best_params_
best_est_log = grid_search_log.best_estimator_
print("Best params for LogReg are:", best_params_log)
print("Best est for LogReg are:", best_est_log)

### Support Vector Classifier Pipeline - Best Params/Est

In [None]:
# Create GridSearchCV object
# Uses the same shared defaults as the Random Forest search (`n_jobs`,
# `verbose`) so the search runs in parallel and reports progress.
grid_search_svc = GridSearchCV(
    svc_pipeline,
    param_grid=param_grid_svc,
    cv=cv,
    scoring=ftwo_scorer,
    n_jobs=n_jobs,
    verbose=verbose,
)
grid_search_svc.fit(X_train, y_train)

# Get the best parameters and best estimator
# These are read from the SVC search itself; they were previously taken from
# `grid_search_log`, which reported the Logistic Regression results under the
# SVC names.
best_params_svc = grid_search_svc.best_params_
best_est_svc = grid_search_svc.best_estimator_
print("Best params for SVC are:", best_params_svc)
print("Best est for SVC are:", best_est_svc)