# Model Training & Evaluation

In [None]:
# Load (or reload) IPython's autoreload extension; mode 2 re-imports all
# modules before each cell runs, so edits to imported modules take effect
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
import os
import sys

# Make the project root (the parent of the notebook's working directory)
# importable so that `src` resolves as a package, e.g. `from src.model import ...`.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries on sys.path.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
import warnings

import pandas as pd

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

# Silence all warnings for the rest of the session to keep cell output clean.
warnings.filterwarnings(action='ignore')

In [None]:
# Shared resampling count: used further down as the number of cross-validation
# folds for the randomized hyperparameter search and as `n_repeats` for the
# permutation-importance computation.
FOLDS = 3

In [None]:
import joblib
from IPython.display import display

# The processed dataset is stored as a single joblib pickle holding the four
# train/test pieces in this order.
splits = joblib.load('../data/processed/accepted_2007_to_2018Q4.pkl')
X_train, X_test, y_train, y_test = splits

## Training

### Train the model

In [None]:
from src.model import create_model

# Build the baseline estimator through the project factory, forwarding
# eval_metric='logloss' to it.
model = create_model(eval_metric='logloss')

# Fit on the training split. Left as the cell's last expression, so the
# notebook displays whatever `fit` returns.
model.fit(X_train, y_train)

## Inference
### Make predictions

In [None]:
# Hard class predictions for the test split.
y_pred = model.predict(X_test)
# Column 1 of the predict_proba output; presumably the positive-class
# probability of a binary classifier (it is later used as the ROC-AUC score
# input) -- TODO confirm the class order.
y_proba = model.predict_proba(X_test)[:,1]

## Evaluation

### Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Share of test-split rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

### Classification Report

In [None]:
from sklearn.metrics import classification_report

# Text summary of per-class metrics for the baseline predictions on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

### Confusion Matrix

In [None]:
# Confusion matrix
from src import visualize

# Project helper from src.visualize, called with the true and predicted test
# labels; presumably renders the confusion-matrix plot -- see src/visualize.
visualize.confusion_matrix(y_test, y_pred)

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score

# ROC-AUC score
# Computed from the predicted probabilities (y_proba), not the hard labels.
roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Plot ROC curve
# `visualize` is the project module imported in the confusion-matrix cell; the
# precomputed AUC is passed along (presumably for the plot annotation).
visualize.roc_curve(y_test, y_proba, roc_auc)

### Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated ROC-AUC on the training split. The fold count is
# hard-coded here and is independent of FOLDS.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=5,
)

# Report the mean and spread of the per-fold scores.
print(
    "Cross Validation ROC-AUC Scores:",
    f"Mean: {cv_scores.mean():.4f}",
    f"Standard deviation: {cv_scores.std():.2e}",
    sep="\n",
)

## Feature Importance

In [None]:
# NOTE(review): RandomForestRegressor and make_regression are imported but not
# used anywhere in this cell.
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted baseline model, measured on the
# training split with FOLDS shuffles per feature and a fixed seed.
result = permutation_importance(
    model,
    X_train,
    y_train,
    n_repeats=FOLDS,
    random_state=42,
)

# Mean importance per feature across the repeats.
importances = result.importances_mean

# One row per feature, most important first.
feature_importances = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

# Lift the row limit just for this display so the full ranking is visible.
with pd.option_context('display.max_rows', None):
    display(feature_importances)


## Hyperparameter Tuning

### Random search

In [None]:
import random

from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

# Takes 14.5min at 30 iterations
# Search space for the randomized search.
# NOTE: scipy's randint(low, high) excludes `high`, so randint(2, 3) always
# yields max_depth=2; widen the upper bound to actually search over depth.
param_dist = {
    "max_depth": stats.randint(2, 3),
    "learning_rate": stats.uniform(loc=0.93, scale=0.07),     # [0.93, 1.00]
    "n_estimators": stats.randint(100, 1000),                 # 100..999
    # Normal around 0.85 truncated at +/-3 sigma -> [0.70, 1.00]. An unbounded
    # normal can draw values above 1, which is not a valid sampling fraction.
    "subsample": stats.truncnorm(a=-3, b=3, loc=0.85, scale=0.05),
    "colsample_bytree": stats.uniform(loc=0.98, scale=0.02),  # [0.98, 1.00]
}

# Setup the randomized search.
# `scoring='roc_auc'` is explicit so that candidates are ranked by ROC-AUC and
# `best_score_` really is the ROC-AUC value reported below; without it the
# search falls back to the estimator's default `score` method.
search = RandomizedSearchCV(
    estimator=create_model(eval_metric='logloss'),
    param_distributions=param_dist,
    n_iter=30, # ~28sec/iter
    scoring='roc_auc',
    cv=FOLDS,
    verbose=0,
    random_state=42,
    n_jobs=(-1)
)

# Fit the model
search.fit(X_train, y_train)

# Best Model
print("Best Parameters:", search.best_params_)
print(f"Best ROC-AUC Score: {search.best_score_:.8f}")

### Retrain with these hyperparameters

In [None]:
# Rebuild the estimator with the winning hyperparameters. eval_metric is passed
# explicitly so the retrained model is configured like the baseline model and
# the estimator that was tuned in the search, both of which use 'logloss'.
best_params = search.best_params_
model_best = create_model(eval_metric='logloss', **best_params)
# Fit on the full training split; kept as the last expression so the notebook
# displays whatever `fit` returns.
model_best.fit(X_train, y_train)

### Evaluate the optimized model

In [None]:
# Score the tuned model on the test split: hard labels plus the column-1
# probabilities from predict_proba.
y_pred_best = model_best.predict(X_test)
y_proba_best = model_best.predict_proba(X_test)[:,1]

# Accuracy of the tuned model (accuracy_score was imported in the baseline
# accuracy cell).
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(f"Optimized Accuracy: {accuracy_best:.4f}")

# Per-class metrics of the tuned model (classification_report was imported in
# the baseline report cell).
report_best = classification_report(y_true=y_test, y_pred=y_pred_best)
print("Optimized Classification Report:\n", report_best)

In [None]:
# ROC-AUC score of the tuned model.
# Stored under its own name (matching accuracy_best / y_pred_best /
# y_proba_best) so the baseline `roc_auc` value is not overwritten and the two
# models stay comparable regardless of cell execution order.
roc_auc_best = roc_auc_score(y_test, y_proba_best)
print(f"Optimized ROC-AUC Score: {roc_auc_best:.4f}")

# Plot ROC curve for the tuned model using the project plotting helper.
visualize.roc_curve(y_test, y_proba_best, roc_auc_best)