<span style="color:red">
This notebook gives a framework to train and evaluate a model. You can copy it and adapt it to your needs.
</span>

# Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, roc_auc_score, roc_curve

# from sklearn.linear_model import LogisticRegression  # Example - uncomment and modify as needed
# from sklearn.ensemble import RandomForestClassifier  # Example
from sklearn.model_selection import (
    cross_val_predict,
    cross_val_score,
    train_test_split,
)

import constants.constants as cst
from src.preprocessing import preprocess_data
from src.utils.calibration import plot_calibration_curve
from src.utils.compute_metrics import compute_and_store_metrics
from src.utils.confusion_matrix import plot_confusion_matrix
from src.utils.load_data import load_data
from src.utils.model_utils import save_model
from src.utils.plot_roc import plot_roc_curve


# Load Data

In [None]:
# Read the full dataset, then separate the target column from the features.
data = load_data()
y = data[cst.TARGET]
X = data.drop(columns=cst.TARGET)

# Model

## Your Model

In [None]:
# Train / test split: stratified on the target, 20% held out, fixed seed.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Glue features and target back together per split: the preprocessing step
# below is handed one frame per split and returns the target separately.
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [4]:
# Fit the preprocessor on the training split only (fit=True). Per the log
# output of this cell, preprocess_data creates a new preprocessor, fits it and
# saves it to disk as a side effect.
# NOTE: y_train / y_test are rebound here to the targets returned by
# preprocess_data, replacing the ones produced by train_test_split.
preprocessed_train_data, y_train, preprocessor = preprocess_data(train_data, fit=True)

# Transform the test split with the already-fitted preprocessor (fit=False) so
# that no statistics are learned from the test data.
preprocessed_test_data, y_test, _ = preprocess_data(
    test_data, preprocessor=preprocessor, fit=False
)

[32m2025-10-25 16:10:15.862[0m | [1mINFO    [0m | [36msrc.preprocessing[0m:[36mpreprocess_data[0m:[36m61[0m - [1mNo preprocessor provided. Creating a new one.[0m
[32m2025-10-25 16:10:15.865[0m | [1mINFO    [0m | [36msrc.preprocessing[0m:[36mpreprocess_data[0m:[36m71[0m - [1mFitting and transforming data.[0m
[32m2025-10-25 16:10:15.897[0m | [1mINFO    [0m | [36msrc.utils.model_utils[0m:[36msave_model[0m:[36m39[0m - [1mModel saved to: /home/augustin/projects/xhec_dsb/05_intro_to_finance_for_data_scientists/models/preprocessor.pkl[0m
[32m2025-10-25 16:10:15.898[0m | [1mINFO    [0m | [36msrc.preprocessing[0m:[36mpreprocess_data[0m:[36m74[0m - [1mPreprocessor fitted and saved.[0m


In [None]:
## Model - REPLACE WITH YOUR MODEL
# Example: model = LogisticRegression(random_state=42)
# Example: model = RandomForestClassifier(random_state=42)
model_baseline = ...  # <-- Replace this with your actual model

# CRITICAL: the model must be trained on the *preprocessed* training data.
model_baseline.fit(preprocessed_train_data, y_train)

# Score the test split and keep the second probability column (index 1),
# which is treated as the positive-class probability in the rest of the notebook.
baseline_test_proba = model_baseline.predict_proba(preprocessed_test_data)
y_pred_proba_baseline = baseline_test_proba[:, 1]

In [None]:
# Turn baseline probabilities into hard 0/1 labels with a fixed cut-off.
threshold = 0.5
baseline_is_positive = y_pred_proba_baseline >= threshold
y_pred_baseline = baseline_is_positive.astype(int)

In [None]:
# Confusion matrix of the baseline's thresholded predictions on the test split.
plot_confusion_matrix(y_test, y_pred_baseline, cmap="Blues")

In [None]:
# Feature importance (if model supports it)
if hasattr(model_baseline, "feature_importances_"):
    # Model exposes `feature_importances_`: rank features by that score.
    feature_names = preprocessor.get_feature_names_out()
    importance_df = pd.DataFrame(
        {"feature": feature_names, "importance": model_baseline.feature_importances_}
    )
    importance_df = importance_df.sort_values("importance", ascending=False)

    print("\nTop 10 Most Important Features:")
    print(importance_df.head(10))

    # Horizontal bar chart of the 15 strongest features, strongest at the top.
    top_features = importance_df.head(15)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(top_features["feature"], top_features["importance"])
    ax.set_xlabel("Importance")
    ax.set_title("Top 15 Feature Importances")
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()
elif hasattr(model_baseline, "coef_"):
    # Model exposes `coef_`: rank features by the absolute value of the first
    # row of coefficients.
    feature_names = preprocessor.get_feature_names_out()
    coefficient_magnitudes = np.abs(model_baseline.coef_[0])
    importance_df = pd.DataFrame(
        {"feature": feature_names, "coefficient": coefficient_magnitudes}
    )
    importance_df = importance_df.sort_values("coefficient", ascending=False)

    print("\nTop 10 Most Important Features (by coefficient magnitude):")
    print(importance_df.head(10))

In [None]:
# 5-fold cross-validated F1 of the baseline model, computed on the training split only.
cv_scores = cross_val_score(
    estimator=model_baseline,
    X=preprocessed_train_data,
    y=y_train,
    scoring="f1",
    cv=5,
)

# Summarise as mean with a two-standard-deviation band.
cv_mean_f1 = cv_scores.mean()
cv_band_f1 = cv_scores.std() * 2
print(f"Cross-validation F1 scores: {cv_scores}")
print(f"Mean CV F1: {cv_mean_f1:.4f} (+/- {cv_band_f1:.4f})")

In [None]:
# Plot the ROC curve of the baseline model's test-split probabilities.
plot_roc_curve(y_test, y_pred_proba_baseline)

In [None]:
# Calibration curve of the baseline model's test-split probabilities.
plot_calibration_curve(y_test, y_pred_proba_baseline)

## Hyperparameter fine-tuning (Optuna)

In [None]:
# Search configuration for the Optuna study below.
cv_splits = 5  # number of cross-validation folds used inside `objective`
n_trials = 200  # number of trials passed to `study.optimize`

In [None]:
def objective(trial):
    """Score one Optuna trial by its mean cross-validated F1 on the training data.

    This is a template: adapt the suggested hyperparameters and the model
    construction to your own estimator.

    Args:
        trial: Optuna trial object used to sample hyperparameters through its
            ``suggest_*`` methods.

    Returns:
        Mean of the F1 scores obtained over ``cv_splits`` cross-validation
        folds of the preprocessed training data.
    """

    # Example for RandomForestClassifier - MODIFY FOR YOUR MODEL
    # n_estimators = trial.suggest_int('n_estimators', 50, 300)
    # max_depth = trial.suggest_int('max_depth', 3, 20)
    # min_samples_split = trial.suggest_int('min_samples_split', 2, 20)

    # Example for LogisticRegression - MODIFY FOR YOUR MODEL
    # C = trial.suggest_float('C', 0.001, 100.0, log=True)
    # penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    # solver = 'liblinear'  # Required for l1 penalty

    # Suggest hyperparameters for YOUR model here
    # param1 = trial.suggest_...
    # param2 = trial.suggest_...

    # Create model with suggested hyperparameters
    # Example: model = RandomForestClassifier(
    #     n_estimators=n_estimators,
    #     max_depth=max_depth,
    #     min_samples_split=min_samples_split,
    #     random_state=42
    # )

    model = ...  # <-- Replace with your model using suggested hyperparameters

    # Cross-validate on the TRAINING data only; averaging over folds is more
    # robust than scoring a single split.
    fold_scores = cross_val_score(
        estimator=model,
        X=preprocessed_train_data,
        y=y_train,
        scoring="f1",  # or 'roc_auc', 'accuracy', etc.
        cv=cv_splits,  # 5-fold cross-validation
        n_jobs=-1,  # Use all CPU cores
    )

    # The study maximizes this value.
    return fold_scores.mean()


# Create and run the study.
# The sampler is seeded so that Restart Kernel -> Run All proposes the same
# sequence of trials; an unseeded sampler gives different "best" parameters on
# every run. TPESampler is Optuna's default sampler, so only the seed changes.
study = optuna.create_study(
    direction="maximize",  # Maximize F1-score
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

# Report the best cross-validated score and the hyperparameters that produced it.
print("\n" + "=" * 60)
print("BEST HYPERPARAMETERS")
print("=" * 60)
print(f"Best CV F1-Score: {study.best_value:.4f}")
print("\nBest Parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

### Evaluate Best Model on Test Set

Now we evaluate the best hyperparameters found by Optuna on the held-out test set.

In [None]:
# Train the final model with best parameters on full training set
best_params = study.best_params.copy()

# Example: model = RandomForestClassifier(**best_params, random_state=42)
# Example: model = LogisticRegression(**best_params, random_state=42)
model_optimized = ...  # <-- Replace with your model using **best_params

# Pick the decision threshold from OUT-OF-FOLD predictions on the training
# data. Tuning it on the test set (as a threshold sweep over y_test would do)
# leaks test information and makes the reported test F1 optimistically biased.
# cross_val_predict works on clones, so model_optimized itself is left untouched.
oof_proba_optimized = cross_val_predict(
    model_optimized,
    preprocessed_train_data,
    y_train,
    cv=cv_splits,
    method="predict_proba",
)[:, 1]

thresholds = np.arange(0.3, 0.7, 0.01)
f1_scores = [
    f1_score(y_train, (oof_proba_optimized >= t).astype(int)) for t in thresholds
]
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Fit on the full training split, then score the untouched test split once.
model_optimized.fit(preprocessed_train_data, y_train)

# Make predictions on test set
y_pred_proba_optimized = model_optimized.predict_proba(preprocessed_test_data)[:, 1]
y_pred_optimized = (y_pred_proba_optimized >= optimal_threshold).astype(int)

test_f1_optimized = f1_score(y_test, y_pred_optimized)

print(f"Optimal threshold (out-of-fold, training set): {optimal_threshold:.3f}")
print(f"Out-of-fold F1-Score at that threshold: {max(f1_scores):.4f}")
print(f"Test F1-Score: {test_f1_optimized:.4f}")

In [None]:
# Confusion matrix of the optimized model's predictions (thresholded at
# `optimal_threshold`) on the test split.
plot_confusion_matrix(y_test, y_pred_optimized, cmap="Greens")

In [None]:
# ROC curve of the optimized model's test-split probabilities.
plot_roc_curve(y_test, y_pred_proba_optimized)

In [None]:
# Compare both models side by side

# ROC points and AUC for each model, all measured on the test split.
fpr_baseline, tpr_baseline, _ = roc_curve(y_test, y_pred_proba_baseline)
auc_baseline = roc_auc_score(y_test, y_pred_proba_baseline)

fpr_optimized, tpr_optimized, _ = roc_curve(y_test, y_pred_proba_optimized)
auc_optimized = roc_auc_score(y_test, y_pred_proba_optimized)

# Draw both curves, plus the chance diagonal, on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
roc_series = [
    (fpr_baseline, tpr_baseline, "blue", f"Baseline Model (AUC = {auc_baseline:.4f})"),
    (
        fpr_optimized,
        tpr_optimized,
        "green",
        f"Optimized Model (AUC = {auc_optimized:.4f})",
    ),
]
for fpr, tpr, line_color, legend_label in roc_series:
    ax.plot(fpr, tpr, color=line_color, lw=2, label=legend_label)
ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="Random Guess")

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve Comparison: Baseline vs Optimized")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

# Absolute and relative AUC change of the optimized model over the baseline.
auc_gain = auc_optimized - auc_baseline
auc_gain_pct = auc_gain / auc_baseline * 100
print(f"Baseline AUC: {auc_baseline:.4f}")
print(f"Optimized AUC: {auc_optimized:.4f}")
print(f"Improvement: {auc_gain:.4f} ({auc_gain_pct:.2f}%)")

In [None]:
# Calibration curve of the optimized model's test-split probabilities
# (before any calibration step is applied).
plot_calibration_curve(y_test, y_pred_proba_optimized)

In [None]:
# Calibrate the optimized model

# Cross-validated calibration instead of cv="prefit": with "prefit" the
# isotonic calibrator would be fitted on the very data the model was trained
# on, which biases the calibration (scikit-learn requires fitting and
# calibration data to be disjoint in that mode). With ensemble=False the
# calibrator is fitted on out-of-fold predictions and the classifier used at
# prediction time is a clone of model_optimized refitted on all training data.
model_calibrated = CalibratedClassifierCV(
    model_optimized, method="isotonic", cv=cv_splits, ensemble=False
)
model_calibrated.fit(preprocessed_train_data, y_train)

y_pred_proba_calibrated = model_calibrated.predict_proba(preprocessed_test_data)[:, 1]

# Find the decision threshold on out-of-fold TRAINING predictions of the
# calibrated pipeline, so the test split stays untouched for evaluation.
oof_proba_calibrated = cross_val_predict(
    model_calibrated,
    preprocessed_train_data,
    y_train,
    cv=cv_splits,
    method="predict_proba",
)[:, 1]

thresholds = np.arange(0, 1, 0.01)
f1_scores = [
    f1_score(y_train, (oof_proba_calibrated >= t).astype(int)) for t in thresholds
]
calibrated_threshold = thresholds[np.argmax(f1_scores)]

# Apply the threshold tuned for the CALIBRATED probabilities; the threshold of
# the uncalibrated model (`optimal_threshold`) lives on a different scale.
y_pred_calibrated = (y_pred_proba_calibrated >= calibrated_threshold).astype(int)

In [None]:
# Calibration curve of the CALIBRATED model's test-split probabilities, to
# check the effect of the calibration step (the baseline curve is already
# plotted earlier in the notebook).
plot_calibration_curve(y_test, y_pred_proba_calibrated)

In [None]:
# Compute and store test-split metrics for the optimized and the calibrated
# predictions through the project helper.
# NOTE(review): the "Model_Name_*" values look like template placeholders to be
# replaced with the real model name - confirm before running.
compute_and_store_metrics(y_test, y_pred_optimized, model_name="Model_Name_Optimized")

compute_and_store_metrics(y_test, y_pred_calibrated, model_name="Model_Name_Calibrated")

In [None]:
# Final training on the entire dataset
# Preprocess full dataset (a fresh preprocessor is fitted on all rows)
full_data = pd.concat([X, y], axis=1)
preprocessed_full_data, y_full, final_preprocessor = preprocess_data(
    full_data, fit=True
)

# Final estimator with the best hyperparameters
# Example: final_model = RandomForestClassifier(**best_params, random_state=42)
final_model = ...  # <-- Replace with your model using **best_params

# Fit + calibrate in one step. CalibratedClassifierCV clones `final_model`;
# with ensemble=False the isotonic calibrator is fitted on out-of-fold
# predictions and the classifier used at prediction time is trained on ALL the
# data. This replaces fitting the model and then calibrating it with
# cv="prefit" on the same rows, which yields a biased calibrator.
model_calibrated_final = CalibratedClassifierCV(
    final_model, method="isotonic", cv=cv_splits, ensemble=False
)
model_calibrated_final.fit(preprocessed_full_data, y_full)

# Save both model and preprocessor
save_model(model_calibrated_final, model_name="Model_Name_Calibrated")
save_model(final_preprocessor, model_name="Model_Name_preprocessor")

print(f"✓ Model trained on full dataset ({len(y_full)} samples)")
# The threshold is NOT stored inside the saved model; it must be applied by the
# code that consumes the model's probabilities.
print(
    f"✓ Decision threshold to apply at inference (not stored in the model): {calibrated_threshold:.3f}"
)