# Model Development

**Carlos Bustillo | Prosigliere Tech Interview**
<br>Runtime: Google Colab with a T4 GPU.

## 0) Dependencies

In [1]:
# Use the %pip magic so packages are installed into the kernel's own environment.
# imbalanced-learn provides the `imblearn` module imported below.
# TODO: pin exact versions for reproducibility.
%pip install xgboost imbalanced-learn



In [2]:
import os
import joblib
import itertools

import numpy as np
from sklearn.metrics import f1_score, classification_report, roc_auc_score
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

## 1) Load datasets

Load the cleaned train, validation, and test splits from local storage.
<br>They were produced by the first notebook (0__EDA.ipynb).

In [3]:
# -o overwrites already-extracted files without prompting, so re-running this cell
# (e.g. Restart & Run All) never blocks on an interactive "replace?" question.
!unzip -o "/content/processed_data.zip"

Archive:  /content/processed_data.zip
   creating: content/processed_data/
  inflating: content/processed_data/X_test.joblib  
  inflating: content/processed_data/y_test.joblib  
  inflating: content/processed_data/X_val.joblib  
  inflating: content/processed_data/y_train.joblib  
  inflating: content/processed_data/X_train.joblib  
  inflating: content/processed_data/y_val.joblib  


In [4]:
# Folder holding the joblib artifacts extracted from processed_data.zip.
processed_dir = "/content/content/processed_data"

# Load the six split artifacts in a fixed order and unpack them onto their names.
# NOTE: joblib.load unpickles its input, so only load files you trust.
X_train, y_train, X_val, y_val, X_test, y_test = (
    joblib.load(os.path.join(processed_dir, filename))
    for filename in (
        "X_train.joblib",
        "y_train.joblib",
        "X_val.joblib",
        "y_val.joblib",
        "X_test.joblib",
        "y_test.joblib",
    )
)

## 2) Define models

**Random Forest**

In [5]:
def train_rf_with_params(params):
    """Train a random forest on resampled training data and score it on validation.

    The global training split is resampled with SMOTEENN on every call, a
    RandomForestClassifier is fitted on the resampled data, and the F1 score of
    its predictions on the global validation split is computed.

    Args:
        params: Keyword arguments forwarded to RandomForestClassifier
            (random_state=42 and n_jobs=-1 are always added).

    Returns:
        A ``(f1, model)`` tuple: the validation F1 score and the fitted model.
    """
    resampler = SMOTEENN(random_state=42)
    X_balanced, y_balanced = resampler.fit_resample(X_train, y_train)

    forest = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    forest.fit(X_balanced, y_balanced)

    # Validation data is left untouched: only the training split is resampled.
    return f1_score(y_val, forest.predict(X_val)), forest

# Random forest search space: 4 * 4 * 3 * 2 = 96 combinations.
# Key order matters because combinations are enumerated in this order.
param_grid_rf = dict(
    n_estimators=[100, 200, 500, 1000],
    max_depth=[10, 20, 50, 100],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 10],
)

**SVM (with scaled data)**

In [6]:
def train_svm_with_params(params):
    """Train a scaled SVM on resampled training data and score it on validation.

    The global training split is resampled with SMOTEENN on every call, a
    StandardScaler is fitted on the resampled features, and an SVC is trained
    on the scaled result. The validation features go through the same fitted
    scaler before prediction.

    Args:
        params: Keyword arguments forwarded to SVC (probability=True and
            random_state=42 are always added).

    Returns:
        A ``(f1, model, scaler)`` tuple: the validation F1 score, the fitted
        SVC, and the fitted scaler needed to transform inputs for that model.
    """
    resampler = SMOTEENN(random_state=42)
    X_balanced, y_balanced = resampler.fit_resample(X_train, y_train)

    # The scaler is fitted on the resampled training data only and then
    # reused, unchanged, for the validation features.
    scaler = StandardScaler()
    X_fit = scaler.fit_transform(X_balanced)
    X_check = scaler.transform(X_val)

    svm = SVC(**params, probability=True, random_state=42)
    svm.fit(X_fit, y_balanced)

    return f1_score(y_val, svm.predict(X_check)), svm, scaler

# SVM search space: 3 * 2 * 1 = 6 combinations (RBF kernel only).
# Key order matters because combinations are enumerated in this order.
param_grid_svm = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto'],
    kernel=['rbf'],
)

**XGBoost**

In [12]:
def train_xgb_with_params(params):
    """Train an XGBoost classifier on resampled training data and score it on validation.

    The global training split is resampled with SMOTEENN on every call, an
    XGBClassifier is fitted on the resampled data, and the F1 score of its
    predictions on the global validation split is computed.

    Args:
        params: Keyword arguments forwarded to XGBClassifier
            (eval_metric='logloss' and random_state=42 are always added).

    Returns:
        A ``(f1, model)`` tuple: the validation F1 score and the fitted model.
    """
    resampler = SMOTEENN(random_state=42)
    X_balanced, y_balanced = resampler.fit_resample(X_train, y_train)

    booster = XGBClassifier(eval_metric='logloss', **params, random_state=42)
    booster.fit(X_balanced, y_balanced)

    # Validation data is left untouched: only the training split is resampled.
    return f1_score(y_val, booster.predict(X_val)), booster

# XGBoost search space: 4 * 5 * 2 * 2 = 80 combinations.
# Key order matters because combinations are enumerated in this order.
param_grid_xgb = dict(
    n_estimators=[100, 200, 500, 1000],
    max_depth=[4, 6, 10, 50, 100],
    learning_rate=[0.01, 0.1],
    subsample=[0.7, 1.0],
)

## 3) Run Tuning

In [8]:
def run_grid_manual(param_grid, train_func, model_name, use_scaler=False):
    """Exhaustively evaluate every parameter combination and keep the best one.

    Each combination from the Cartesian product of the grid values is passed to
    ``train_func`` as a dict. The score of every candidate is printed, followed
    by a summary of the best parameters and best score.

    Args:
        param_grid: Mapping of parameter name to the list of values to try.
            Combinations are enumerated in the mapping's key order.
        train_func: Callable taking a params dict. It must return
            ``(score, model)``, or ``(score, model, scaler)`` when
            ``use_scaler`` is True.
        model_name: Label used in the printed progress and summary lines.
        use_scaler: Whether ``train_func`` also returns a fitted scaler that
            must be kept alongside the best model.

    Returns:
        A ``(best_model, best_scaler)`` tuple. ``best_scaler`` is None when
        ``use_scaler`` is False. Both are None only if the grid produced no
        combinations at all (some parameter has an empty value list).
    """
    param_names = list(param_grid.keys())

    # Start below any attainable score so the first candidate is always kept.
    # With a 0 baseline and a strict ">" comparison, a grid whose candidates
    # all score 0.0 would otherwise return no model at all.
    best_score = float("-inf")
    best_model = None
    best_scaler = None
    best_params = None

    for values in itertools.product(*param_grid.values()):
        params = dict(zip(param_names, values))
        if use_scaler:
            score, model, scaler = train_func(params)
        else:
            score, model = train_func(params)
        print(f"{model_name} Params: {params} -> F1: {score:.4f}")

        # Strict comparison: on ties the earliest combination is retained.
        if score > best_score:
            best_score = score
            best_model = model
            best_params = params
            if use_scaler:
                best_scaler = scaler

    print(f"\nBest {model_name} Params: {best_params}")
    print(f"Best F1 on Validation: {best_score:.4f}")
    return best_model, (best_scaler if use_scaler else None)

**Random Forest**

In [9]:
# Grid-search the random forest; the scaler slot of the result is unused here.
best_rf, _ = run_grid_manual(
    param_grid=param_grid_rf,
    train_func=train_rf_with_params,
    model_name="Random Forest",
)

Random Forest Params: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1} -> F1: 0.6961
Random Forest Params: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 10} -> F1: 0.6851
Random Forest Params: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1} -> F1: 0.6908
Random Forest Params: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 10} -> F1: 0.6851
Random Forest Params: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1} -> F1: 0.6910
Random Forest Params: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 10} -> F1: 0.6851
Random Forest Params: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1} -> F1: 0.7175
Random Forest Params: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 10} -> F1: 0.6953
Random Forest Para

**SVM**

In [10]:
# Grid-search the SVM and keep the scaler fitted alongside the winning model.
best_svm, best_scaler_svm = run_grid_manual(
    param_grid=param_grid_svm,
    train_func=train_svm_with_params,
    model_name="SVM",
    use_scaler=True,
)

SVM Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'} -> F1: 0.6218
SVM Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'} -> F1: 0.6218
SVM Params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'} -> F1: 0.6812
SVM Params: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'} -> F1: 0.6812
SVM Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'} -> F1: 0.6763
SVM Params: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'} -> F1: 0.6763

Best SVM Params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best F1 on Validation: 0.6812


**XGBoost**

In [13]:
# Grid-search XGBoost; the scaler slot of the result is unused here.
best_xgb, _ = run_grid_manual(
    param_grid=param_grid_xgb,
    train_func=train_xgb_with_params,
    model_name="XGBoost",
)

XGBoost Params: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.01, 'subsample': 0.7} -> F1: 0.6723
XGBoost Params: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.01, 'subsample': 1.0} -> F1: 0.6672
XGBoost Params: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'subsample': 0.7} -> F1: 0.7146
XGBoost Params: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'subsample': 1.0} -> F1: 0.7020
XGBoost Params: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7} -> F1: 0.6874
XGBoost Params: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 1.0} -> F1: 0.6709
XGBoost Params: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.7} -> F1: 0.7259
XGBoost Params: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1, 'subsample': 1.0} -> F1: 0.7067
XGBoost Params: {'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.01, 'subsample': 0.7} -> F1: 0.7020
XGBoost Params: {'n_es

## 4) Final evaluation on the test set

In [14]:
def evaluate_model(model, X_test, y_test, name, scaler=None):
    """Print a classification report and ROC AUC for a fitted model.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Features to evaluate on.
        y_test: True labels matching ``X_test``.
        name: Label printed in the report header.
        scaler: Optional fitted transformer. When provided, ``X_test`` is passed
            through ``scaler.transform`` before prediction.

    Returns:
        None. The results are printed.
    """
    # Compare against None explicitly: relying on truthiness would silently skip
    # a scaler object that happens to evaluate as falsy (e.g. one defining __len__).
    X_eval = scaler.transform(X_test) if scaler is not None else X_test
    y_pred = model.predict(X_eval)
    # ROC AUC is computed from the second predict_proba column (index 1).
    y_proba = model.predict_proba(X_eval)[:, 1]

    print(f"\nClassification Report: {name}")
    print(classification_report(y_test, y_pred))
    print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

**Random Forest**

In [15]:
# Report held-out test metrics for the best random forest (no scaling needed).
evaluate_model(model=best_rf, X_test=X_test, y_test=y_test, name="Random Forest")


Classification Report: Random Forest
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      1222
           1       0.68      0.70      0.69       626

    accuracy                           0.79      1848
   macro avg       0.76      0.76      0.76      1848
weighted avg       0.79      0.79      0.79      1848

ROC AUC: 0.8501


**SVM**

In [16]:
# Report held-out test metrics for the best SVM, reusing the scaler fitted with it.
evaluate_model(
    model=best_svm,
    X_test=X_test,
    y_test=y_test,
    name="SVM",
    scaler=best_scaler_svm,
)


Classification Report: SVM
              precision    recall  f1-score   support

           0       0.85      0.66      0.74      1222
           1       0.54      0.78      0.64       626

    accuracy                           0.70      1848
   macro avg       0.69      0.72      0.69      1848
weighted avg       0.75      0.70      0.71      1848

ROC AUC: 0.7828


**XGBoost**

In [17]:
# Report held-out test metrics for the best XGBoost model (no scaling needed).
evaluate_model(model=best_xgb, X_test=X_test, y_test=y_test, name="XGBoost")


Classification Report: XGBoost
              precision    recall  f1-score   support

           0       0.86      0.80      0.83      1222
           1       0.66      0.75      0.71       626

    accuracy                           0.79      1848
   macro avg       0.76      0.78      0.77      1848
weighted avg       0.80      0.79      0.79      1848

ROC AUC: 0.8534


## 5) Save the best models

In [18]:
# Persist the selected estimators to the working directory. The SVM is stored as a
# (model, scaler) tuple so its fitted scaler travels with it.
joblib.dump(value=best_rf, filename="best_rf.joblib")
joblib.dump(value=(best_svm, best_scaler_svm), filename="best_svm_with_scaler.joblib")
joblib.dump(value=best_xgb, filename="best_xgb.joblib")

['best_xgb.joblib']