In [2]:
#Load the split files 

import pandas as pd

X_train = pd.read_csv("data/X_train.csv")
X_val   = pd.read_csv("data/X_val.csv")
X_test  = pd.read_csv("data/X_test.csv")

y_train = pd.read_csv("data/y_train.csv")
y_val   = pd.read_csv("data/y_val.csv")
y_test  = pd.read_csv("data/y_test.csv")


In [3]:
#Recreate Preprocessor

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_cols = X_train.select_dtypes(include='number').columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])


In [4]:
#Random Forest Hyperparameter Tuning

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
rf = RandomForestClassifier(random_state=42)


In [5]:
# Define Hyperparameter Search Space

param_grid = {
    'model__n_estimators': [100, 200, 300, 400, 500],
    'model__max_depth': [5, 10, 15, 20, None],
    'model__min_samples_split': [2, 5, 10, 20],
    'model__min_samples_leaf': [1, 2, 4, 6],
    'model__max_features': ['auto', 'sqrt', 'log2']
}


In [6]:
# Build Pipeline for Tuning

rf_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", rf)
])


In [7]:
#Run RandomizedSearchCV (Important Step)

random_search = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=param_grid,
    n_iter=20,          # tries 20 combinations
    scoring='roc_auc',  # optimize AUC
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)


In [8]:
#Fit the Model (This will take ~2–4 minutes)

random_search.fit(X_train, y_train.values.ravel())


Fitting 3 folds for each of 20 candidates, totalling 60 fits


21 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aakri\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aakri\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\aakri\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File 

In [9]:
#Evaluate the Tuned Random Forest

from sklearn.metrics import classification_report, roc_auc_score

best_rf = random_search.best_estimator_

y_pred_rf = best_rf.predict(X_val)
y_proba_rf = best_rf.predict_proba(X_val)[:,1]

print("AUC (Tuned RF):", roc_auc_score(y_val, y_proba_rf))
print(classification_report(y_val, y_pred_rf))


AUC (Tuned RF): 0.9761336964152254
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      2341
           1       0.91      0.80      0.85       659

    accuracy                           0.94      3000
   macro avg       0.93      0.89      0.91      3000
weighted avg       0.94      0.94      0.94      3000



In [10]:
#Install XGBoost

!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/72.0 MB 1.8 MB/s eta 0:00:39
    --------------------------------------- 1.0/72.0 MB 1.8 MB/s eta 0:00:40
   - -------------------------------------- 1.8/72.0 MB 2.1 MB/s eta 0:00:33
   - -------------------------------------- 2.6/72.0 MB 2.5 MB/s eta 0:00:28
   - -------------------------------------- 3.1/72.0 MB 2.5 MB/s eta 0:00:28
   - -------------------------------------- 3.4/72.0 MB 2.4 MB/s eta 0:00:29
   -- ------------------------------------- 3.9/72.0 MB 2.5 MB/s eta 0:00:28
   -- ------------------------------------- 4.5/72.0 MB 2.4 MB/s eta 0:00:29
   -- ------------------------------------- 5.0/72.0 MB 2.4 MB/s eta 0:00:28
   --- -------------

In [11]:
from xgboost import XGBClassifier

In [13]:
#Create XGBoost Pipeline

xgb = Pipeline([
    ("preprocess", preprocessor),
    ("model", XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='logloss',
        random_state=42
    ))
])


In [14]:
#Train XGBoost

xgb.fit(X_train, y_train.values.ravel())


In [15]:
# Evaluate XGBoost

y_pred_xgb = xgb.predict(X_val)
y_proba_xgb = xgb.predict_proba(X_val)[:,1]

print("AUC (XGBoost):", roc_auc_score(y_val, y_proba_xgb))
print(classification_report(y_val, y_pred_xgb))


AUC (XGBoost): 0.9816045566302094
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      2341
           1       0.89      0.84      0.86       659

    accuracy                           0.94      3000
   macro avg       0.92      0.90      0.91      3000
weighted avg       0.94      0.94      0.94      3000



In [None]:

| Model    | AUC   | Recall | Precision |
| -------- | ---   | ------ | --------- |
| Tuned RF | 0.977 | 0.94   | 0.94      |
| XGBoost  | 0.982 | 0.94   | 0.94      |

In [17]:
import joblib
joblib.dump(xgb, "models/XGBOOST.pkl")


['models/XGBOOST.pkl']

In [18]:
joblib.dump(best_rf, "models/Tuned RF_model.pkl")


['models/Tuned RF_model.pkl']

In [19]:
joblib.dump(xgb, "models/final_day5_bestmodel.pkl")

['models/final_day5_bestmodel.pkl']