In [1]:
import numpy as np
# Load the preprocessed arrays stored under data/, one (train, test) pair at a
# time: scaled features, labels, then unscaled features (per the file names).
X_train_scaled, X_test_scaled = (
    np.load("data/X_train_scaled.npy"),
    np.load("data/X_test_scaled.npy"),
)
y_train, y_test = (
    np.load("data/y_train.npy"),
    np.load("data/y_test.npy"),
)
X_train, X_test = (
    np.load("data/X_train_unscaled.npy"),
    np.load("data/X_test_unscaled.npy"),
)

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

# Draw a single stratified subsample of the unscaled training arrays; the
# hyper-parameter searches further down are fitted on (X_sub, y_sub).
# The original note describes 200,000 rows as "~30% of training data"
# (not verifiable from this cell alone).
sss = StratifiedShuffleSplit(n_splits=1, train_size=200_000, random_state=42)

# With n_splits=1 the splitter yields exactly one (selected, remainder) index
# pair, so it can be taken directly; the remainder indices are not used.
train_idx, _ = next(sss.split(X_train, y_train))
X_sub, y_sub = X_train[train_idx], y_train[train_idx]

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Randomized hyper-parameter search for a class-balanced random forest,
# fitted on the stratified subsample (X_sub, y_sub).

# Candidate values the search samples from.
param_dist = {
    "n_estimators": [100, 150],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

# Base estimator; class_weight="balanced" lets the forest reweight classes
# from the labels it is fitted on.
rf = RandomForestClassifier(n_jobs=-1, random_state=42, class_weight="balanced")

# 5 sampled candidates x 3 CV folds = 15 fits, ranked by ROC-AUC.
# NOTE(review): both the forest and the search request n_jobs=-1; this may
# oversubscribe CPU cores -- confirm on the target machine.
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=1,
).fit(X_sub, y_sub)  # fit() returns the search object itself

# NOTE(review): the recorded run reports a CV ROC-AUC of ~0.99999; a score
# this close to 1.0 is worth checking for target leakage before relying on it.
print("Best RF parameters:", rf_search.best_params_)
print("Best RF CV ROC-AUC:", rf_search.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best RF parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
Best RF CV ROC-AUC: 0.9999915723480649


In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search for an XGBoost classifier, fitted on the
# stratified subsample (X_sub, y_sub).

# Negative-to-positive label ratio, passed to XGBoost as scale_pos_weight.
# Unpacking into two names requires np.bincount to return exactly two counts.
# NOTE(review): the ratio is taken from the full y_train while the search is
# fitted on y_sub; the subsample is stratified, so the two should be
# near-identical -- confirm this is intended.
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos

# Candidate values the search samples from (subsample / colsample_bytree are
# fixed at a single value each).
param_dist = {
    "n_estimators": [200, 300],
    "max_depth": [4, 6],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
}

# Base estimator.
xgb = XGBClassifier(
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    objective="binary:logistic",
)

# 4 sampled candidates x 3 CV folds = 12 fits, ranked by ROC-AUC.
# NOTE(review): both the booster and the search request n_jobs=-1; this may
# cause thread contention -- confirm on the target machine.
xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=4,
    cv=3,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=1,
).fit(X_sub, y_sub)  # fit() returns the search object itself

# NOTE(review): the recorded run reports a CV ROC-AUC of ~1.0; verify the
# features do not leak the target before relying on this score.
print("Best XGB parameters:", xgb_search.best_params_)
print("Best XGB CV ROC-AUC:", xgb_search.best_score_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best XGB parameters: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Best XGB CV ROC-AUC: 0.9999999936645642
