In [1]:
import optuna
import pandas as pd
import cudf
import cupy as cp

from cuml.ensemble import RandomForestRegressor as cuRFC
from cuml.metrics import roc_auc_score as cuml_roc_auc
from cuml.decomposition import PCA as cuPCA

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

In [12]:
# Load data
df = pd.read_parquet('../data/processed/selected_features_df.parquet')
X = df.drop('target', axis=1)
y = df['target']

Unnamed: 0,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Secondary / secondary special,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Sales staff,...,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,FLAG_DOCUMENT_5,FLAG_DOCUMENT_8
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,209.0,0.262949,0.139376,-1134.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,74.0,0.622246,0.628502,-828.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,115.0,0.555912,0.729567,-815.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,208.0,0.650442,0.737596,-617.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,73.0,0.322738,0.518093,-1106.0,0.0,0.0,1.0


In [91]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

We will apply SMOTE algorithm to balance the dataset.

In [89]:
sm = SMOTE(k_neighbors=4, random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
X_resampled = cp.asarray(X_resampled)
y_resampled = cp.asarray(y_resampled)

# Random Forest Classificator

Performing HyperParameter Tuning with Optuna

In [92]:
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 30, 60),
        'min_samples_split': trial.suggest_int('min_samples_split', 20, 60),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 30, 50),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    model = cuRFC(**param)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    roc_auc = cuml_roc_auc(y_test, y_pred)
    return roc_auc


In [93]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=4)

[I 2024-11-16 15:51:11,860] A new study created in memory with name: no-name-f01239af-52b9-4d70-bfc9-e4d264ef8764
  ret = func(*args, **kwargs)
[I 2024-11-16 15:51:20,332] Trial 1 finished with value: 0.518510103225708 and parameters: {'n_estimators': 120, 'max_depth': 38, 'min_samples_split': 40, 'min_samples_leaf': 35, 'bootstrap': True}. Best is trial 1 with value: 0.518510103225708.
[I 2024-11-16 15:51:23,594] Trial 2 finished with value: 0.5208380222320557 and parameters: {'n_estimators': 128, 'max_depth': 59, 'min_samples_split': 32, 'min_samples_leaf': 42, 'bootstrap': False}. Best is trial 2 with value: 0.5208380222320557.
[I 2024-11-16 15:51:25,534] Trial 3 finished with value: 0.5208641290664673 and parameters: {'n_estimators': 152, 'max_depth': 43, 'min_samples_split': 22, 'min_samples_leaf': 32, 'bootstrap': True}. Best is trial 3 with value: 0.5208641290664673.
[I 2024-11-16 15:51:25,546] Trial 0 finished with value: 0.5209730863571167 and parameters: {'n_estimators': 127,

In [73]:
print(study.best_params)

{'n_estimators': 61, 'max_depth': 44, 'min_samples_split': 33, 'min_samples_leaf': 49, 'bootstrap': True}
