In [1]:
import pandas as pd
import numpy as np
from helpers.helper_functions import load_data, encode_string_value, get_prepared_data

In [2]:
# Load the train/test frames from 'data', split the label column off the
# training frame, and pass the remaining feature columns through the
# project's preparation helper.
train, test = load_data('data')
y = train.target
X = get_prepared_data(train.drop('target', axis=1))

In [None]:
# Seed Python's built-in `random` module so anything drawing from it is
# repeatable. This does not touch NumPy's RNG or any library-level
# `random_state` argument.
import random

random.seed(a=42)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb

import optuna
from optuna import Trial, visualization

In [None]:
# IPython shell escape: run `nvidia-smi` and show its output in the notebook.
# The tuning cell below requests tree_method="gpu_hist", so this is presumably
# a sanity check that a GPU is visible before training — confirm.
!nvidia-smi

In [None]:
def _take_rows(data, positions):
    """Return the rows of ``data`` located at the integer ``positions``.

    Objects exposing ``.iloc`` (pandas DataFrame / Series) are sliced
    positionally through it, so the result does not depend on the index
    labels. Anything else (e.g. a NumPy array) is indexed directly.
    """
    positional = getattr(data, "iloc", None)
    if positional is None:
        return data[positions]
    return positional[positions]


def Objective(trial):
    """Optuna objective: mean 3-fold cross-validated ROC AUC of an XGBoost model.

    One hyper-parameter set is drawn from ``trial``; a fresh
    ``xgb.XGBClassifier`` is then fitted on every fold of the module-level
    ``X`` / ``y`` and scored on the held-out part.

    Parameters
    ----------
    trial : optuna.Trial
        Must provide ``suggest_loguniform`` and ``suggest_int``.

    Returns
    -------
    float
        Average ROC AUC over the three validation folds (higher is better).
    """
    # Glossary of the XGBClassifier arguments relevant to this search:
    # n_estimators (int) – Number of gradient boosted trees.
    # max_depth (int) – Maximum tree depth for base learners.
    # learning_rate (float) – Boosting learning rate.
    # booster (string) – Specify which booster to use: gbtree, gblinear or dart.
    # tree_method (string) – Specify which tree method to use.
    # gamma (float) – Minimum loss reduction required to make a further partition on a leaf node of the tree.
    # min_child_weight (float) – Minimum sum of instance weight(hessian) needed in a child.
    # max_delta_step (float) – Maximum delta step we allow each tree's weight estimation to be.
    # subsample (float) – Subsample ratio of the training instance.
    # colsample_bytree (float) – Subsample ratio of columns when constructing each tree.
    # colsample_bylevel (float) – Subsample ratio of columns for each level.
    # colsample_bynode (float) – Subsample ratio of columns for each split.
    # reg_alpha (float) – L1 regularization term on weights
    # reg_lambda (float) – L2 regularization term on weights

    # NOTE(review): suggest_loguniform is deprecated in Optuna 3.x in favour of
    # suggest_float(..., log=True); kept as-is for compatibility with the
    # installed version.
    param = {
        "tree_method": "gpu_hist",  # use gpu
        "objective": "binary:hinge",  # Hinge loss may give better accuracy and some sparsity but much less sensitivity in terms of probabilities
        "lambda": trial.suggest_loguniform("lambda", 1e-3, 10.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-3, 10.0),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
        "n_estimators": 8000,
        "n_jobs": -1,
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "random_state": 42,
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
    }

    kf = KFold(n_splits=3, random_state=42, shuffle=True)
    fold_scores = []
    for train_index, test_index in kf.split(X):
        # KFold yields integer *positions*, so rows must be selected
        # positionally. Plain `X[idx]` on a DataFrame selects columns, and
        # `y[idx]` on a Series is label-based.
        X_train, X_valid = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train, y_valid = _take_rows(y, train_index), _take_rows(y, test_index)

        clf = xgb.XGBClassifier(**param, use_label_encoder=False)
        # NOTE(review): no early stopping is configured, so eval_set only
        # drives per-round evaluation logging and all 8000 trees are built.
        clf.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
        )

        # With binary:hinge, predict() returns hard 0/1 labels, which reduces
        # the ROC curve to a single operating point. Rank by the raw margin
        # instead so the AUC reflects the model's ordering of the samples.
        scores = clf.predict(X_valid, output_margin=True)
        fold_scores.append(roc_auc_score(y_valid, scores))

    return np.mean(fold_scores)