In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score
import optuna
import xgboost as xgb
import sklearn
import numpy as np

In [2]:
# read the data from /input/train.csv
data = pd.read_csv("train.csv")

In [4]:
X = data.drop(['Class', 'id'], axis=1)
y = data['Class']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1)

In [38]:
# 1. Define an objective function to be maximized.
def objective(trial):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    param = {
        "verbosity": 0,
        'objective':'multi:softprob', 'num_class':2,
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1e-6, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1e-6, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 2, 6, step=1)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 5, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    logloss = log_loss(y_val, preds)
    return logloss


if __name__ == "__main__":
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=1000)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


[32m[I 2023-03-11 21:17:34,982][0m A new study created in memory with name: no-name-66e399bb-7cdf-4e19-b2a6-e6bcb25d0472[0m
[32m[I 2023-03-11 21:17:35,553][0m Trial 0 finished with value: 0.6931441732475648 and parameters: {'booster': 'gbtree', 'lambda': 4.489671815361404e-07, 'alpha': 1.818346379881522e-07, 'subsample': 0.2176882204641376, 'colsample_bytree': 0.7120211735869371, 'max_depth': 6, 'min_child_weight': 5, 'eta': 3.19172292519731e-07, 'gamma': 0.3767020247391007, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.6931441732475648.[0m
[32m[I 2023-03-11 21:17:35,956][0m Trial 1 finished with value: 0.64402705541002 and parameters: {'booster': 'gbtree', 'lambda': 6.075595343143422e-07, 'alpha': 2.4248738520135487e-08, 'subsample': 0.9140578677770981, 'colsample_bytree': 0.9201856062325362, 'max_depth': 2, 'min_child_weight': 8, 'eta': 0.005374337300214719, 'gamma': 4.318089601044637e-07, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.64402705541002.

Number of finished trials:  1000
Best trial:
  Value: 0.03477412324307987
  Params: 
    booster: dart
    lambda: 2.176402650648911e-08
    alpha: 1.7539953753610712e-07
    subsample: 0.8640164616328125
    colsample_bytree: 0.9206467734362929
    max_depth: 4
    min_child_weight: 10
    eta: 0.7523171979478614
    gamma: 3.312943743451231e-05
    grow_policy: lossguide
    sample_type: weighted
    normalize_type: forest
    rate_drop: 0.0028945438396066592
    skip_drop: 0.008164725076388634


In [39]:
params = trial.params
params['objective']='multi:softprob'
params['num_class']=2

In [40]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)
model = xgb.train(trial.params, dtrain)

In [61]:
preds = old_model.predict(dtest)

In [62]:
preds

array([[9.9950659e-01, 4.9345964e-04],
       [9.9957150e-01, 4.2849797e-04],
       [9.9952328e-01, 4.7668372e-04],
       ...,
       [9.9942017e-01, 5.7989027e-04],
       [4.3601882e-02, 9.5639813e-01],
       [9.9936861e-01, 6.3143601e-04]], dtype=float32)

In [63]:
testloss = log_loss(y_test, preds)
testloss

0.027648258102026334

In [64]:
train_preds = model.predict(dtrain)
trainloss = log_loss(y_train, train_preds)
trainloss

0.027549924226041526

In [37]:
#first_params = params
# save params to file
#with open('first_params.txt', 'w') as f:
#    for key, value in first_params.items():
#        f.write('%s:%s' % (key, value))



In [57]:
first_params

{'booster': 'dart',
 'lambda': 0.0003214621386645728,
 'alpha': 9.14430241501814e-07,
 'subsample': 0.43134139726273457,
 'colsample_bytree': 0.41021804879201995,
 'max_depth': 5,
 'min_child_weight': 10,
 'eta': 0.9222209435510054,
 'gamma': 1.9823302681308873e-05,
 'grow_policy': 'lossguide',
 'sample_type': 'weighted',
 'normalize_type': 'forest',
 'rate_drop': 1.0621384789194997e-07,
 'skip_drop': 3.368458597381678e-07,
 'objective': 'multi:softprob',
 'num_class': 2}

In [44]:
# read the real test data from test.csv
test_data = pd.read_csv("test.csv")

In [45]:
X_testtest = test_data.drop(['id'], axis=1)
ID_test = test_data.id

In [46]:
dtesttest = xgb.DMatrix(X_testtest)

In [65]:
preds = old_model.predict(dtesttest)

In [66]:
preds=  preds[:,1]

In [67]:
preds

array([3.5134782e-04, 1.2821819e-03, 3.0713019e-04, ..., 2.8572624e-04,
       8.9753732e-02, 9.8864436e-01], dtype=float32)

In [68]:
# Save test predictions to file
submission = pd.DataFrame({'id': ID_test,
                       'Class': preds})
submission.to_csv('submission_optuna_xgb.csv', index=False)

In [54]:
# define dmatrix with entire data
dfull = xgb.DMatrix(X, label=y)

In [58]:
old_model = xgb.train(first_params, dfull)