In [8]:
!pip install optuna



In [9]:
import pandas as pd
import numpy as np

import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [10]:
train = pd.read_csv('../data/train.csv', index_col=[0])
test = pd.read_csv('../data/test.csv')

In [11]:
train.shape

(11504798, 11)

In [12]:
test.shape

(7669866, 11)

In [13]:
def label_encode_gender(val):
    if val == 'Female':
        return 1
    elif val == 'Male':
        return 0
    return None

In [14]:
train['Gender'] = train['Gender'].apply(label_encode_gender)
test['Gender'] = test['Gender'].apply(label_encode_gender)

In [15]:
def label_encode_vehicle_age(val):
    if val == '< 1 Year':
        return 0
    elif val == '1-2 Year':
        return 1
    elif val == '> 2 Years':
        return 2
    return None

In [16]:
train['Vehicle_Age'] = train['Vehicle_Age'].apply(label_encode_vehicle_age)
test['Vehicle_Age'] = test['Vehicle_Age'].apply(label_encode_vehicle_age)

In [17]:
def label_encode_vehicle_damage(val):
    if val == 'Yes':
        return 1
    else:
        return 0
    return None

In [18]:
train['Vehicle_Damage'] = train['Vehicle_Damage'].apply(label_encode_vehicle_damage)
test['Vehicle_Damage'] = test['Vehicle_Damage'].apply(label_encode_vehicle_damage)

In [19]:
y = train['Response']
X = train.drop(columns=['Response'])

In [20]:
def objective(trial):
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'use_label_encoder': False,
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }

    dtrain = xgb.DMatrix(X, label=y)
    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=1000,
        nfold=5,
        stratified=True,
        early_stopping_rounds=10,
        metrics='auc',
        seed=42
    )

    mean_auc = cv_results['test-auc-mean'].max()
    return mean_auc


In [21]:
study_name = "xgb_study"
storage_name = f"sqlite:///optuna_study_{study_name}.db"
study = optuna.create_study(study_name=study_name, direction='maximize', storage=storage_name, load_if_exists=True)
optuna.logging.set_verbosity(optuna.logging.INFO)

[I 2024-07-31 19:01:24,969] A new study created in RDB with name: xgb_study


In [None]:
study.optimize(objective, n_trials=500)

  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),


In [None]:
print('Number of finished trials:', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))

print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

In [None]:
best_params = trial.params
best_model = xgb.XGBClassifier(**best_params, use_label_encoder=False, eval_metric='auc')
best_model.fit(X, y)

In [None]:
test_predictions = best_model.predict_proba(test.iloc[:, 1:])[:, 1]

submission = pd.DataFrame({'id': test['id'], 'Response': test_predictions})

submission.to_csv('submission.csv', index=False)