In [1]:
from __future__ import print_function

from datetime import datetime
import numpy as np
import pandas as pd
import optuna
import lightgbm as lgb
from path import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [2]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [3]:
class Config:
    input_path = Path('/kaggle/input/porto-seguro-safe-driver-prediction')
    optuna_lgb = True
    n_estimators = 1500
    early_stopping_round = 150
    cv_folds = 5
    random_state = 0
    params = {
        'objective' : 'binary',
        'boosting_type' : 'gbdt',
        'learning_rate' : 0.01 ,
        'max_bin' : 25 ,
        'num_leaves' : 31,
        'min_child_samples' : 1500,
        'colsample_bytree' : 0.7,
        'subsample_freq' : 1,
        'subsample' : 0.7,
        'reg_alpha' : 1.0,
        'reg_lambda' : 1.0,
        'verbosity' : 0,
        'random_state' : 0
    }
config = Config()

In [4]:
train = pd.read_csv(config.input_path / 'train.csv',index_col='id')
test = pd.read_csv(config.input_path / 'test.csv',index_col='id')
submission = pd.read_csv(config.input_path / 'sample_submission.csv',index_col='id')

calc_features = [col for col in train.columns if '_calc' in col]
cat_features = [col for col in train.columns if '_cat' in col]

In [5]:
target = train['target']
train = train.drop(columns = 'target',axis='columns')

In [6]:
train = train.drop(columns = calc_features , axis = 'columns')
test = test.drop(columns = calc_features , axis = 'columns')

In [7]:
train = pd.get_dummies(train, columns=cat_features)
test = pd.get_dummies(test, columns=cat_features)

assert((train.columns==test.columns).all())

In [8]:
from numba import jit

@jit
def eval_gini(y_true,y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    
    for i in range(n-1 , -1 , -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_lgb(y_true,y_pred):
    eval_name = 'normalized_gini_coef'
    eval_result = eval_gini(y_true,y_pred)
    is_higher_better = True
    return eval_name, eval_result , is_higher_better
                           

In [9]:
config.params

{'objective': 'binary',
 'boosting_type': 'gbdt',
 'learning_rate': 0.01,
 'max_bin': 25,
 'num_leaves': 31,
 'min_child_samples': 1500,
 'colsample_bytree': 0.7,
 'subsample_freq': 1,
 'subsample': 0.7,
 'reg_alpha': 1.0,
 'reg_lambda': 1.0,
 'verbosity': 0,
 'random_state': 0}

In [10]:
if config.optuna_lgb:
    
    def objective(trial):
        params = {
            'learning_rate' : trial.suggest_float("learning_rate",0.01 , 1) ,
            'num_leaves' : trial.suggest_int("num_leaves",3,255),
            'min_child_samples' : trial.suggest_int("min_child_samples",3,3000),
            'colsample_bytree' : trial.suggest_float("colsample_bytree",0.1,1),
            'subsample_freq' : trial.suggest_int("subsample_freq",0,10),
            'subsample' : trial.suggest_float("subsample",0.1,1),
            'reg_alpha' : trial.suggest_loguniform("reg_alpha",1e-9,10.0),
            'reg_lambda' : trial.suggest_loguniform("reg_lambda",1e-9,10.0)
        }
        score = list()
        skf = StratifiedKFold(n_splits=config.cv_folds,shuffle=True,random_state=config.random_state)
        
        for train_idx , valid_idx in skf.split(train,target):
            X_train = train.iloc[train_idx]
            y_train = target.iloc[train_idx]
            X_valid = train.iloc[valid_idx]
            y_valid = target.iloc[valid_idx]
            
            model = lgb.LGBMClassifier(**params , n_estimators=1500 , early_stopping_round=150 , force_row_wise=True)
            
            callbacks = [lgb.early_stopping(stopping_rounds=150 , verbose=False)]
            
            model.fit(X_train,y_train, eval_set=[(X_valid,y_valid)] , eval_metric=gini_lgb , callbacks=callbacks)
            
            score.append(model.best_score_['valid_0']['normalized_gini_coef'])
        return np.mean(score)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective,n_trials=300)
    print('Best Normalized Score:',study.best_value)
    print('Best Params:',study.best_params)
    
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbosity': 0,
        'random_state': 0
    }
    
    params.update(study.best_params)
    
else:
    params = config.params

[32m[I 2023-03-04 12:01:37,612][0m A new study created in memory with name: no-name-8e3c5ff9-1e22-4598-8e2d-3e356fecefb6[0m
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == "":
[32m[I 2023-03-04 12:03:08,066][0m Trial 0 finished with value: -0.0005706231819715413 and parameters: {'learning_rate': 0.6150706204467805, 'num_leaves': 112, 'min_child_samples': 989, 'colsample_bytree': 0.38612092204647486, 'subsample_freq': 4, 'subsample': 0.6840317979177926, 'reg_alpha': 4.409308050298407e-07, 'reg_lambda': 5.164897153198387e-07}. Best is trial 0 with value: -0.0005706231819715413.[0m
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == "":
[32m[I 2023-03-04 12:05:03,160][0m Trial 1 finished with value: -0.0005706231819715413 and parameters: {'learning_rate': 0.08087411762189872, 'num_leaves': 186, 'min_child_samples': 2646, 'colsample_bytree': 0.9599222068210076, 'subsample_freq': 2, 'subsample': 0.931127084933912, 'reg_alpha': 

Best Normalized Score: -0.0005706231819715413
Best Params: {'learning_rate': 0.6150706204467805, 'num_leaves': 112, 'min_child_samples': 989, 'colsample_bytree': 0.38612092204647486, 'subsample_freq': 4, 'subsample': 0.6840317979177926, 'reg_alpha': 4.409308050298407e-07, 'reg_lambda': 5.164897153198387e-07}


In [11]:
params

{'objective': 'binary',
 'boosting_type': 'gbdt',
 'verbosity': 0,
 'random_state': 0,
 'learning_rate': 0.6150706204467805,
 'num_leaves': 112,
 'min_child_samples': 989,
 'colsample_bytree': 0.38612092204647486,
 'subsample_freq': 4,
 'subsample': 0.6840317979177926,
 'reg_alpha': 4.409308050298407e-07,
 'reg_lambda': 5.164897153198387e-07}