In [1]:
from __future__ import print_function

from datetime import datetime
import numpy as np
import pandas as pd
import optuna
import lightgbm as lgb
from path import Path
from sklearn.model_selection import StratifiedKFold

In [2]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [3]:
class Config:
    input_path = Path('/kaggle/input/porto-seguro-safe-driver-prediction')
    optuna_lgb = True
    n_estimators = 1500
    early_stopping_round = 150
    cv_folds = 5
    random_state = 0
    params = {
        'objective' : 'binary',
        'boosting_type' : 'gbdt',
        'learning_rate' : 0.01 ,
        'max_bin' : 25 ,
        'num_leaves' : 31,
        'min_child_samples' : 1500,
        'colsample_bytree' : 0.7,
        'subsample_freq' : 1,
        'subsample' : 0.7,
        'reg_alpha' : 1.0,
        'reg_lambda' : 1.0,
        'verbosity' : 0,
        'random_state' : 0
    }
config = Config()

In [4]:
train = pd.read_csv(config.input_path / 'train.csv',index_col='id')
test = pd.read_csv(config.input_path / 'test.csv',index_col='id')
submission = pd.read_csv(config.input_path / 'sample_submission.csv',index_col='id')

calc_features = [col for col in train.columns if '_calc' in col]
cat_features = [col for col in train.columns if '_cat' in col]

In [5]:
target = train['target']
train = train.drop(columns = 'target',axis='columns')

In [6]:
train = train.drop(columns = calc_features , axis = 'columns')
test = test.drop(columns = calc_features , axis = 'columns')

In [7]:
train = pd.get_dummies(train, columns=cat_features)
test = pd.get_dummies(test, columns=cat_features)

assert((train.columns==test.columns).all())

In [8]:
from numba import jit

@jit
def eval_gini(y_true,y_pred):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_pred)]
    
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    
    for i in range(n-1 , -1 , -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_lgb(y_true,y_pred):
    eval_name = 'normalized_gini_coef'
    eval_result = eval_gini(y_true,y_pred)
    is_higher_better = True
    return eval_name, eval_result , is_higher_better
                           

In [9]:
start_time = timer()

In [10]:
preds = np.zeros(len(test))
oof = np.zeros(len(train))
metric_evaluations = list()
skf = StratifiedKFold(n_splits=config.cv_folds , shuffle=True,random_state=config.random_state)

for idx , (train_idx , valid_idx) in enumerate(skf.split(train,target)):
    print(f"CV fold : {idx}")
    X_train = train.iloc[train_idx]
    y_train = target.iloc[train_idx]
    X_valid = train.iloc[valid_idx]
    y_valid = target.iloc[valid_idx]

    model = lgb.LGBMClassifier(**config.params , n_estimators=1500 , early_stopping_round=150 , force_row_wise=True)

    callbacks = [lgb.early_stopping(stopping_rounds=150 , verbose=False)]

    model.fit(X_train,y_train, eval_set=[(X_valid,y_valid)] , eval_metric=gini_lgb , callbacks=callbacks)

    metric_evaluations.append(model.best_score_['valid_0']['normalized_gini_coef'])
    
    preds += (model.predict_proba(test,num_iterations=model.best_iteration_)[:,1] / skf.n_splits)
    
    oof[valid_idx] = model.predict_proba(X_valid,num_iterations=model.best_iteration_)[:,1]

CV fold : 0
CV fold : 1
CV fold : 2
CV fold : 3
CV fold : 4


In [11]:
timer(start_time)


 Time taken: 0 hours 11 minutes and 7.62 seconds.


In [12]:
print(f"Evaluation Metric mean: {np.mean(metric_evaluations)}")
print(f"Evaluation Metric std: {np.std(metric_evaluations)}")

Evaluation Metric mean: 0.28856750056382924
Evaluation Metric std: 0.014968321544153982


In [13]:
submission['target'] = preds
submission.to_csv('lgb_submission.csv')

In [14]:
gini_lgb(target,oof)

Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1mDuring: typing of argument at /tmp/ipykernel_19/3895309277.py (5)[0m
[1m
File "../../tmp/ipykernel_19/3895309277.py", line 5:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mCannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>[0m
[1m
File "../../tmp/ipykernel_19/3895309277.py", line 13:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m[0m
  @jit
[1m
File "../../tmp/ipykernel_19/3895309277.py", line 5:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  state.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://n

('normalized_gini_coef', 0.2885009109990003, True)