In [57]:
import re
import numpy as np
import pandas as pd
import feather

from numba import jit
import time
import gc
from scipy.stats import randint as sp_randint
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.cross_validation import StratifiedKFold

from sklearn.utils import shuffle

import pickle

import lightgbm as lgb
from lightgbm import LGBMClassifier
from datetime import datetime

In [47]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]

def gini_lgbm(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return [('gini', gini_score, True)]

def gini_normalized(a, p):
    return eval_gini(a, p) / eval_gini(a, a)

gini_scorer = make_scorer(gini_normalized, greater_is_better = True)

In [15]:
df = pd.read_csv('../cache/train_labels.csv')
target = df['y']
y = df['y'].values

In [4]:
trn_df = feather.read_dataframe('../cache/trn_df.feather')
sub_df = feather.read_dataframe('../cache/sub_df.feather')

In [5]:
trn_df.shape

(595212, 227)

In [6]:
sub_df.shape

(892816, 432)

In [7]:
sub_df = sub_df[trn_df.columns]

In [8]:
sub_df.shape

(892816, 227)

In [9]:
np.bincount(target)

array([573518,  21694])

In [10]:
595212 / (2 * np.bincount(target))

array([  0.51891309,  13.71835531])

In [12]:
wts = [0.947635485 if x==1 else 0.03644752 for x in target]

In [21]:
params = {'num_leaves':[150,200,300], 'max_depth':[4,7,10],
          'learning_rate':[0.05,0.03, 0.01, 0.08],'max_bin':[100,200,400], 'n_estimators': [100,200,400]}


lgbm = LGBMClassifier(objective='binary:logistic', sample_weight=wts)



clf = RandomizedSearchCV(lgbm, params, n_jobs=4, 
                   cv=StratifiedKFold(y, 5, True),
                   scoring = gini_scorer,
                   verbose=3, refit=True)


In [22]:
clf.fit(trn_df, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] num_leaves=150, n_estimators=200, learning_rate=0.01, max_bin=400, max_depth=10 
[CV] num_leaves=150, n_estimators=200, learning_rate=0.01, max_bin=400, max_depth=10 
[CV] num_leaves=150, n_estimators=200, learning_rate=0.01, max_bin=400, max_depth=10 
[CV] num_leaves=150, n_estimators=200, learning_rate=0.01, max_bin=400, max_depth=10 
[CV]  num_leaves=150, n_estimators=200, learning_rate=0.01, max_bin=400, max_depth=10, score=0.002134 -50.8min
[CV] num_leaves=150, n_estimators=200, learning_rate=0.01, max_bin=400, max_depth=10 
[CV]  num_leaves=150, n_estimators=200, learning_rate=0.01, max_bin=400, max_depth=10, score=-0.002848 -50.8min
[CV] num_leaves=150, n_estimators=400, learning_rate=0.05, max_bin=400, max_depth=10 
[CV]  num_leaves=150, n_estimators=200, learning_rate=0.01, max_bin=400, max_depth=10, score=-0.007631 -51.4min
[CV] num_leaves=150, n_estimators=400, learning_rate=0.05, max_bin=400, max_depth=10 
[C

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed: 314.4min


[CV]  num_leaves=300, n_estimators=400, learning_rate=0.01, max_bin=400, max_depth=4, score=-0.009653 -11.1min
[CV] num_leaves=300, n_estimators=400, learning_rate=0.01, max_bin=400, max_depth=4 
[CV]  num_leaves=300, n_estimators=400, learning_rate=0.01, max_bin=400, max_depth=4, score=0.000609 -11.0min
[CV] num_leaves=300, n_estimators=400, learning_rate=0.01, max_bin=400, max_depth=4 
[CV]  num_leaves=300, n_estimators=400, learning_rate=0.01, max_bin=400, max_depth=4, score=-0.004434 -10.9min
[CV] num_leaves=300, n_estimators=100, learning_rate=0.03, max_bin=100, max_depth=4 
[CV]  num_leaves=300, n_estimators=100, learning_rate=0.03, max_bin=100, max_depth=4, score=-0.009653 - 2.9min
[CV] num_leaves=300, n_estimators=100, learning_rate=0.03, max_bin=100, max_depth=4 
[CV]  num_leaves=300, n_estimators=100, learning_rate=0.03, max_bin=100, max_depth=4, score=0.000609 - 2.9min
[CV] num_leaves=300, n_estimators=100, learning_rate=0.03, max_bin=100, max_depth=4 
[CV]  num_leaves=300, 

[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed: 531.2min finished


RandomizedSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=5, shuffle=True, random_state=None),
          error_score='raise',
          estimator=LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0.0, n_estimators=10, n_jobs=-1,
        num_leaves=31, objective='binary:logistic', random_state=0,
        reg... 0.03644752],
        silent=True, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1),
          fit_params={}, iid=True, n_iter=10, n_jobs=4,
          param_distributions={'num_leaves': [150, 200, 300], 'n_estimators': [100, 200, 400], 'learning_rate': [0.05, 0.03, 0.01, 0.08], 'max_bin': [100, 200, 400], 'max_depth': [4, 7, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=make_scorer(gini_normalized), verbose=3)

In [23]:
print(clf.best_score_)
print(clf.best_params_)

0.009520515552805779
{'num_leaves': 150, 'n_estimators': 400, 'learning_rate': 0.05, 'max_bin': 400, 'max_depth': 10}


In [25]:
with open('../cache/idx_info.pkl', 'rb') as f:
    idx_info= pickle.load(f)

In [52]:
n_splits = 5
n_estimators = 200
#folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
lgbm_evals = np.zeros((n_estimators, n_splits))
oof_lgbm = np.empty(len(trn_df))
sub_preds = np.zeros(len(sub_df))
tgt = df['y']
sub_df = feather.read_dataframe('../cache/sub_df.feather')
sub_df = sub_df[trn_df.columns]
sub_df.fillna(-1, inplace=True)
print(sub_df.shape)
np.random.seed(0)

(892816, 227)


In [53]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'gini',
    'num_leaves': 150,
    'learning_rate': 0.05,
    'max_bin': 400,
    'max_depth': 10,
    'n_estimators': 400,
    'verbose': 1
}

In [54]:
for fold_ in range(len(idx_info)):
    l = idx_info[fold_]
    trn_idx = l[0]
    val_idx = l[1]
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], tgt.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], tgt.iloc[val_idx]
    
    pos = sum(trn_tgt==0)/sum(trn_tgt==1)
    wts = np.array([pos if x==1 else 1 for x in trn_tgt])
    
    eval_pos = sum(val_tgt==0)/sum(val_tgt==1)
    eval_wts = [eval_pos if x==1 else 1 for x in val_tgt]
    
    lgb_train = lgb.Dataset(trn_dat, trn_tgt,
                        weight=wts, free_raw_data=False)
    lgb_eval = lgb.Dataset(val_dat, val_tgt, reference=lgb_train,
                       weight=eval_wts, free_raw_data=False)

#     clf = LGBMClassifier(objective='binary:logistic', sample_weight=wts, num_leaves=150, n_estimators=400, 
#                          learning_rate=0.05, max_bin=400, max_depth=10)
#     clf.fit(trn_dat, trn_tgt, sample_weight=wts, 
#             eval_set=[(trn_dat, trn_tgt), (val_dat, val_tgt)],
#             #eval_sample_weight = eval_wts,
#             eval_metric=gini_xgb,
#             early_stopping_rounds=None,
#             verbose=True)
    
    clf = lgb.train(params,
                    lgb_train,
                    num_boost_round=200,
                    valid_sets=lgb_eval,
                    feval = gini_lgbm,
                    early_stopping_rounds=20)
    
    # Predict OOF and submission probas with the best round
    oof_lgbm[val_idx] = clf.predict(val_dat, num_iteration=clf.best_iteration)
    # Update submission
    
    sub_preds += clf.predict(sub_df, num_iteration=clf.best_iteration) / n_splits
    
    # Display results
    print("Fold %2d : %.6f @%4d "
          % (fold_ + 1,
             eval_gini(val_tgt, oof_lgbm[val_idx]),
             n_estimators
             ))

[1]	valid_0's gini: 0.211688
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's gini: 0.230021
[3]	valid_0's gini: 0.24114
[4]	valid_0's gini: 0.247334
[5]	valid_0's gini: 0.255675
[6]	valid_0's gini: 0.265633
[7]	valid_0's gini: 0.265896
[8]	valid_0's gini: 0.270041
[9]	valid_0's gini: 0.277607
[10]	valid_0's gini: 0.278591
[11]	valid_0's gini: 0.281882
[12]	valid_0's gini: 0.282922
[13]	valid_0's gini: 0.285759
[14]	valid_0's gini: 0.288482
[15]	valid_0's gini: 0.291393
[16]	valid_0's gini: 0.293939
[17]	valid_0's gini: 0.297432
[18]	valid_0's gini: 0.301979
[19]	valid_0's gini: 0.30393
[20]	valid_0's gini: 0.306387
[21]	valid_0's gini: 0.307441
[22]	valid_0's gini: 0.309646
[23]	valid_0's gini: 0.311447
[24]	valid_0's gini: 0.315373
[25]	valid_0's gini: 0.31714
[26]	valid_0's gini: 0.318582
[27]	valid_0's gini: 0.318926
[28]	valid_0's gini: 0.319927
[29]	valid_0's gini: 0.322254
[30]	valid_0's gini: 0.325918
[31]	valid_0's gini: 0.326838
[32]	valid_0's gini:

[113]	valid_0's gini: 0.390767
[114]	valid_0's gini: 0.390929
[115]	valid_0's gini: 0.39069
[116]	valid_0's gini: 0.390417
[117]	valid_0's gini: 0.390189
[118]	valid_0's gini: 0.390115
[119]	valid_0's gini: 0.390361
[120]	valid_0's gini: 0.390644
[121]	valid_0's gini: 0.390295
[122]	valid_0's gini: 0.39033
[123]	valid_0's gini: 0.390014
[124]	valid_0's gini: 0.38987
[125]	valid_0's gini: 0.39017
[126]	valid_0's gini: 0.390638
[127]	valid_0's gini: 0.390284
[128]	valid_0's gini: 0.390647
[129]	valid_0's gini: 0.391319
[130]	valid_0's gini: 0.391403
[131]	valid_0's gini: 0.391126
[132]	valid_0's gini: 0.391556
[133]	valid_0's gini: 0.391663
[134]	valid_0's gini: 0.391857
[135]	valid_0's gini: 0.391719
[136]	valid_0's gini: 0.391897
[137]	valid_0's gini: 0.393837
[138]	valid_0's gini: 0.394336
[139]	valid_0's gini: 0.393529
[140]	valid_0's gini: 0.393282
[141]	valid_0's gini: 0.392739
[142]	valid_0's gini: 0.392248
[143]	valid_0's gini: 0.392624
[144]	valid_0's gini: 0.392434
[145]	valid_

[60]	valid_0's gini: 0.377112
[61]	valid_0's gini: 0.377225
[62]	valid_0's gini: 0.377401
[63]	valid_0's gini: 0.377204
[64]	valid_0's gini: 0.378034
[65]	valid_0's gini: 0.378526
[66]	valid_0's gini: 0.379038
[67]	valid_0's gini: 0.379868
[68]	valid_0's gini: 0.380133
[69]	valid_0's gini: 0.380391
[70]	valid_0's gini: 0.380566
[71]	valid_0's gini: 0.380492
[72]	valid_0's gini: 0.381269
[73]	valid_0's gini: 0.381375
[74]	valid_0's gini: 0.38168
[75]	valid_0's gini: 0.382439
[76]	valid_0's gini: 0.382306
[77]	valid_0's gini: 0.383682
[78]	valid_0's gini: 0.383992
[79]	valid_0's gini: 0.384187
[80]	valid_0's gini: 0.38451
[81]	valid_0's gini: 0.384942
[82]	valid_0's gini: 0.3849
[83]	valid_0's gini: 0.385059
[84]	valid_0's gini: 0.38576
[85]	valid_0's gini: 0.385472
[86]	valid_0's gini: 0.385906
[87]	valid_0's gini: 0.385988
[88]	valid_0's gini: 0.386675
[89]	valid_0's gini: 0.387824
[90]	valid_0's gini: 0.388693
[91]	valid_0's gini: 0.389017
[92]	valid_0's gini: 0.388846
[93]	valid_0's 

[153]	valid_0's gini: 0.393361
[154]	valid_0's gini: 0.394741
[155]	valid_0's gini: 0.395174
[156]	valid_0's gini: 0.395572
[157]	valid_0's gini: 0.395775
[158]	valid_0's gini: 0.395733
[159]	valid_0's gini: 0.395524
[160]	valid_0's gini: 0.396211
[161]	valid_0's gini: 0.396222
[162]	valid_0's gini: 0.396403
[163]	valid_0's gini: 0.396184
[164]	valid_0's gini: 0.396178
[165]	valid_0's gini: 0.396389
[166]	valid_0's gini: 0.396414
[167]	valid_0's gini: 0.396915
[168]	valid_0's gini: 0.397136
[169]	valid_0's gini: 0.396524
[170]	valid_0's gini: 0.396591
[171]	valid_0's gini: 0.396083
[172]	valid_0's gini: 0.395928
[173]	valid_0's gini: 0.395868
[174]	valid_0's gini: 0.396106
[175]	valid_0's gini: 0.395972
[176]	valid_0's gini: 0.396275
[177]	valid_0's gini: 0.395772
[178]	valid_0's gini: 0.395604
[179]	valid_0's gini: 0.395409
[180]	valid_0's gini: 0.395739
[181]	valid_0's gini: 0.39581
[182]	valid_0's gini: 0.396072
[183]	valid_0's gini: 0.397198
[184]	valid_0's gini: 0.398061
[185]	val

In [58]:
np.save('../cache/oof_lgbm', oof_lgbm)
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../cache/model.lgbm.{}'.format(now)
with open(fn, 'wb') as f:
    pickle.dump(clf, f)

In [None]:
sub_preds = np.clip(sub_preds, a_min=0.05, a_max=0.95)

In [61]:
df = pd.read_csv('../cache/test_id.csv')

In [62]:

sub_df["target"] = sub_preds
sub_df['id'] = df['id'].values
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.lgbm.{}'.format(now)
sub_df[['id', "target"]].to_csv(fn, index=False, float_format="%.9f")