# LightGBM

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import lightgbm

from sklearn.externals import joblib
from sklearn.model_selection import StratifiedKFold
from IPython.display import display

In [2]:
# Load the training frame persisted at models/train.joblib.
train = joblib.load('models/train.joblib')

# Set the row ids and the binary label aside, then keep only the remaining
# columns as model inputs.
train_ids, targets = train['SK_ID_CURR'], train['TARGET']
train = train.drop(['SK_ID_CURR', 'TARGET'], axis='columns')

In [3]:
# Load the test frame persisted at models/test.joblib.
test = joblib.load('models/test.joblib')

# The ids are needed later for the submission file but are not a feature.
test_ids = test['SK_ID_CURR']
test = test.drop(['SK_ID_CURR'], axis='columns')

In [4]:
# Materialise both feature frames as NumPy arrays so the cross-validation
# loop can slice rows by position.
features, test_features = np.array(train), np.array(test)

In [7]:
# Stratified 5-fold cross-validation. One LightGBM classifier is fitted per
# fold with early stopping on that fold's validation AUC; the positive-class
# probabilities predicted for the test set are averaged over the folds.
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# `split` yields positional row indices. `targets` is a pandas Series, and
# `targets[indices]` looks integers up as index *labels*, which only matches
# row positions when the Series carries a default 0..n-1 index. Indexing a
# plain array instead is positional regardless of the frame's index.
labels = np.asarray(targets)

valid_scores = []   # best validation AUC of each fold
train_scores = []   # training AUC at the same iteration, for each fold
test_predictions = np.zeros(test_features.shape[0])

for train_indices, valid_indices in k_fold.split(features, labels):
    # Training data for the fold
    train_features, train_labels = features[train_indices], labels[train_indices]
    # Validation data for the fold
    valid_features, valid_labels = features[valid_indices], labels[valid_indices]

    # The 'balanced' mode uses the values of y to automatically adjust weights
    # inversely proportional to class frequencies in the input data, as
    # n_samples / (n_classes * np.bincount(y)).
    # NOTE(review): LightGBM only applies row subsampling when subsample_freq
    # is > 0; with this call `subsample=0.8` may have no effect depending on
    # the installed version's default -- confirm before relying on it.
    model = lightgbm.LGBMClassifier(n_estimators=5000, objective='binary',
                                    class_weight='balanced', learning_rate=0.01,
                                    reg_alpha=0.1, reg_lambda=0.1,
                                    subsample=0.8, n_jobs=6, random_state=4242)

    # Both the validation and the training fold are evaluated so that both
    # AUC values are logged every 100 rounds and available in `best_score_`.
    model.fit(train_features, train_labels, eval_metric='auc',
              eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
              eval_names=['valid', 'train'], early_stopping_rounds=100, verbose=100)

    best_iteration = model.best_iteration_

    # Each fold contributes 1/n_splits of the final averaged test probability.
    fold_probabilities = model.predict_proba(test_features, num_iteration=best_iteration)[:, 1]
    test_predictions += fold_probabilities / k_fold.n_splits

    valid_scores.append(model.best_score_['valid']['auc'])
    train_scores.append(model.best_score_['train']['auc'])


Training until validation scores don't improve for 100 rounds.
[100]	valid's auc: 0.75245	train's auc: 0.776391
[200]	valid's auc: 0.756557	train's auc: 0.79738
[300]	valid's auc: 0.757089	train's auc: 0.813947
[400]	valid's auc: 0.757094	train's auc: 0.828808
Early stopping, best iteration is:
[340]	valid's auc: 0.757206	train's auc: 0.820312
Training until validation scores don't improve for 100 rounds.
[100]	valid's auc: 0.756575	train's auc: 0.775447
[200]	valid's auc: 0.760815	train's auc: 0.796226
[300]	valid's auc: 0.761325	train's auc: 0.813195
[400]	valid's auc: 0.761182	train's auc: 0.82842
Early stopping, best iteration is:
[376]	valid's auc: 0.761362	train's auc: 0.824852
Training until validation scores don't improve for 100 rounds.
[100]	valid's auc: 0.755574	train's auc: 0.775729
[200]	valid's auc: 0.758837	train's auc: 0.796677
[300]	valid's auc: 0.759166	train's auc: 0.813371
Early stopping, best iteration is:
[267]	valid's auc: 0.759293	train's auc: 0.808259
Training 

In [9]:
# Pair every test id with its fold-averaged predicted probability and write
# the result as a two-column CSV without the DataFrame index.
submission_columns = {'SK_ID_CURR': test_ids, 'TARGET': test_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submissions/lightgbm.csv', index=False)