# 07 - LGBM + KMeans

#### Imports

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="white")

#### Constants

In [2]:
n_components = 1000

In [3]:
models_folder = "models/"
train_data_fn = models_folder+'train_data.pkl'
target_fn = models_folder+'target.pkl'
test_data_fn = models_folder+'test_data.pkl'

weight_multiplier_fn = models_folder+"weight_multiplier.pkl"

#### Functions

In [4]:
import os.path
from sklearn.externals import joblib

def Load(filename):
    if os.path.isfile(filename):
        return joblib.load(filename)
    
def Save(obj, filename):
    joblib.dump(obj, filename)

# Loading data

In [5]:
import scipy

data = scipy.sparse.load_npz("train_sparse_matrix_after_scale.npz")

kmeans100 = Load(models_folder+'kmeans_n100.pkl')
kmeans2 = Load(models_folder+'kmeans_n2.pkl')
target = Load(target_fn)

In [6]:
from scipy import sparse

traink100 = sparse.csr_matrix(kmeans100[:427994])
traink2 = sparse.csr_matrix(kmeans2[:427994])

In [8]:
from scipy.sparse import hstack

data = hstack([data,traink100.transpose(),traink2.transpose()]).tocsr()  

In [9]:
weight_multiplier = Load(weight_multiplier_fn)

## Splitting dataset

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_validation, Y_train, Y_validation = train_test_split(data, target.ravel(), train_size=0.8)



In [11]:
import lightgbm as lgbm
import re

In [12]:
params = {'subsample_freq': 2, 'subsample_for_bin': 100, 'subsample': 0.7, 'scale_pos_weight': 1, 'reg_lambda': 0.2, 'reg_alpha': 7, 'objective': 'binary', 'num_leaves': 50, 'min_split_gain': 2.0, 'min_child_weight': 3, 'min_child_samples': 100, 'metric': 'auc', 'max_depth': 20, 'max_bin': 100, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

In [16]:
d_train = lgbm.Dataset(X_train, label=Y_train)
d_valid = lgbm.Dataset(X_validation, label=Y_validation)

In [17]:
evals_results = {}
num_boost_round=3000
early_stopping_rounds=200
feval=None

model = lgbm.train(params, 
                     d_train, 
                     valid_sets=[d_train, d_valid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10, 
                     feval=feval)

Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.650597	valid's auc: 0.630546
[20]	train's auc: 0.668918	valid's auc: 0.640062
[30]	train's auc: 0.681768	valid's auc: 0.646047
[40]	train's auc: 0.691654	valid's auc: 0.650378
[50]	train's auc: 0.699754	valid's auc: 0.65194
[60]	train's auc: 0.70606	valid's auc: 0.653426
[70]	train's auc: 0.71143	valid's auc: 0.653383
[80]	train's auc: 0.716006	valid's auc: 0.653951
[90]	train's auc: 0.719446	valid's auc: 0.654458
[100]	train's auc: 0.722461	valid's auc: 0.654586
[110]	train's auc: 0.72497	valid's auc: 0.654439
[120]	train's auc: 0.727716	valid's auc: 0.654062
[130]	train's auc: 0.729996	valid's auc: 0.654119
[140]	train's auc: 0.731751	valid's auc: 0.654028
[150]	train's auc: 0.733327	valid's auc: 0.653447
[160]	train's auc: 0.734771	valid's auc: 0.653385
[170]	train's auc: 0.736306	valid's auc: 0.652954
[180]	train's auc: 0.737471	valid's auc: 0.652845
[190]	train's auc: 0.738473	valid's auc: 0.65298
[

In [18]:
n_estimators = model.best_iteration
print("\nModel Report")
print("n_estimators : ", n_estimators)
print("AUC"+":", evals_results['valid']['auc'][n_estimators-1])


Model Report
n_estimators :  103
AUC: 0.6547462118265377


In [21]:
from sklearn.metrics import roc_auc_score

predicted = model.predict(X_validation)
print("ROC AUC score:",roc_auc_score(Y_validation, predicted))

ROC AUC score: 0.6547462118265377


In [22]:
model.save_model('lgbm_kmeans_model.cbm')

# Test Data

In [28]:
# data = Load(test_data_fn)
test_data = scipy.sparse.load_npz("test_sparse_matrix_after_scale.npz")

In [29]:
from scipy import sparse

testk100 = sparse.csr_matrix(kmeans100[427994:])
testk2 = sparse.csr_matrix(kmeans2[427994:])

In [30]:
from scipy.sparse import hstack

test_data = hstack([test_data,testk100.transpose(),testk2.transpose()]).tocsr()  

In [31]:
Y_test = model.predict(test_data)

## Saving test predictions

In [32]:
predictions = pd.DataFrame(Y_test)
predictions.to_csv("solution_lgbm_kmeans.csv",header=None, index=None)