# 07 - LGBM + KMeans

#### Imports

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Configure seaborn's plotting defaults, selecting the "white" style.
sns.set(style="white")

#### Constants

In [2]:
# NOTE(review): this value is not referenced anywhere else in the visible part
# of the notebook — confirm whether it is still needed.
n_components = 1000

In [3]:
# Folder holding the joblib artifacts; the trailing slash is part of the value,
# so file names are appended to it directly.
models_folder = "models/"

# Artifact paths for the train matrix, the target and the test matrix.
train_data_fn = "{}train_data.pkl".format(models_folder)
target_fn = "{}target.pkl".format(models_folder)
test_data_fn = "{}test_data.pkl".format(models_folder)

# Artifact path for the weight multiplier.
weight_multiplier_fn = "{}weight_multiplier.pkl".format(models_folder)

#### Functions

In [4]:
import os.path
from sklearn.externals import joblib

def Load(filename):
    """Load a joblib-serialized object from ``filename``.

    Returns the deserialized object, or ``None`` when ``filename`` does not
    point to an existing regular file (no exception is raised in that case,
    so callers must be prepared to receive ``None``).
    """
    if not os.path.isfile(filename):
        # Missing artifact: report it as None instead of raising.
        return None
    return joblib.load(filename)
    
def Save(obj, filename):
    """Serialize ``obj`` to ``filename`` using ``joblib.dump``."""
    joblib.dump(obj, filename)

In [17]:
def _read_solution(path, column):
    """Read a header-less solution file into a DataFrame with columns ``[column]``."""
    frame = pd.read_csv(path,
                        delimiter='\t',
                        encoding='utf-8',
                        header=None)
    frame.columns = [column]
    return frame


# Previously written prediction files, one DataFrame per model.
lgbm = _read_solution('solution_lgbm.csv', 'lgbm')
catboost = _read_solution('solution_catboost.csv', 'catboost')
lgbm_kmeans = _read_solution('solution_lgbm_kmeans.csv', 'lgbm_kmeans')

In [28]:
# Place the two models' predictions side by side; DataFrame.join aligns them
# on the row index.
df = lgbm.join(lgbm_kmeans)

In [29]:
# Preview the first five rows of the combined frame.
df.head(5)

Unnamed: 0,lgbm,lgbm_kmeans
0,0.130328,0.175593
1,0.042108,0.054789
2,0.075908,0.071964
3,0.04137,0.051234
4,0.031718,0.029703


In [30]:
# Blend the models with an unweighted row-wise mean of their predictions.
predictions = df.mean(axis=1)

In [31]:
# Display the blended predictions.
predictions

0         0.152960
1         0.048449
2         0.073936
3         0.046302
4         0.030711
5         0.039109
6         0.065182
7         0.048276
8         0.066141
9         0.030397
10        0.026220
11        0.024621
12        0.042983
13        0.045932
14        0.038844
15        0.044818
16        0.044298
17        0.054569
18        0.080715
19        0.055531
20        0.019236
21        0.046827
22        0.054694
23        0.016473
24        0.047906
25        0.036086
26        0.014198
27        0.103437
28        0.015201
29        0.017398
            ...   
180994    0.020754
180995    0.081219
180996    0.040667
180997    0.060395
180998    0.046208
180999    0.085668
181000    0.035335
181001    0.032099
181002    0.031505
181003    0.061867
181004    0.056301
181005    0.103779
181006    0.045230
181007    0.050998
181008    0.028350
181009    0.040163
181010    0.033341
181011    0.033344
181012    0.051268
181013    0.047068
181014    0.011343
181015    0.

In [32]:
# Write one blended prediction per line, without a header row or index column.
# `header` and `index` are documented as booleans, so pass False explicitly
# instead of relying on None happening to be falsy.
predictions.to_csv("solution_dummy_stacking.csv", header=False, index=False)

# Loading data

In [None]:
# A bare `import scipy` does not guarantee that the `sparse` submodule is
# loaded (it only works if something else imported it first), so import the
# submodule explicitly. This still binds the name `scipy`.
import scipy.sparse

# Training feature matrix stored in SciPy's sparse .npz format.
data = scipy.sparse.load_npz("train_sparse_matrix_after_scale.npz")

# Pickled KMeans artifacts and the target (file names suggest 100 and 2
# clusters — confirm). Load returns None when a file is missing, in which
# case the slicing further down will fail.
kmeans100 = Load(models_folder+'kmeans_n100.pkl')
kmeans2 = Load(models_folder+'kmeans_n2.pkl')
target = Load(target_fn)

In [None]:
from scipy import sparse

# The first 427994 entries of each KMeans array are taken as the training
# part; the remaining entries are sliced off for the test set later in the
# notebook.
train_rows = slice(None, 427994)
traink100 = sparse.csr_matrix(kmeans100[train_rows])
traink2 = sparse.csr_matrix(kmeans2[train_rows])

In [None]:
from scipy.sparse import hstack

# Append the two KMeans features as extra columns of the training matrix.
# Each is transposed first (presumably from a 1 x n row into an n x 1 column —
# confirm against the shape of the KMeans arrays).
# NOTE: `data` is rebuilt from itself, so re-running this cell appends the
# columns a second time.
kmeans_train_columns = [traink100.T, traink2.T]
data = hstack([data] + kmeans_train_columns).tocsr()

In [None]:
# NOTE(review): loaded but not used anywhere in the visible notebook — confirm
# whether it should feed into training (e.g. as a class weight) or be dropped.
weight_multiplier = Load(weight_multiplier_fn)

## Splitting dataset

In [None]:
from sklearn.model_selection import train_test_split

# Split rows into a training part (train_size=0.8) and a validation part.
# NOTE(review): no random_state is passed, so the split differs between runs.
labels = target.ravel()
X_train, X_validation, Y_train, Y_validation = train_test_split(
    data,
    labels,
    train_size=0.8,
)

In [None]:
# NOTE: this alias rebinds the name `lgbm`, which earlier in the notebook holds
# the DataFrame read from solution_lgbm.csv. Re-running `df = lgbm.join(...)`
# after this cell would hit the module instead of the DataFrame.
import lightgbm as lgbm
# NOTE(review): `re` is not used in the visible part of the notebook.
import re

In [None]:
# LightGBM training parameters, passed unchanged to lgbm.train below:
# binary objective, evaluated with AUC.
params = {
    'subsample_freq': 2,
    'subsample_for_bin': 100,
    'subsample': 0.7,
    'scale_pos_weight': 1,
    'reg_lambda': 0.2,
    'reg_alpha': 7,
    'objective': 'binary',
    'num_leaves': 50,
    'min_split_gain': 2.0,
    'min_child_weight': 3,
    'min_child_samples': 100,
    'metric': 'auc',
    'max_depth': 20,
    'max_bin': 100,
    'learning_rate': 0.1,
    'colsample_bytree': 0.7,
}

In [None]:
# Wrap the train and validation splits (features + labels) in LightGBM Datasets.
d_train = lgbm.Dataset(X_train, label=Y_train)
d_valid = lgbm.Dataset(X_validation, label=Y_validation)

In [None]:
# Training budget and stopping rule.
num_boost_round = 3000
early_stopping_rounds = 200
feval = None  # no custom evaluation function

# Filled by lgbm.train with the per-iteration metric history of each
# evaluation set ('train' and 'valid'); read by the report cell below.
evals_results = {}

# NOTE(review): the evals_result / early_stopping_rounds / verbose_eval keyword
# arguments were replaced by callbacks in LightGBM 4.0 — confirm the installed
# version before upgrading.
model = lgbm.train(
    params,
    d_train,
    num_boost_round=num_boost_round,
    valid_sets=[d_train, d_valid],
    valid_names=['train', 'valid'],
    feval=feval,
    early_stopping_rounds=early_stopping_rounds,
    evals_result=evals_results,
    verbose_eval=10,  # log the metrics every 10 rounds
)

In [None]:
# Report the selected number of boosting rounds and the validation AUC there.
n_estimators = model.best_iteration
print("\nModel Report")
print("n_estimators : ", n_estimators)

# The -1 turns the iteration count into a list index. This assumes
# best_iteration is 1-based and non-zero — TODO confirm for the case where
# early stopping never triggers.
valid_auc_history = evals_results['valid']['auc']
print("AUC:", valid_auc_history[n_estimators - 1])

In [None]:
from sklearn.metrics import roc_auc_score

# Score the validation split with the trained model and cross-check the AUC
# using scikit-learn's implementation.
predicted = model.predict(X_validation)
validation_auc = roc_auc_score(Y_validation, predicted)
print("ROC AUC score:", validation_auc)

In [None]:
# Persist the trained LightGBM booster to disk.
# NOTE(review): '.cbm' is the extension conventionally used for CatBoost
# models; this file is written by LightGBM — confirm the name is intentional.
model.save_model('lgbm_kmeans_model.cbm')

# Test Data

In [None]:
# Test feature matrix in SciPy's sparse .npz format. Relies on `scipy` having
# been imported earlier in the notebook. The pickled alternative is disabled:
# data = Load(test_data_fn)
test_data = scipy.sparse.load_npz("test_sparse_matrix_after_scale.npz")

In [None]:
from scipy import sparse

# Everything after the first 427994 entries of each KMeans array is taken as
# the test part (the first 427994 entries were used for training above).
test_rows = slice(427994, None)
testk100 = sparse.csr_matrix(kmeans100[test_rows])
testk2 = sparse.csr_matrix(kmeans2[test_rows])

In [None]:
from scipy.sparse import hstack

# Append the two KMeans features as extra columns of the test matrix, in the
# same order as for the training matrix.
# NOTE: `test_data` is rebuilt from itself, so re-running this cell appends
# the columns a second time.
kmeans_test_columns = [testk100.T, testk2.T]
test_data = hstack([test_data] + kmeans_test_columns).tocsr()

In [None]:
# Predict on the test matrix with the trained model.
Y_test = model.predict(test_data)

## Saving test predictions

In [None]:
# Wrap the raw predictions in a DataFrame so they can be written with pandas.
# NOTE: this rebinds `predictions`, which earlier held the blended Series.
predictions = pd.DataFrame(Y_test)

# Write one prediction per line, without a header row or index column. This is
# the file the blending cell near the top of the notebook reads back in.
# `header` and `index` are documented as booleans, so pass False explicitly
# instead of relying on None happening to be falsy.
predictions.to_csv("solution_lgbm_kmeans.csv", header=False, index=False)