In [1]:
#Santander Customer Transaction Prediction - LGBM

#In this Kaggle competition, the objective is to identify which customers will make a specific
#transaction in the future.

#Link to the competition: https://www.kaggle.com/c/santander-customer-transaction-prediction/
#Type of Problem: Classification
#Metric for evaluation: AUC (Area Under the ROC Curve)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
import lightgbm
from sklearn.metrics import roc_auc_score

import matplotlib.pylab as plt

In [2]:
#Step1: Read Training Data from CSV

#Load train.csv into a pandas DataFrame and preview its first rows.

train_csv = 'E:/datafiles/santander-customer-transaction-prediction/train.csv'
df_train = pd.read_csv(train_csv)
df_train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [3]:
#Split the frame into independent variables (every column except the ID and the label)
#and the dependent variable (the 'target' column).
var_columns = [col for col in df_train.columns if col not in {'ID_code', 'target'}]

X = df_train[var_columns]
y = df_train['target']

#Hold out 20% of the rows as validation data; random_state is fixed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#Show the shape of each of the four pieces as a sanity check.
tuple(part.shape for part in (X_train, X_valid, y_train, y_valid))

((160000, 200), (40000, 200), (160000,), (40000,))

In [4]:
#Step2: Create a simple Light GBM Model and evaluate performance

#lightgbm.train consumes LightGBM's own Dataset objects, so wrap the training and
#validation splits (features plus label) in one Dataset each.

train_data = lightgbm.Dataset(data=X_train, label=y_train)
valid_data = lightgbm.Dataset(data=X_valid, label=y_valid)

In [5]:
#Hyper-parameters passed to lightgbm.train (see the LightGBM "Parameters" documentation).

parameters = {
    'objective': 'binary',        # binary classification
    'metric': 'auc',              # metric reported on the validation set
    'is_unbalance': 'true',       # ask LightGBM to compensate for the unbalanced label
    'boosting': 'gbdt',           # gradient boosted decision trees
    'num_leaves': 63,             # maximum number of leaves per tree
    'feature_fraction': 0.5,      # fraction of the features sampled for each tree
    'bagging_fraction': 0.5,      # fraction of the rows sampled when bagging
    'bagging_freq': 20,           # perform bagging every 20 iterations
    'learning_rate': 0.01,        # shrinkage rate
    'verbose': -1,                # keep LightGBM's own logging quiet
}

In [6]:
#Train the LightGBM model for at most 5000 boosting rounds, stopping early once the
#validation AUC has not improved for 50 consecutive rounds.
#Early stopping is requested through the early_stopping callback: the early_stopping_rounds
#keyword of lightgbm.train was deprecated in LightGBM 3.3 and removed in 4.0, so passing it
#raises a TypeError on current releases. The callback form works on old and new versions.

model_lgbm = lightgbm.train(parameters,
                            train_data,
                            num_boost_round=5000,
                            valid_sets=[valid_data],
                            callbacks=[lightgbm.early_stopping(stopping_rounds=50)])

[1]	valid_0's auc: 0.66387
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.691844
[3]	valid_0's auc: 0.714632
[4]	valid_0's auc: 0.720863
[5]	valid_0's auc: 0.730256
[6]	valid_0's auc: 0.744463
[7]	valid_0's auc: 0.750754
[8]	valid_0's auc: 0.761351
[9]	valid_0's auc: 0.766762
[10]	valid_0's auc: 0.770928
[11]	valid_0's auc: 0.775191
[12]	valid_0's auc: 0.777102
[13]	valid_0's auc: 0.780203
[14]	valid_0's auc: 0.783726
[15]	valid_0's auc: 0.783711
[16]	valid_0's auc: 0.784068
[17]	valid_0's auc: 0.78349
[18]	valid_0's auc: 0.783665
[19]	valid_0's auc: 0.7836
[20]	valid_0's auc: 0.783855
[21]	valid_0's auc: 0.785852
[22]	valid_0's auc: 0.786739
[23]	valid_0's auc: 0.787076
[24]	valid_0's auc: 0.78861
[25]	valid_0's auc: 0.7896
[26]	valid_0's auc: 0.791877
[27]	valid_0's auc: 0.793067
[28]	valid_0's auc: 0.793554
[29]	valid_0's auc: 0.79517
[30]	valid_0's auc: 0.795741
[31]	valid_0's auc: 0.797234
[32]	valid_0's auc: 0.79732
[33]	valid_0's auc: 0.798988


[276]	valid_0's auc: 0.855156
[277]	valid_0's auc: 0.85517
[278]	valid_0's auc: 0.855214
[279]	valid_0's auc: 0.855236
[280]	valid_0's auc: 0.855227
[281]	valid_0's auc: 0.8553
[282]	valid_0's auc: 0.855396
[283]	valid_0's auc: 0.855467
[284]	valid_0's auc: 0.855583
[285]	valid_0's auc: 0.855613
[286]	valid_0's auc: 0.855651
[287]	valid_0's auc: 0.855753
[288]	valid_0's auc: 0.8558
[289]	valid_0's auc: 0.85584
[290]	valid_0's auc: 0.855981
[291]	valid_0's auc: 0.856025
[292]	valid_0's auc: 0.856109
[293]	valid_0's auc: 0.856182
[294]	valid_0's auc: 0.856309
[295]	valid_0's auc: 0.856382
[296]	valid_0's auc: 0.856438
[297]	valid_0's auc: 0.856559
[298]	valid_0's auc: 0.856657
[299]	valid_0's auc: 0.85679
[300]	valid_0's auc: 0.856804
[301]	valid_0's auc: 0.856887
[302]	valid_0's auc: 0.857056
[303]	valid_0's auc: 0.857185
[304]	valid_0's auc: 0.857238
[305]	valid_0's auc: 0.857372
[306]	valid_0's auc: 0.857483
[307]	valid_0's auc: 0.857639
[308]	valid_0's auc: 0.857805
[309]	valid_0's a

[551]	valid_0's auc: 0.874262
[552]	valid_0's auc: 0.874252
[553]	valid_0's auc: 0.874289
[554]	valid_0's auc: 0.874348
[555]	valid_0's auc: 0.874377
[556]	valid_0's auc: 0.874368
[557]	valid_0's auc: 0.874383
[558]	valid_0's auc: 0.874393
[559]	valid_0's auc: 0.874401
[560]	valid_0's auc: 0.874427
[561]	valid_0's auc: 0.874479
[562]	valid_0's auc: 0.874555
[563]	valid_0's auc: 0.87462
[564]	valid_0's auc: 0.874681
[565]	valid_0's auc: 0.874767
[566]	valid_0's auc: 0.874814
[567]	valid_0's auc: 0.874885
[568]	valid_0's auc: 0.874928
[569]	valid_0's auc: 0.874976
[570]	valid_0's auc: 0.875001
[571]	valid_0's auc: 0.875082
[572]	valid_0's auc: 0.875162
[573]	valid_0's auc: 0.875245
[574]	valid_0's auc: 0.875268
[575]	valid_0's auc: 0.875311
[576]	valid_0's auc: 0.875346
[577]	valid_0's auc: 0.875352
[578]	valid_0's auc: 0.875404
[579]	valid_0's auc: 0.875459
[580]	valid_0's auc: 0.875452
[581]	valid_0's auc: 0.875492
[582]	valid_0's auc: 0.875554
[583]	valid_0's auc: 0.875606
[584]	valid

[827]	valid_0's auc: 0.883658
[828]	valid_0's auc: 0.883681
[829]	valid_0's auc: 0.883717
[830]	valid_0's auc: 0.88374
[831]	valid_0's auc: 0.88375
[832]	valid_0's auc: 0.883767
[833]	valid_0's auc: 0.883798
[834]	valid_0's auc: 0.883843
[835]	valid_0's auc: 0.883902
[836]	valid_0's auc: 0.88391
[837]	valid_0's auc: 0.883934
[838]	valid_0's auc: 0.883939
[839]	valid_0's auc: 0.883953
[840]	valid_0's auc: 0.883999
[841]	valid_0's auc: 0.884048
[842]	valid_0's auc: 0.884077
[843]	valid_0's auc: 0.884081
[844]	valid_0's auc: 0.884105
[845]	valid_0's auc: 0.884117
[846]	valid_0's auc: 0.884107
[847]	valid_0's auc: 0.88412
[848]	valid_0's auc: 0.884134
[849]	valid_0's auc: 0.884145
[850]	valid_0's auc: 0.884184
[851]	valid_0's auc: 0.884201
[852]	valid_0's auc: 0.884199
[853]	valid_0's auc: 0.884214
[854]	valid_0's auc: 0.88425
[855]	valid_0's auc: 0.884261
[856]	valid_0's auc: 0.884296
[857]	valid_0's auc: 0.884336
[858]	valid_0's auc: 0.884351
[859]	valid_0's auc: 0.884349
[860]	valid_0's

[1098]	valid_0's auc: 0.88849
[1099]	valid_0's auc: 0.888505
[1100]	valid_0's auc: 0.888528
[1101]	valid_0's auc: 0.888547
[1102]	valid_0's auc: 0.888569
[1103]	valid_0's auc: 0.888598
[1104]	valid_0's auc: 0.888626
[1105]	valid_0's auc: 0.88865
[1106]	valid_0's auc: 0.88868
[1107]	valid_0's auc: 0.888691
[1108]	valid_0's auc: 0.888698
[1109]	valid_0's auc: 0.888706
[1110]	valid_0's auc: 0.888723
[1111]	valid_0's auc: 0.888726
[1112]	valid_0's auc: 0.888736
[1113]	valid_0's auc: 0.888761
[1114]	valid_0's auc: 0.88879
[1115]	valid_0's auc: 0.888798
[1116]	valid_0's auc: 0.888815
[1117]	valid_0's auc: 0.888823
[1118]	valid_0's auc: 0.888837
[1119]	valid_0's auc: 0.888855
[1120]	valid_0's auc: 0.88887
[1121]	valid_0's auc: 0.888888
[1122]	valid_0's auc: 0.88889
[1123]	valid_0's auc: 0.888911
[1124]	valid_0's auc: 0.888912
[1125]	valid_0's auc: 0.888929
[1126]	valid_0's auc: 0.888954
[1127]	valid_0's auc: 0.888977
[1128]	valid_0's auc: 0.888983
[1129]	valid_0's auc: 0.889002
[1130]	valid_0

[1365]	valid_0's auc: 0.891666
[1366]	valid_0's auc: 0.891663
[1367]	valid_0's auc: 0.891665
[1368]	valid_0's auc: 0.891672
[1369]	valid_0's auc: 0.891686
[1370]	valid_0's auc: 0.8917
[1371]	valid_0's auc: 0.891729
[1372]	valid_0's auc: 0.891752
[1373]	valid_0's auc: 0.891758
[1374]	valid_0's auc: 0.891761
[1375]	valid_0's auc: 0.891765
[1376]	valid_0's auc: 0.891776
[1377]	valid_0's auc: 0.891781
[1378]	valid_0's auc: 0.891812
[1379]	valid_0's auc: 0.89182
[1380]	valid_0's auc: 0.891828
[1381]	valid_0's auc: 0.891834
[1382]	valid_0's auc: 0.891823
[1383]	valid_0's auc: 0.891835
[1384]	valid_0's auc: 0.891845
[1385]	valid_0's auc: 0.891848
[1386]	valid_0's auc: 0.891865
[1387]	valid_0's auc: 0.891858
[1388]	valid_0's auc: 0.891877
[1389]	valid_0's auc: 0.891873
[1390]	valid_0's auc: 0.891873
[1391]	valid_0's auc: 0.891879
[1392]	valid_0's auc: 0.891889
[1393]	valid_0's auc: 0.891889
[1394]	valid_0's auc: 0.891899
[1395]	valid_0's auc: 0.891919
[1396]	valid_0's auc: 0.891924
[1397]	vali

[1632]	valid_0's auc: 0.893371
[1633]	valid_0's auc: 0.893386
[1634]	valid_0's auc: 0.893381
[1635]	valid_0's auc: 0.893383
[1636]	valid_0's auc: 0.893404
[1637]	valid_0's auc: 0.893383
[1638]	valid_0's auc: 0.893392
[1639]	valid_0's auc: 0.893421
[1640]	valid_0's auc: 0.893424
[1641]	valid_0's auc: 0.893426
[1642]	valid_0's auc: 0.893443
[1643]	valid_0's auc: 0.893454
[1644]	valid_0's auc: 0.893446
[1645]	valid_0's auc: 0.893457
[1646]	valid_0's auc: 0.893453
[1647]	valid_0's auc: 0.893442
[1648]	valid_0's auc: 0.893446
[1649]	valid_0's auc: 0.893429
[1650]	valid_0's auc: 0.893427
[1651]	valid_0's auc: 0.893421
[1652]	valid_0's auc: 0.89342
[1653]	valid_0's auc: 0.89342
[1654]	valid_0's auc: 0.89343
[1655]	valid_0's auc: 0.893429
[1656]	valid_0's auc: 0.89344
[1657]	valid_0's auc: 0.893458
[1658]	valid_0's auc: 0.893461
[1659]	valid_0's auc: 0.893459
[1660]	valid_0's auc: 0.893454
[1661]	valid_0's auc: 0.893463
[1662]	valid_0's auc: 0.893481
[1663]	valid_0's auc: 0.893495
[1664]	valid

[1898]	valid_0's auc: 0.894095
[1899]	valid_0's auc: 0.894086
[1900]	valid_0's auc: 0.894094
[1901]	valid_0's auc: 0.894107
[1902]	valid_0's auc: 0.89412
[1903]	valid_0's auc: 0.894109
[1904]	valid_0's auc: 0.894114
[1905]	valid_0's auc: 0.894098
[1906]	valid_0's auc: 0.894095
[1907]	valid_0's auc: 0.894107
[1908]	valid_0's auc: 0.894112
[1909]	valid_0's auc: 0.894133
[1910]	valid_0's auc: 0.894149
[1911]	valid_0's auc: 0.894138
[1912]	valid_0's auc: 0.894136
[1913]	valid_0's auc: 0.894133
[1914]	valid_0's auc: 0.894128
[1915]	valid_0's auc: 0.894131
[1916]	valid_0's auc: 0.894131
[1917]	valid_0's auc: 0.894128
[1918]	valid_0's auc: 0.89415
[1919]	valid_0's auc: 0.894169
[1920]	valid_0's auc: 0.894161
[1921]	valid_0's auc: 0.894171
[1922]	valid_0's auc: 0.894177
[1923]	valid_0's auc: 0.894181
[1924]	valid_0's auc: 0.89419
[1925]	valid_0's auc: 0.894185
[1926]	valid_0's auc: 0.894194
[1927]	valid_0's auc: 0.894194
[1928]	valid_0's auc: 0.894191
[1929]	valid_0's auc: 0.894191
[1930]	vali

[2163]	valid_0's auc: 0.89466
[2164]	valid_0's auc: 0.894663
[2165]	valid_0's auc: 0.894664
[2166]	valid_0's auc: 0.894663
[2167]	valid_0's auc: 0.894655
[2168]	valid_0's auc: 0.894669
[2169]	valid_0's auc: 0.894672
[2170]	valid_0's auc: 0.89467
[2171]	valid_0's auc: 0.894677
[2172]	valid_0's auc: 0.894677
[2173]	valid_0's auc: 0.894676
[2174]	valid_0's auc: 0.894688
[2175]	valid_0's auc: 0.894679
[2176]	valid_0's auc: 0.894684
[2177]	valid_0's auc: 0.894679
[2178]	valid_0's auc: 0.894685
[2179]	valid_0's auc: 0.894693
[2180]	valid_0's auc: 0.894701
[2181]	valid_0's auc: 0.894704
[2182]	valid_0's auc: 0.89471
[2183]	valid_0's auc: 0.894701
[2184]	valid_0's auc: 0.894702
[2185]	valid_0's auc: 0.894699
[2186]	valid_0's auc: 0.894711
[2187]	valid_0's auc: 0.894698
[2188]	valid_0's auc: 0.8947
[2189]	valid_0's auc: 0.894704
[2190]	valid_0's auc: 0.894692
[2191]	valid_0's auc: 0.894694
[2192]	valid_0's auc: 0.894697
[2193]	valid_0's auc: 0.894708
[2194]	valid_0's auc: 0.894705
[2195]	valid_

In [7]:
#Score both splits with the trained model and report the AUC on each.
y_train_pred = model_lgbm.predict(X_train)
y_valid_pred = model_lgbm.predict(X_valid)

auc_train = roc_auc_score(y_train, y_train_pred)
auc_valid = roc_auc_score(y_valid, y_valid_pred)
print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(auc_train, auc_valid))

AUC Train: 0.9883
AUC Valid: 0.8953


In [8]:
#Step3: Find predictions for test data

#Load the test set and the sample submission file, then show the shape of each.

test_csv = 'E:/datafiles/santander-customer-transaction-prediction/test.csv'
sample_submission_csv = 'E:/datafiles/santander-customer-transaction-prediction/sample_submission.csv'

df_test = pd.read_csv(test_csv)
df_sample_submission = pd.read_csv(sample_submission_csv)

tuple(frame.shape for frame in (df_test, df_sample_submission))

((200000, 201), (200000, 2))

In [9]:
#Predict on the same feature columns the model was trained on and store the scores
#in the submission frame's 'target' column.
#NOTE(review): the scores are assigned by position, which assumes sample_submission.csv
#lists the rows in the same order as test.csv - confirm.
X_test = df_test[var_columns]
test_scores = model_lgbm.predict(X_test)
df_sample_submission['target'] = test_scores
df_sample_submission

Unnamed: 0,ID_code,target
0,test_0,0.348873
1,test_1,0.493735
2,test_2,0.520182
3,test_3,0.433698
4,test_4,0.207041
...,...,...
199995,test_199995,0.181327
199996,test_199996,0.027922
199997,test_199997,0.011246
199998,test_199998,0.286368
