# Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [51]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#plt.rcParams['figure.figsize'] = [10, 5]
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import missingno as msno
import pandas_profiling

%matplotlib inline

!python --version

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import metrics

Python 3.6.5 :: Anaconda, Inc.


In [3]:
## Show all rows and columns instead of showing only some part and hiding other parts for large data.
def show_full_data(data, row_size=None, column_size=None):
    with pd.option_context('display.max_rows', row_size, 'display.max_columns', column_size):
        display(data)

# Import Data

In [4]:
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')
print(train.shape)
print(test.shape)

(307511, 714)
(48744, 713)


# Saving TARGET values and Column names

In [5]:
# Copying labels to not lose them and use later
train_labels = train.TARGET.copy()
train_labels.shape

(307511,)

In [6]:
# Saving IDs 
train_id = train.SK_ID_CURR.copy()
test_id = test.SK_ID_CURR.copy()

In [7]:
train.drop(columns='TARGET', inplace=True)
train.shape

(307511, 713)

In [8]:
# Extracting column names to not lose them
column_names = train.columns.copy()
column_names.shape

(713,)

## Imputing missing values

Even if we dropped some features before we create our dataset, there are some missing values in our dateset. Scikit-Learn algorithms generally does not work with missing values. If we want to use scaling algorithms, the algorithm probably gives error. Also, most algorithms (lightgbm works) does not work with NaN values. Therefore, if we want to use our dataset, we should find a solution for our NaN values.  
  
Imputer from Scikit performs different methods to fill NaN values. I use mean strategy to calculate mean of the values and fill NaN values with this algorithm.  I do this for train and test data because if we want to make predictions on test data, test data should not have NaN values.

### Train

In [9]:
from sklearn.preprocessing import Imputer

In [10]:
# Create our imputer to replace missing values with the mean e.g.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=1)
imp = imp.fit(train)
# Impute our data, then train
train_imp = imp.transform(train)

  "observed values: %s" % missing)


In [11]:
# Choosing dropped columns during imputing.
nanmask = np.isnan(imp.statistics_)

In [12]:
np.where(nanmask==True)

(array([380, 381]),)

In [13]:
# Names of dropped columns.
column_names.values[380:382]

array(['previous_app/credit_mean', 'previous_app/credit_max'],
      dtype=object)

These 2 columns are dropped because they cannot be filled.

In [14]:
column_names_imp = column_names.values
column_names_imp = np.delete(column_names_imp, [380, 381])

We also drop these columns from column names to equalize the size of columns and column names.

In [15]:
train_imp = pd.DataFrame(train_imp)
print(train_imp.shape)
train_imp.head()

(307511, 711)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,701,702,703,704,705,706,707,708,709,710
0,100002.0,0.0,0.0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0.0,0.0,0.0,0.0,0.0,-20.421053,-12.0,-31.0,24.25731,-388.0
1,100003.0,0.0,0.0,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0.0,0.0,0.0,0.0,0.0,-7.16,-1.0,-14.0,13.89,-179.0
2,100004.0,1.0,1.0,1.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0.0,0.0,0.0,0.0,0.0,-7.666667,-3.0,-11.0,17.333333,-23.0
3,100006.0,0.0,0.0,1.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,...,379.700182,24001.706113,-6797.926065,354638600.0,7133.394533,-11.130596,16.6321,-38.073301,787.185389,-353.020996
4,100007.0,0.0,0.0,1.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,...,-452.384318,0.0,-22655.655,8084830.0,-29857.365,-3.636364,12.0,-31.0,63.865734,-240.0


In [16]:
train_imp.columns = column_names_imp
train_imp.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,installments_AMT_PAYMENT-INSTALLMENT_mean,installments_AMT_PAYMENT-INSTALLMENT_max,installments_AMT_PAYMENT-INSTALLMENT_min,installments_AMT_PAYMENT-INSTALLMENT_var,installments_AMT_PAYMENT-INSTALLMENT_sum,installments_DAYS_PAYMENT-INSTALLMENT_mean,installments_DAYS_PAYMENT-INSTALLMENT_max,installments_DAYS_PAYMENT-INSTALLMENT_min,installments_DAYS_PAYMENT-INSTALLMENT_var,installments_DAYS_PAYMENT-INSTALLMENT_sum
0,100002.0,0.0,0.0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0.0,0.0,0.0,0.0,0.0,-20.421053,-12.0,-31.0,24.25731,-388.0
1,100003.0,0.0,0.0,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0.0,0.0,0.0,0.0,0.0,-7.16,-1.0,-14.0,13.89,-179.0
2,100004.0,1.0,1.0,1.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0.0,0.0,0.0,0.0,0.0,-7.666667,-3.0,-11.0,17.333333,-23.0
3,100006.0,0.0,0.0,1.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,...,379.700182,24001.706113,-6797.926065,354638600.0,7133.394533,-11.130596,16.6321,-38.073301,787.185389,-353.020996
4,100007.0,0.0,0.0,1.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,...,-452.384318,0.0,-22655.655,8084830.0,-29857.365,-3.636364,12.0,-31.0,63.865734,-240.0


In [17]:
train_imp.isnull().sum().sum()

0

We checked how many null values in our dataset. It is zero so we have completed our goal.

### Test

We are doing similar process for test data to fill all NaN values.

In [18]:
imp = imp.fit(test)
# Impute our data, then train
test_imp = imp.transform(test)
test_imp.shape

(48744, 713)

In [19]:
nanmask_test = np.isnan(imp.statistics_)
np.where(nanmask_test==True)

(array([], dtype=int64),)

In [20]:
test_imp = np.delete(test_imp, [380, 381], axis=1)
test_imp.shape

(48744, 711)

In [21]:
test_imp = pd.DataFrame(test_imp)
test_imp.columns = column_names_imp
test_imp.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,installments_AMT_PAYMENT-INSTALLMENT_mean,installments_AMT_PAYMENT-INSTALLMENT_max,installments_AMT_PAYMENT-INSTALLMENT_min,installments_AMT_PAYMENT-INSTALLMENT_var,installments_AMT_PAYMENT-INSTALLMENT_sum,installments_DAYS_PAYMENT-INSTALLMENT_mean,installments_DAYS_PAYMENT-INSTALLMENT_max,installments_DAYS_PAYMENT-INSTALLMENT_min,installments_DAYS_PAYMENT-INSTALLMENT_var,installments_DAYS_PAYMENT-INSTALLMENT_sum
0,100001.0,0.0,0.0,1.0,0.0,135000.0,568800.0,20560.5,450000.0,0.01885,...,0.0,0.0,0.0,0.0,0.0,-7.285714,11.0,-36.0,213.904762,-51.0
1,100005.0,0.0,0.0,1.0,0.0,99000.0,222768.0,17370.0,180000.0,0.035792,...,0.0,0.0,0.0,0.0,0.0,-23.555556,1.0,-37.0,182.527778,-212.0
2,100013.0,0.0,1.0,1.0,0.0,202500.0,663264.0,69777.0,630000.0,0.019101,...,-1157.662742,0.0,-23147.82,23467720.0,-179437.725,-5.180645,21.0,-38.0,127.200922,-803.0
3,100028.0,0.0,0.0,1.0,2.0,315000.0,1575000.0,49018.5,1575000.0,0.026392,...,-622.550708,0.0,-8505.0,2972476.0,-70348.23,-3.0,7.0,-19.0,26.678571,-339.0
4,100038.0,0.0,1.0,0.0,1.0,180000.0,625500.0,32067.0,625500.0,0.010032,...,571.951414,28566.898614,-7136.377516,380383500.0,15742.302464,-11.552455,12.531942,-38.759448,544.77243,-381.936021


In [22]:
test_imp.isnull().sum().sum()

0

In [23]:
train_imp = train_imp.drop(columns='SK_ID_CURR')
test_imp = test_imp.drop(columns='SK_ID_CURR')

# Sampling Data

- There are a lot of models for classification problems. Some models like deep neural networks have high performance but they need high performance computers. In this problem, because of I am using a laptop to make computations, I would like to try less computationally expensive model.
- Moreover, we have imbalanced dataset; one class has way less instances than other one. Risky instances (TARGET=1) are rare instances in our dataset. There are different approaches like undersampling and oversampling to overcome this problem but none of them increases performance too much. In this type of problem, if we can reach the data source, we should try to collect more data for rare class. But, in this work, we cannot reach the data source and we try to increase the size of rare instances. 

There are a lot of different ranges of data in our dataset. Some of them are too large and some of them are too small. Calculations with these types of high range of numbers could be hard for computers. Relating values with each other could be hard and finding similarities in high dimensions could not be reachable. Moreover, if we want to use neural networks, we should normalize our values to increase performance of neural networks. Therefore, before we move to feature selection and modeling parts, I would like to scale feature values in between 0 and 1.   
  
This scaling could be better with transformations. For instance, we have some skewed data which have a lot of values in a short range on one side of distribution. When we scale highly skewed data, it would not have distinguishing values. However, I do not transform my data to see the results. 
  
I oversampled to increase the size of rare data and undersampled to decrease the size of common data. I use SMOTE method which is highly effective for oversampling. For undersampling, I use Random Under Sampler.

In [24]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [25]:
print("Number transactions X_train dataset: ", train_imp.shape)
print("Number transactions y_train dataset: ", test_imp.shape)

print("Before OverSampling, counts of label '0': {}".format(sum(train_labels.values==0)))
print("Before OverSampling, counts of label '1': {} \n".format(sum(train_labels.values==1)))

Number transactions X_train dataset:  (307511, 710)
Number transactions y_train dataset:  (48744, 710)
Before OverSampling, counts of label '0': 282686
Before OverSampling, counts of label '1': 24825 



In [26]:
sm = SMOTE()
train_os, label_os = sm.fit_sample(train_imp, train_labels.values.ravel())

print('After OverSampling, the shape of train_X: {}'.format(train_os.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(label_os.shape))
print("After OverSampling, counts of label '0': {}".format(sum(label_os==0)))
print("After OverSampling, counts of label '1': {}".format(sum(label_os==1)))

After OverSampling, the shape of train_X: (565372, 710)
After OverSampling, the shape of train_y: (565372,) 

After OverSampling, counts of label '0': 282686
After OverSampling, counts of label '1': 282686


In [27]:
rus = RandomUnderSampler(return_indices=True)
train_rus, label_rus, idx_resampled = rus.fit_sample(train_imp, train_labels.values.ravel())

print('After Undersampling, the shape of train_X: {}'.format(train_rus.shape))
print('After Undersampling, the shape of train_y: {} \n'.format(label_rus.shape))
print("After Undersampling, counts of label '0': {}".format(sum(label_rus==0)))
print("After Undersampling, counts of label '1': {}".format(sum(label_rus==1)))

After Undersampling, the shape of train_X: (49650, 710)
After Undersampling, the shape of train_y: (49650,) 

After Undersampling, counts of label '0': 24825
After Undersampling, counts of label '1': 24825


# Feature Selection

We created a lot of new features during dataset creation. However, some of these features may not be useful for training machine learning model. They may decrease the performance or increase the chance of overfitting. Therefore, we can use feature selection techniques to eliminate unnecessary features. There are different types of approaches , some of them needs too much computing power and others may not.

In [28]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, chi2, f_classif

## For Oversampled Data

In [30]:
selector_os = SelectKBest(mutual_info_classif, k=250)
selector_os.fit(train_os, label_os)

SelectKBest(k=250,
      score_func=<function mutual_info_classif at 0x7f9a69119b70>)

In [31]:
train_os_sel = selector_os.transform(train_os)
train_os_sel.shape

(565372, 250)

In [32]:
selector_os.get_support(indices=True)

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  17,
        19,  20,  21,  22,  27,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  62,  64,  66,  90,
        91,  92,  93,  94,  95,  96,  97,  98, 105, 112, 117, 119, 161,
       218, 228, 235, 236, 237, 238, 239, 241, 246, 247, 249, 250, 251,
       252, 253, 255, 261, 262, 269, 278, 279, 280, 282, 283, 284, 295,
       296, 297, 298, 317, 324, 328, 329, 330, 333, 334, 335, 336, 337,
       339, 343, 344, 345, 347, 348, 349, 351, 352, 353, 354, 355, 356,
       357, 358, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371,
       373, 374, 375, 377, 378, 379, 382, 383, 385, 386, 387, 388, 389,
       390, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
       405, 406, 455, 456, 457, 458, 459, 460, 461, 464, 465, 470, 471,
       472, 483, 488, 489, 498, 499, 500, 501, 503, 504, 511, 51

In [33]:
selected_features_os = selector_os.get_support(indices=True)
selected_features_os = pd.DataFrame(selected_features_os)
selected_features_os.head()

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5


In [34]:
selected_features_os.to_csv('selected_features_os_250.csv', index=False, header=False)

In [35]:
test_os_sel = selector_os.transform(test_imp)
test_os_sel.shape

(48744, 250)

## For Undersampled Data

In [36]:
selector_rus = SelectKBest(mutual_info_classif, k=250)
selector_rus.fit(train_rus, label_rus)
train_rus_sel = selector_rus.transform(train_rus)
train_rus_sel.shape

(49650, 250)

In [37]:
selector_rus.get_support(indices=True)

array([  0,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  20,  21,
        22,  26,  27,  29,  30,  31,  32,  34,  37,  38,  39,  41,  43,
        44,  46,  48,  52,  55,  56,  57,  59,  61,  64,  66,  76,  84,
        91,  92,  93,  94,  96,  97,  98, 103, 105, 112, 114, 117, 121,
       124, 128, 131, 132, 147, 176, 188, 197, 202, 205, 213, 217, 225,
       228, 236, 237, 238, 239, 240, 241, 242, 245, 246, 247, 248, 249,
       250, 251, 252, 253, 254, 255, 258, 260, 261, 262, 265, 266, 267,
       268, 270, 271, 273, 275, 277, 278, 279, 280, 281, 282, 283, 284,
       290, 291, 295, 296, 298, 299, 308, 309, 311, 318, 319, 322, 324,
       327, 329, 330, 333, 338, 343, 345, 346, 348, 349, 351, 354, 355,
       356, 358, 359, 360, 361, 362, 363, 365, 366, 367, 369, 371, 372,
       373, 379, 380, 382, 384, 385, 388, 390, 393, 401, 402, 404, 413,
       442, 450, 457, 460, 461, 465, 468, 470, 471, 472, 473, 476, 478,
       483, 491, 494, 501, 503, 506, 514, 515, 517, 520, 522, 52

In [38]:
selected_features_rus = selector_rus.get_support(indices=True)
selected_features_rus = pd.DataFrame(selected_features_rus)
selected_features_rus.head()

Unnamed: 0,0
0,0
1,5
2,6
3,7
4,8


In [39]:
selected_features_rus.to_csv('selected_features_rus_250.csv', index=False, header=False)

In [40]:
test_rus_sel = selector_rus.transform(test_imp)
test_rus_sel.shape

(48744, 250)

# PCA

Principal Component Analysis (PCA) is used for dimensionality reduction to take important feature combinations. It uses orthogonal transformation to convert feature combinations in lower dimension. There is a good explanation of PCA in http://setosa.io/ev/principal-component-analysis/  
I used PCA to create 50 features. I want to compare performance of this dataset with others that I have created before. The downside of PCA algorithm is that you cannot know which features are more important for model. To be able to use PCA, I need to apply it on both train and test data.

In [41]:
from sklearn.decomposition import PCA

## Oversampled

In [42]:
concat_os = np.concatenate((train_os_sel, test_os_sel), axis=0)

In [44]:
pca_os = PCA(n_components=50)
pca_os.fit(concat_os)
concat_os_pca = pca_os.transform(concat_os)

In [45]:
train_os_pca = concat_os_pca[:565372, :]
test_os_pca = concat_os_pca[565372:, :]

In [46]:
print(train_os_pca.shape)
print(test_os_pca.shape)

(565372, 50)
(48744, 50)


## Undersampled

In [47]:
concat_rus = np.concatenate((train_rus_sel, test_rus_sel), axis=0)

In [48]:
pca_rus = PCA(n_components=50)
pca_rus.fit(concat_os)
concat_rus_pca = pca_rus.transform(concat_rus)

In [49]:
train_rus_pca = concat_rus_pca[:49650, :]
test_rus_pca = concat_rus_pca[49650:, :]

In [50]:
print(train_rus_pca.shape)
print(test_rus_pca.shape)

(49650, 50)
(48744, 50)


# Scaling

I used scaling to fit data in better distribution. Therefore, model can recognize patterns better and their performances would increase.

In [52]:
from sklearn.preprocessing import RobustScaler

## Oversampled Data

In [53]:
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)

### Train

In [54]:
scaler.fit(train_os_sel)

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)

In [55]:
train_os_sel_scaled = scaler.transform(train_os_sel)

In [56]:
train_os_sel_scaled = pd.DataFrame(train_os_sel_scaled)
train_os_sel_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.0,0.0,0.0,0.627969,-0.220236,-0.026622,-0.23913,0.0,0.880836,0.283063,...,-0.270268,-0.462276,0.523779,0.070544,0.047336,-0.139398,-1.579874,-1.082012,0.414038,-0.320302
1,0.0,-1.219961,0.0,1.378374,1.696667,0.679665,1.641304,-0.970456,-0.267411,0.002875,...,-0.300205,-0.185683,-0.534629,-0.423265,-0.14012,-0.232825,0.675213,-0.42064,1.821766,0.587345
2,1.102932,0.0,0.0,-0.872841,-0.807251,-1.179395,-0.76087,-0.557663,-0.626002,0.492569,...,0.395007,-1.199859,-1.19721,-1.058163,-0.215881,-0.412263,0.589053,-0.540889,2.070188,1.264823
3,0.0,0.0,0.0,-0.122436,-0.423219,0.293577,-0.369565,-0.685679,-0.619557,-0.938375,...,0.688208,0.198436,0.492122,0.468891,0.854294,0.710823,0.0,0.639485,-0.171685,-0.168394
4,0.0,0.0,0.0,-0.272517,0.009736,-0.208685,0.152174,0.627172,-0.765288,-0.937867,...,-0.06077,2.165362,-0.111813,-0.070544,-0.080681,0.268416,1.27442,0.360981,0.414038,0.322434


### Test

In [57]:
scaler_test = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
scaler_test.fit(test_os_sel)
test_os_sel_scaled = scaler_test.transform(test_os_sel)

In [58]:
test_os_sel_scaled = pd.DataFrame(test_os_sel_scaled)
test_os_sel_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.0,0.0,0.0,-0.2,0.286707,-0.291541,0.133333,0.0,-0.483966,-0.396327,...,-0.031154,-0.785714,-0.792839,-0.77273,-0.147964,-0.291908,0.535583,0.591194,-0.066667,0.89769
1,0.0,0.0,0.0,-0.52,-0.548393,-0.455852,-0.533333,0.908077,-0.319143,-1.214996,...,-0.081504,-0.714286,-0.371762,-0.450759,-0.098401,-0.258326,-1.752249,-0.147798,-0.133333,0.366337
2,1.0,0.0,0.0,0.4,0.514683,2.243105,0.577778,0.013453,-0.595575,-1.210788,...,0.103462,0.464286,6.762943,6.246231,9.889809,8.438212,0.831593,1.330186,-0.2,-1.584158
3,0.0,0.0,2.0,1.4,2.71503,1.174044,2.911111,0.404245,0.253326,-0.219204,...,0.791184,0.785714,4.326692,3.73486,3.625167,4.135796,1.138231,0.295597,1.066667,-0.052805
4,1.0,-1.0,1.0,0.2,0.423545,0.301043,0.566667,-0.472638,0.3844,-0.343535,...,0.787565,0.283725,0.615957,0.613635,0.908264,0.784298,-0.064398,0.704403,-0.25063,-0.194508


## For Undersampled Data

### Train

In [59]:
scaler_rus_train = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
scaler_rus_train.fit(train_rus_sel)
train_rus_sel_scaled = scaler.transform(train_rus_sel)
train_rus_sel_scaled = pd.DataFrame(train_rus_sel_scaled)
train_rus_sel_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.0,548981.3,27193.5,3.379454,-1.099032,-2.577648,-1.087536,-486882.726216,1.568307,0.607492,...,1287.124336,39518.699572,-1.617777,185753.935866,-3.150709,-20.441119,598208.1,-1927.196922,5.649819,-0.08579
1,0.0,1072930.0,25843.5,5.380534,-1.099032,-2.397448,-1.092957,-14500.802355,2.107687,0.607492,...,2838.622439,1884.291999,12.893391,28237.711436,-0.461907,-3.75782,8764.223,-203.281378,9.596699,-0.928294
2,0.0,383133.7,13963.5,1.028185,-1.099032,-2.782444,-1.103915,-676395.129703,2.091495,0.607492,...,1471.669531,949.303596,16.000852,17958.796258,-1.833804,-8.429144,153270.3,-1014.96483,8.934365,-0.398472
3,0.0,1288823.0,31018.5,8.632289,-1.099032,-2.866957,-1.090623,-642308.334982,1.883823,0.607492,...,297.479378,6516.525615,1468.787907,48423.790996,3.034559,-10.502212,60307510.0,428.532692,68.165954,-0.168394
4,0.0,1019130.0,40320.0,5.880804,-1.099032,-2.655803,-1.090662,-404145.935297,1.674422,0.607492,...,7.279409,593.806947,981.240337,2843.37457,-5.826623,-6.99705,473290.7,-275.061252,19.511033,1.01294


### Test

In [60]:
scaler_rus_test = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
scaler_rus_test.fit(test_rus_sel)
test_rus_sel_scaled = scaler.transform(test_rus_sel)
test_rus_sel_scaled = pd.DataFrame(test_rus_sel_scaled)
test_rus_sel_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.0,693912.7,20560.5,3.379454,-1.099032,-2.848526,-1.092582,-328786.137266,2.240528,0.607492,...,2.539329,800.640116,848.326176,2904.864436,-0.224464,-0.421161,1.892796,-0.360515,20.693998,1.143224
1,0.0,271767.1,17370.0,0.377834,-1.099032,-2.77294,-1.097751,-579858.274504,2.113033,0.607492,...,17.380295,812.549541,1033.805807,3960.623529,-0.224464,-0.421161,1.892796,-0.360515,18.095747,0.444032
2,0.0,809155.2,69777.0,5.380534,-1.099032,-2.899709,-1.097725,-138319.812285,1.817482,0.607492,...,3008.242095,16471.960252,12.893391,119160.204776,-10.160854,-34.748449,3990768.0,-10788.999922,13.514266,-2.122566
3,0.0,1921438.0,49018.5,15.886203,-1.099032,-2.51041,-1.091464,-127190.728187,1.70665,0.607492,...,1132.362503,1795.945266,-1.375763,39691.06083,-5.567909,-13.033734,505481.5,-4230.026748,5.190258,-0.107504
4,0.0,763084.5,32067.0,5.330507,-1.099032,-2.450301,-1.092249,-254380.260729,1.698161,0.607492,...,318.815205,7139.477267,1580.849589,54068.68942,4.68468,-11.004122,64685510.0,946.1407,48.092334,-0.293967


# Model

The model is taken from William Koehrsen's kernel and modified by me. LightGBM is very effective gradient boosting algorithm. It uses multiple tree based algorithms to improve prediction performance.

## LightGBM

In [61]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

In [73]:
# Taken from William Koehrsen and modified.
def model(features, test_features, encoding = 'ohe', n_folds = 5):
    
    """Train and test a light gradient boosting model using
    cross validation. 
    
    Parameters
    --------
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the TARGET column.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. 
        encoding (str, default = 'ohe'): 
            method for encoding categorical variables. Either 'ohe' for one-hot encoding or 'le' for integer label encoding
            n_folds (int, default = 5): number of folds to use for cross validation
        
    Return
    --------
        submission (pd.DataFrame): 
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.
        
    """
    
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])
    
    
    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
        # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 53)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = lgb.LGBMClassifier(n_estimators=1000, objective = 'binary', 
                                   class_weight = 'balanced', learning_rate = 0.01, 
                                   reg_alpha = 0.1, reg_lambda = 0.1, 
                                   subsample = 0.8, n_jobs = -1, random_state = 50)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                  early_stopping_rounds = 100, verbose = 200)
        
        # Record the best iteration
        best_iteration = model.best_iteration_
        
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [63]:
training = train.copy()

In [64]:
training['TARGET'] = train_labels

In [65]:
training.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,installments_AMT_PAYMENT-INSTALLMENT_max,installments_AMT_PAYMENT-INSTALLMENT_min,installments_AMT_PAYMENT-INSTALLMENT_var,installments_AMT_PAYMENT-INSTALLMENT_sum,installments_DAYS_PAYMENT-INSTALLMENT_mean,installments_DAYS_PAYMENT-INSTALLMENT_max,installments_DAYS_PAYMENT-INSTALLMENT_min,installments_DAYS_PAYMENT-INSTALLMENT_var,installments_DAYS_PAYMENT-INSTALLMENT_sum,TARGET
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0.0,0.0,0.0,0.0,-20.421053,-12.0,-31.0,24.25731,-388.0,1
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0.0,0.0,0.0,0.0,-7.16,-1.0,-14.0,13.89,-179.0,0
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0.0,0.0,0.0,0.0,-7.666667,-3.0,-11.0,17.333333,-23.0,0
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,,,,,,,,,,0
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0.0,-22655.655,8084830.0,-29857.365,-3.636364,12.0,-31.0,63.865734,-240.0,0


In [74]:
submission, fi, metrics = model(training, test)
print('Baseline metrics')
print(metrics)

Training Data Shape:  (307511, 712)
Testing Data Shape:  (48744, 712)
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.75122	train's auc: 0.766551
[400]	valid's auc: 0.765396	train's auc: 0.790797
[600]	valid's auc: 0.771696	train's auc: 0.805451
[800]	valid's auc: 0.774912	train's auc: 0.816571
[1000]	valid's auc: 0.776506	train's auc: 0.826096
Did not meet early stopping. Best iteration is:
[1000]	valid's auc: 0.776506	train's auc: 0.826096
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.749326	train's auc: 0.766552
[400]	valid's auc: 0.764549	train's auc: 0.790716
[600]	valid's auc: 0.771273	train's auc: 0.805214
[800]	valid's auc: 0.774463	train's auc: 0.816369
[1000]	valid's auc: 0.775991	train's auc: 0.825956
Did not meet early stopping. Best iteration is:
[998]	valid's auc: 0.775992	train's auc: 0.825879
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.74608	train's auc: 0.7

As it can be seen on performance of the model, trainin performance is better than validation performance. Training performance is about the how well the model learned the training data. Validation performance is how well the model performns on unseen testing data. Because of training performance is better than testing performance, we can say that our model overfits the data a bit. With hyperparameter tuning, we can reach better performance and try to avoid overfitting training data. 

In [75]:
submission.to_csv('raw_lgb.csv', index = False)