# Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#plt.rcParams['figure.figsize'] = [10, 5]
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import missingno as msno
import pandas_profiling

%matplotlib inline

!python --version

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import metrics

Python 3.6.5 :: Anaconda, Inc.


In [4]:
## Show all rows and columns instead of showing only some part and hiding other parts for large data.
def show_full_data(data, row_size=None, column_size=None):
    with pd.option_context('display.max_rows', row_size, 'display.max_columns', column_size):
        display(data)

# Import Data

In [5]:
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')
print(train.shape)
print(test.shape)

(307511, 714)
(48744, 713)


# Saving TARGET values and Column names

In [6]:
# Copying labels to not lose them and use later
train_labels = train.TARGET.copy()
train_labels.shape

(307511,)

In [7]:
# Saving IDs 
train_id = train.SK_ID_CURR.copy()
test_id = test.SK_ID_CURR.copy()

In [8]:
train.drop(columns='TARGET', inplace=True)
train.shape

(307511, 713)

In [9]:
# Extracting column names to not lose them
column_names = train.columns.copy()
column_names.shape

(713,)

## Imputing missing values

Even if we dropped some features before we create our dataset, there are some missing values in our dateset. Scikit-Learn algorithms generally does not work with missing values. If we want to use scaling algorithms, the algorithm probably gives error. Also, most algorithms (lightgbm works) does not work with NaN values. Therefore, if we want to use our dataset, we should find a solution for our NaN values.  
  
Imputer from Scikit performs different methods to fill NaN values. I use mean strategy to calculate mean of the values and fill NaN values with this algorithm.  I do this for train and test data because if we want to make predictions on test data, test data should not have NaN values.

### Train

In [10]:
from sklearn.preprocessing import Imputer

In [11]:
# Create our imputer to replace missing values with the mean e.g.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=1)
imp = imp.fit(train)
# Impute our data, then train
train_imp = imp.transform(train)

  "observed values: %s" % missing)


In [12]:
# Choosing dropped columns during imputing.
nanmask = np.isnan(imp.statistics_)

In [13]:
np.where(nanmask==True)

(array([380, 381]),)

In [14]:
# Names of dropped columns.
column_names.values[380:382]

array(['previous_app/credit_mean', 'previous_app/credit_max'],
      dtype=object)

These 2 columns are dropped because they cannot be filled.

In [15]:
column_names_imp = column_names.values
column_names_imp = np.delete(column_names_imp, [380, 381])

We also drop these columns from column names to equalize the size of columns and column names.

In [16]:
train_imp = pd.DataFrame(train_imp)
print(train_imp.shape)
train_imp.head()

(307511, 711)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,701,702,703,704,705,706,707,708,709,710
0,100002.0,0.0,0.0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0.0,0.0,0.0,0.0,0.0,-20.421053,-12.0,-31.0,24.25731,-388.0
1,100003.0,0.0,0.0,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0.0,0.0,0.0,0.0,0.0,-7.16,-1.0,-14.0,13.89,-179.0
2,100004.0,1.0,1.0,1.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0.0,0.0,0.0,0.0,0.0,-7.666667,-3.0,-11.0,17.333333,-23.0
3,100006.0,0.0,0.0,1.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,...,379.700182,24001.706113,-6797.926065,354638600.0,7133.394533,-11.130596,16.6321,-38.073301,787.185389,-353.020996
4,100007.0,0.0,0.0,1.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,...,-452.384318,0.0,-22655.655,8084830.0,-29857.365,-3.636364,12.0,-31.0,63.865734,-240.0


In [17]:
train_imp.columns = column_names_imp
train_imp.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,installments_AMT_PAYMENT-INSTALLMENT_mean,installments_AMT_PAYMENT-INSTALLMENT_max,installments_AMT_PAYMENT-INSTALLMENT_min,installments_AMT_PAYMENT-INSTALLMENT_var,installments_AMT_PAYMENT-INSTALLMENT_sum,installments_DAYS_PAYMENT-INSTALLMENT_mean,installments_DAYS_PAYMENT-INSTALLMENT_max,installments_DAYS_PAYMENT-INSTALLMENT_min,installments_DAYS_PAYMENT-INSTALLMENT_var,installments_DAYS_PAYMENT-INSTALLMENT_sum
0,100002.0,0.0,0.0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0.0,0.0,0.0,0.0,0.0,-20.421053,-12.0,-31.0,24.25731,-388.0
1,100003.0,0.0,0.0,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0.0,0.0,0.0,0.0,0.0,-7.16,-1.0,-14.0,13.89,-179.0
2,100004.0,1.0,1.0,1.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0.0,0.0,0.0,0.0,0.0,-7.666667,-3.0,-11.0,17.333333,-23.0
3,100006.0,0.0,0.0,1.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,...,379.700182,24001.706113,-6797.926065,354638600.0,7133.394533,-11.130596,16.6321,-38.073301,787.185389,-353.020996
4,100007.0,0.0,0.0,1.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,...,-452.384318,0.0,-22655.655,8084830.0,-29857.365,-3.636364,12.0,-31.0,63.865734,-240.0


In [18]:
train_imp.isnull().sum().sum()

0

We checked how many null values in our dataset. It is zero so we have completed our goal.

### Test

We are doing similar process for test data to fill all NaN values.

In [19]:
imp = imp.fit(test)
# Impute our data, then train
test_imp = imp.transform(test)
test_imp.shape

(48744, 713)

In [20]:
nanmask_test = np.isnan(imp.statistics_)
np.where(nanmask_test==True)

(array([], dtype=int64),)

In [21]:
test_imp = np.delete(test_imp, [380, 381], axis=1)
test_imp.shape

(48744, 711)

In [22]:
test_imp = pd.DataFrame(test_imp)
test_imp.columns = column_names_imp
test_imp.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,installments_AMT_PAYMENT-INSTALLMENT_mean,installments_AMT_PAYMENT-INSTALLMENT_max,installments_AMT_PAYMENT-INSTALLMENT_min,installments_AMT_PAYMENT-INSTALLMENT_var,installments_AMT_PAYMENT-INSTALLMENT_sum,installments_DAYS_PAYMENT-INSTALLMENT_mean,installments_DAYS_PAYMENT-INSTALLMENT_max,installments_DAYS_PAYMENT-INSTALLMENT_min,installments_DAYS_PAYMENT-INSTALLMENT_var,installments_DAYS_PAYMENT-INSTALLMENT_sum
0,100001.0,0.0,0.0,1.0,0.0,135000.0,568800.0,20560.5,450000.0,0.01885,...,0.0,0.0,0.0,0.0,0.0,-7.285714,11.0,-36.0,213.904762,-51.0
1,100005.0,0.0,0.0,1.0,0.0,99000.0,222768.0,17370.0,180000.0,0.035792,...,0.0,0.0,0.0,0.0,0.0,-23.555556,1.0,-37.0,182.527778,-212.0
2,100013.0,0.0,1.0,1.0,0.0,202500.0,663264.0,69777.0,630000.0,0.019101,...,-1157.662742,0.0,-23147.82,23467720.0,-179437.725,-5.180645,21.0,-38.0,127.200922,-803.0
3,100028.0,0.0,0.0,1.0,2.0,315000.0,1575000.0,49018.5,1575000.0,0.026392,...,-622.550708,0.0,-8505.0,2972476.0,-70348.23,-3.0,7.0,-19.0,26.678571,-339.0
4,100038.0,0.0,1.0,0.0,1.0,180000.0,625500.0,32067.0,625500.0,0.010032,...,571.951414,28566.898614,-7136.377516,380383500.0,15742.302464,-11.552455,12.531942,-38.759448,544.77243,-381.936021


In [23]:
test_imp.isnull().sum().sum()

0

In [24]:
train_imp = train_imp.drop(columns='SK_ID_CURR')
test_imp = test_imp.drop(columns='SK_ID_CURR')

In [25]:
# # Saving datasets
# train_imp.to_csv('datasets/train_imp.csv', index=False)
# test_imp.to_csv('datasets/test_imp.csv', index=False)

# Sampling Data

- There are a lot of models for classification problems. Some models like deep neural networks have high performance but they need high performance computers. In this problem, because of I am using a laptop to make computations, I would like to try less computationally expensive model.
- Moreover, we have imbalanced dataset; one class has way less instances than other one. Risky instances (TARGET=1) are rare instances in our dataset. There are different approaches like undersampling and oversampling to overcome this problem but none of them increases performance too much. In this type of problem, if we can reach the data source, we should try to collect more data for rare class. But, in this work, we cannot reach the data source and we try to increase the size of rare instances. 

There are a lot of different ranges of data in our dataset. Some of them are too large and some of them are too small. Calculations with these types of high range of numbers could be hard for computers. Relating values with each other could be hard and finding similarities in high dimensions could not be reachable. Moreover, if we want to use neural networks, we should normalize our values to increase performance of neural networks. Therefore, before we move to feature selection and modeling parts, I would like to scale feature values in between 0 and 1.   
  
This scaling could be better with transformations. For instance, we have some skewed data which have a lot of values in a short range on one side of distribution. When we scale highly skewed data, it would not have distinguishing values. However, I do not transform my data to see the results. 
  
I oversampled to increase the size of rare data and undersampled to decrease the size of common data. I use SMOTE method which is highly effective for oversampling. For undersampling, I use Random Under Sampler.

In [26]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [27]:
print("Number transactions X_train dataset: ", train_imp.shape)
print("Number transactions y_train dataset: ", test_imp.shape)

print("Before OverSampling, counts of label '0': {}".format(sum(train_labels.values==0)))
print("Before OverSampling, counts of label '1': {} \n".format(sum(train_labels.values==1)))

Number transactions X_train dataset:  (307511, 710)
Number transactions y_train dataset:  (48744, 710)
Before OverSampling, counts of label '0': 282686
Before OverSampling, counts of label '1': 24825 



In [28]:
sm = SMOTE()
train_os, label_os = sm.fit_sample(train_imp, train_labels.values.ravel())

print('After OverSampling, the shape of train_X: {}'.format(train_os.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(label_os.shape))
print("After OverSampling, counts of label '0': {}".format(sum(label_os==0)))
print("After OverSampling, counts of label '1': {}".format(sum(label_os==1)))

After OverSampling, the shape of train_X: (565372, 710)
After OverSampling, the shape of train_y: (565372,) 

After OverSampling, counts of label '0': 282686
After OverSampling, counts of label '1': 282686


In [29]:
rus = RandomUnderSampler(return_indices=True)
train_rus, label_rus, idx_resampled = rus.fit_sample(train_imp, train_labels.values.ravel())

print('After Undersampling, the shape of train_X: {}'.format(train_rus.shape))
print('After Undersampling, the shape of train_y: {} \n'.format(label_rus.shape))
print("After Undersampling, counts of label '0': {}".format(sum(label_rus==0)))
print("After Undersampling, counts of label '1': {}".format(sum(label_rus==1)))

After Undersampling, the shape of train_X: (49650, 710)
After Undersampling, the shape of train_y: (49650,) 

After Undersampling, counts of label '0': 24825
After Undersampling, counts of label '1': 24825


In [30]:
# # Saving datasets
# np.savetxt('datasets/train_os.out', train_os, delimiter=',')
# np.savetxt('datasets/label_os.out', label_os, delimiter=',')

# Feature Selection

We created a lot of new features during dataset creation. However, some of these features may not be useful for training machine learning model. They may decrease the performance or increase the chance of overfitting. Therefore, we can use feature selection techniques to eliminate unnecessary features. There are different types of approaches , some of them needs too much computing power and others may not.

In [30]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, chi2, f_classif

## For Oversampled Data

In [31]:
selector_os = SelectKBest(mutual_info_classif, k=250)
selector_os.fit(train_os, label_os)

SelectKBest(k=250,
      score_func=<function mutual_info_classif at 0x7f0149c23840>)

In [32]:
train_os_sel = selector_os.transform(train_os)
train_os_sel.shape

(565372, 250)

In [33]:
# selector_os.get_support(indices=True)

In [34]:
# selected_features_os = selector_os.get_support(indices=True)
# selected_features_os = pd.DataFrame(selected_features_os)
# selected_features_os.head()

In [35]:
#selected_features_os.to_csv('selected_features_os_250.csv', index=False, header=False)

In [36]:
test_os_sel = selector_os.transform(test_imp)
test_os_sel.shape

(48744, 250)

In [37]:
# # Saving datasets
# np.savetxt('datasets/train_os_sel.out', train_os_sel, delimiter=',')
# np.savetxt('datasets/test_os_sel.out', test_os_sel, delimiter=',')

## For Undersampled Data

In [38]:
selector_rus = SelectKBest(mutual_info_classif, k=250)
selector_rus.fit(train_rus, label_rus)
train_rus_sel = selector_rus.transform(train_rus)
train_rus_sel.shape

(49650, 250)

In [39]:
# selector_rus.get_support(indices=True)

In [40]:
# selected_features_rus = selector_rus.get_support(indices=True)
# selected_features_rus = pd.DataFrame(selected_features_rus)
# selected_features_rus.head()

In [41]:
# selected_features_rus.to_csv('selected_features_rus_250.csv', index=False, header=False)

In [42]:
test_rus_sel = selector_rus.transform(test_imp)
test_rus_sel.shape

(48744, 250)

In [43]:
# # Saving datasets
# np.savetxt('datasets/train_rus_sel.out', train_rus_sel, delimiter=',')
# np.savetxt('datasets/test_rus_sel.out', test_rus_sel, delimiter=',')

# PCA

Explain PCA

In [44]:
from sklearn.decomposition import PCA

## Oversampled

In [45]:
concat_os = np.concatenate((train_os_sel, test_os_sel), axis=0)

In [46]:
pca_os = PCA(n_components=50)
pca_os.fit(concat_os)
concat_os_pca = pca_os.transform(concat_os)

In [47]:
train_os_pca = concat_os_pca[:565372, :]
test_os_pca = concat_os_pca[565372:, :]

In [48]:
print(train_os_pca.shape)
print(test_os_pca.shape)

(565372, 50)
(48744, 50)


In [49]:
# # Saving datasets
# np.savetxt('datasets/train_os_pca.out', train_os_pca, delimiter=',')
# np.savetxt('datasets/test_os_pca.out', test_os_pca, delimiter=',')

## Undersampled

In [50]:
concat_rus = np.concatenate((train_rus_sel, test_rus_sel), axis=0)

In [51]:
pca_rus = PCA(n_components=50)
pca_rus.fit(concat_os)
concat_rus_pca = pca_rus.transform(concat_rus)

In [52]:
train_rus_pca = concat_rus_pca[:49650, :]
test_rus_pca = concat_rus_pca[49650:, :]

In [53]:
print(train_rus_pca.shape)
print(test_rus_pca.shape)

(49650, 50)
(48744, 50)


In [54]:
# # Saving datasets
# np.savetxt('datasets/train_rus_pca.out', train_rus_pca, delimiter=',')
# np.savetxt('datasets/test_rus_pca.out', test_rus_pca, delimiter=',')

# Scaling

In [55]:
from sklearn.preprocessing import RobustScaler

## Oversampled Data

In [56]:
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)

### Train

In [57]:
scaler.fit(train_os_sel)

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)

In [58]:
train_os_sel_scaled = scaler.transform(train_os_sel)

In [59]:
train_os_sel_scaled = pd.DataFrame(train_os_sel_scaled)
train_os_sel_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.0,0.0,0.0,0.627962,-0.220674,-0.026348,-0.239638,0.0,0.881637,0.283086,...,-0.270597,-0.472836,0.521425,0.07063,0.046056,-0.140886,-1.584293,-1.082245,0.41081,-0.31552
1,0.0,-1.226408,0.0,1.377962,1.700038,0.682014,1.64479,-0.969528,-0.265606,0.003049,...,-0.300571,-0.195648,-0.537656,-0.423779,-0.141426,-0.234316,0.677102,-0.420873,1.807564,0.588405
2,1.107354,0.0,0.0,-0.872038,-0.808855,-1.18251,-0.762485,-0.557129,-0.623884,0.492479,...,0.395489,-1.212005,-1.200658,-1.059448,-0.217198,-0.413761,0.590701,-0.541122,2.05405,1.263106
3,0.0,0.0,0.0,-0.122038,-0.424059,0.294791,-0.37035,-0.685023,-0.617444,-0.937692,...,0.689048,0.189297,0.489747,0.469461,0.853128,0.709366,0.0,0.639252,-0.170347,-0.164236
4,0.0,0.0,0.0,-0.272038,0.009755,-0.208946,0.152497,0.626572,-0.763049,-0.937184,...,-0.060844,2.160453,-0.114572,-0.07063,-0.081979,0.266943,1.277985,0.360748,0.41081,0.32458


### Test

In [60]:
scaler_test = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
scaler_test.fit(test_os_sel)
test_os_sel_scaled = scaler_test.transform(test_os_sel)

In [61]:
test_os_sel_scaled = pd.DataFrame(test_os_sel_scaled)
test_os_sel_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.0,0.0,0.0,-0.2,0.286707,-0.291541,0.133333,0.0,-0.483966,-0.396327,...,-0.031154,-0.785714,-0.792839,-0.77273,-0.147964,-0.291908,0.535583,0.591194,-0.066667,0.89769
1,0.0,0.0,0.0,-0.52,-0.548393,-0.455852,-0.533333,0.908077,-0.319143,-1.214996,...,-0.081504,-0.714286,-0.371762,-0.450759,-0.098401,-0.258326,-1.752249,-0.147798,-0.133333,0.366337
2,1.0,0.0,0.0,0.4,0.514683,2.243105,0.577778,0.013453,-0.595575,-1.210788,...,0.103462,0.464286,6.762943,6.246231,9.889809,8.438212,0.831593,1.330186,-0.2,-1.584158
3,0.0,0.0,2.0,1.4,2.71503,1.174044,2.911111,0.404245,0.253326,-0.219204,...,0.791184,0.785714,4.326692,3.73486,3.625167,4.135796,1.138231,0.295597,1.066667,-0.052805
4,1.0,-1.0,1.0,0.2,0.423545,0.301043,0.566667,-0.472638,0.3844,-0.343535,...,0.787565,0.283725,0.615957,0.613635,0.908264,0.784298,-0.064398,0.704403,-0.25063,-0.194508


In [62]:
# # Saving datasets
# np.savetxt('datasets/train_os_sel_scaled.out', train_os_sel_scaled, delimiter=',')
# np.savetxt('datasets/test_os_sel_scaled.out', test_os_sel_scaled, delimiter=',')

## For Undersampled Data

### Train

In [63]:
scaler_rus_train = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
scaler_rus_train.fit(train_rus_sel)
train_rus_sel_scaled = scaler.transform(train_rus_sel)
train_rus_sel_scaled = pd.DataFrame(train_rus_sel_scaled)
train_rus_sel_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.0,551882.2,20979.0,3.377962,-1.101215,-2.551508,-1.091257,-107246.452668,1.738143,0.60734,...,13668.364853,264.722662,1238.200538,-1.265235,296.464297,-0.422659,1.89809,-0.951974,2.875669,1.107406
1,0.0,377895.4,15862.5,0.977962,-1.101215,-2.196168,-1.094868,-570408.734682,2.106473,0.60734,...,43932.97495,6530.528782,1469.718291,194937100.0,5892.504535,0.140442,1218.349279,-1.029972,4.324358,-0.164236
2,1.107354,2114595.0,50544.0,13.377962,-1.101215,-2.523555,-1.093467,-461829.26418,1.616099,0.60734,...,79464.752034,15362.335289,3.328419,133963900.0,22997.734839,-0.422659,1.89809,-0.734273,2.957831,-0.786946
3,0.0,431748.2,13401.0,1.127962,-1.101215,-2.780802,-1.116898,-331457.658872,2.11684,0.60734,...,65220.968011,9635.0967,1868.980047,183325100.0,3538.963401,-0.422659,1.89809,-1.831799,2.218374,-0.224695
4,1.107354,1155643.0,24853.5,5.877962,-1.101215,-2.741706,-1.093711,-340733.611297,2.211239,0.60734,...,43932.97495,6530.528782,1469.718291,194937100.0,5892.504535,0.140442,1218.349279,-1.029972,4.324358,-0.164236


### Test

In [64]:
scaler_rus_test = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
scaler_rus_test.fit(test_rus_sel)
test_rus_sel_scaled = scaler.transform(test_rus_sel)
test_rus_sel_scaled = pd.DataFrame(test_rus_sel_scaled)
test_rus_sel_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.0,697579.4,20560.5,3.377962,-1.101215,-2.856546,-1.094902,-328471.564599,2.24014,0.60734,...,13964.618541,802.352198,848.862401,1820318.0,353.414771,-0.422659,1.89809,-0.7988,3.861613,1.142006
1,0.0,273203.2,17370.0,0.377962,-1.101215,-2.780737,-1.100082,-579303.483582,2.112756,0.60734,...,14807.182023,814.287232,1034.459861,1294439.0,481.887581,-0.422659,1.89809,-1.777019,3.039993,0.44568
2,1.107354,813430.8,69777.0,5.377962,-1.101215,-2.907879,-1.100056,-138187.472057,1.817464,0.60734,...,25859.568566,16507.370966,12.898894,73425430.0,14500.25026,-2.13949,-30597.451604,-0.672233,4.683233,-2.110397
3,0.0,1931591.0,49018.5,15.877962,-1.101215,-2.517436,-1.093782,-127069.035932,1.706729,0.60734,...,11815.099029,1799.79759,-1.379324,1725857.0,4829.839399,-1.345911,-11994.520668,-0.541122,3.532965,-0.103595
4,1.107354,767116.8,32067.0,5.327962,-1.101215,-2.45715,-1.094568,-254136.877362,1.698247,0.60734,...,47555.253804,7154.81999,1581.851162,214718300.0,6579.41878,0.425553,2686.418406,-1.055336,3.987481,-0.289294


In [65]:
# # Saving datasets
# np.savetxt('datasets/train_rus_sel_scaled.out', train_rus_sel_scaled, delimiter=',')
# np.savetxt('datasets/test_rus_sel_scaled.out', test_rus_sel_scaled, delimiter=',')

# Model

## LightGBM

In [31]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

In [32]:
# Taken from William Koehrsen and modified.
def model(features, test_features, encoding = 'ohe', n_folds = 10, n_estimators=10000, learning_rate = 0.01,
         reg_alpha = 0.1, reg_lambda = 0.1):
    
    """Train and test a light gradient boosting model using
    cross validation. 
    
    Parameters
    --------
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the TARGET column.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. 
        encoding (str, default = 'ohe'): 
            method for encoding categorical variables. Either 'ohe' for one-hot encoding or 'le' for integer label encoding
            n_folds (int, default = 5): number of folds to use for cross validation
        
    Return
    --------
        submission (pd.DataFrame): 
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.
        
    """
    
    # Extract the ids
    if 'SK_ID_CURR' in features.columns:
        train_ids = features['SK_ID_CURR']
    
    if 'SK_ID_CURR' in test_features.columns:
        test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(columns = ['TARGET'])
    if 'SK_ID_CURR' in features.columns:
        features = features.drop(columns = ['SK_ID_CURR'])
    
    if 'SK_ID_CURR' in test_features.columns:
        test_features = test_features.drop(columns = ['SK_ID_CURR'])
    
    
    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
        # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 53)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = lgb.LGBMClassifier(n_estimators=n_estimators, objective = 'binary', 
                                   class_weight = 'balanced', learning_rate = learning_rate, 
                                   reg_alpha = reg_alpha, reg_lambda = reg_lambda, 
                                   subsample = 0.8, n_jobs = -1, random_state = 50)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                  early_stopping_rounds = 100, verbose = 200)
        
        # Record the best iteration
        best_iteration = model.best_iteration_
        
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_id, 'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [33]:
train_os_sel = np.loadtxt('datasets/train_os_sel.out', delimiter=',')
train_rus_sel = np.loadtxt('datasets/train_rus_sel.out', delimiter=',')
train_os_pca = np.loadtxt('datasets/train_os_pca.out', delimiter=',')
train_rus_pca = np.loadtxt('datasets/train_rus_pca.out', delimiter=',')
train_os_sel_scaled = np.loadtxt('datasets/train_os_sel_scaled.out', delimiter=',')
train_rus_sel_scaled = np.loadtxt('datasets/train_rus_sel_scaled.out', delimiter=',')

In [34]:
test_os_sel = np.loadtxt('datasets/test_os_sel.out', delimiter=',')
test_rus_sel = np.loadtxt('datasets/test_rus_sel.out', delimiter=',')
test_os_pca = np.loadtxt('datasets/test_os_pca.out', delimiter=',')
test_rus_pca = np.loadtxt('datasets/test_rus_pca.out', delimiter=',')
test_os_sel_scaled = np.loadtxt('datasets/test_os_sel_scaled.out', delimiter=',')
test_rus_sel_scaled = np.loadtxt('datasets/test_rus_sel_scaled.out', delimiter=',')

In [44]:
train['TARGET'] = train_labels

train_imp['TARGET'] = train_labels

In [48]:
train_os = pd.DataFrame(train_os)
train_os.columns = train_imp.columns.drop('TARGET')
train_os['TARGET'] = label_os

train_rus = pd.DataFrame(train_rus)
train_rus.columns = train_imp.columns.drop('TARGET')
train_rus['TARGET'] = label_rus

train_os_sel = pd.DataFrame(train_os_sel)
train_os_sel['TARGET'] = label_os

train_rus_sel = pd.DataFrame(train_rus_sel)
train_rus_sel['TARGET'] = label_rus

train_os_pca = pd.DataFrame(train_os_pca)
train_os_pca['TARGET'] = label_os

train_rus_pca = pd.DataFrame(train_rus_pca)
train_rus_pca['TARGET'] = label_rus

train_os_sel_scaled = pd.DataFrame(train_os_sel_scaled)
train_os_sel_scaled['TARGET'] = label_os

train_rus_sel_scaled = pd.DataFrame(train_rus_sel_scaled)
train_rus_sel_scaled['TARGET'] = label_rus

In [56]:
test_os_sel = pd.DataFrame(test_os_sel)
test_rus_sel = pd.DataFrame(test_rus_sel)
test_os_pca = pd.DataFrame(test_os_pca)
test_rus_pca = pd.DataFrame(test_rus_pca)
test_os_sel_scaled = pd.DataFrame(test_os_sel_scaled)
test_rus_sel_scaled = pd.DataFrame(test_rus_sel_scaled)

In [49]:
trainsets = [train, train_imp, train_os, train_rus, train_os_sel, train_rus_sel, train_os_pca, train_rus_pca,
            train_os_sel_scaled, train_rus_sel_scaled]
testsets = [test, test_imp, test_imp, test_imp, test_os_sel, test_rus_sel, test_os_pca, test_rus_pca,
            test_os_sel_scaled, test_rus_sel_scaled]

In [50]:
result_names = ['raw', 'imputed', 'oversampled', 'undersampled', 'oversampled_selected', 'undersampled_selected', 
                'oversampled_pca', 'undersampled_pca', 'oversampled_selected_scaled', 'undersampled_selected_scaled']

In [59]:
for i in range(len(trainsets)):
    submission, fi, metrics = model(trainsets[i], testsets[i], n_estimators=10000, learning_rate = 0.05, 
                                reg_alpha = 0.1, reg_lambda = 0.1)
    submission.to_csv('results2/submission_' + result_names[i] + '.csv', index = False)
    fi.to_csv('results2/fi_' + result_names[i] + '.csv', index = False)
    metrics.to_csv('results2/metrics_' + result_names[i] + '.csv', index = False)

Training Data Shape:  (565372, 250)
Testing Data Shape:  (48744, 250)
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.979511	train's auc: 0.980071
[400]	valid's auc: 0.97998	train's auc: 0.98437
Early stopping, best iteration is:
[469]	valid's auc: 0.980025	train's auc: 0.985569
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.977087	train's auc: 0.98032
[400]	valid's auc: 0.977489	train's auc: 0.984451
Early stopping, best iteration is:
[488]	valid's auc: 0.97753	train's auc: 0.985943
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.978061	train's auc: 0.980188
[400]	valid's auc: 0.978589	train's auc: 0.984459
[600]	valid's auc: 0.978662	train's auc: 0.987592
Early stopping, best iteration is:
[639]	valid's auc: 0.9787	train's auc: 0.988108
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.978594	train's auc: 0.980237
[400]	valid's auc: 0.97906	tr

[600]	valid's auc: 0.911004	train's auc: 0.921319
[800]	valid's auc: 0.917225	train's auc: 0.92936
[1000]	valid's auc: 0.922553	train's auc: 0.936159
[1200]	valid's auc: 0.926889	train's auc: 0.941672
[1400]	valid's auc: 0.930861	train's auc: 0.94673
[1600]	valid's auc: 0.934233	train's auc: 0.95109
[1800]	valid's auc: 0.937183	train's auc: 0.95501
[2000]	valid's auc: 0.939827	train's auc: 0.958366
[2200]	valid's auc: 0.942365	train's auc: 0.961545
[2400]	valid's auc: 0.944737	train's auc: 0.964474
[2600]	valid's auc: 0.94693	train's auc: 0.967172
[2800]	valid's auc: 0.948937	train's auc: 0.969585
[3000]	valid's auc: 0.950673	train's auc: 0.971767
[3200]	valid's auc: 0.952274	train's auc: 0.973669
[3400]	valid's auc: 0.953765	train's auc: 0.975359
[3600]	valid's auc: 0.955276	train's auc: 0.976975
[3800]	valid's auc: 0.956659	train's auc: 0.978574
[4000]	valid's auc: 0.957935	train's auc: 0.979967
[4200]	valid's auc: 0.959265	train's auc: 0.981427
[4400]	valid's auc: 0.960464	train's a

[1000]	valid's auc: 0.922744	train's auc: 0.935544
[1200]	valid's auc: 0.927239	train's auc: 0.941281
[1400]	valid's auc: 0.931103	train's auc: 0.946209
[1600]	valid's auc: 0.93429	train's auc: 0.950475
[1800]	valid's auc: 0.937314	train's auc: 0.954398
[2000]	valid's auc: 0.940019	train's auc: 0.957823
[2200]	valid's auc: 0.942522	train's auc: 0.961003
[2400]	valid's auc: 0.944692	train's auc: 0.963837
[2600]	valid's auc: 0.9469	train's auc: 0.966469
[2800]	valid's auc: 0.948833	train's auc: 0.968888
[3000]	valid's auc: 0.950618	train's auc: 0.971071
[3200]	valid's auc: 0.952356	train's auc: 0.97317
[3400]	valid's auc: 0.953958	train's auc: 0.975008
[3600]	valid's auc: 0.955436	train's auc: 0.976742
[3800]	valid's auc: 0.956843	train's auc: 0.978373
[4000]	valid's auc: 0.9582	train's auc: 0.979838
[4200]	valid's auc: 0.959458	train's auc: 0.981208
[4400]	valid's auc: 0.960627	train's auc: 0.982459
[4600]	valid's auc: 0.961773	train's auc: 0.983632
[4800]	valid's auc: 0.962837	train's 

[1400]	valid's auc: 0.933834	train's auc: 0.947027
[1600]	valid's auc: 0.936959	train's auc: 0.951171
[1800]	valid's auc: 0.939937	train's auc: 0.955142
[2000]	valid's auc: 0.942512	train's auc: 0.958648
[2200]	valid's auc: 0.944704	train's auc: 0.96166
[2400]	valid's auc: 0.946852	train's auc: 0.964516
[2600]	valid's auc: 0.948759	train's auc: 0.967
[2800]	valid's auc: 0.95062	train's auc: 0.969423
[3000]	valid's auc: 0.95232	train's auc: 0.971447
[3200]	valid's auc: 0.953929	train's auc: 0.973521
[3400]	valid's auc: 0.955465	train's auc: 0.975352
[3600]	valid's auc: 0.956973	train's auc: 0.977105
[3800]	valid's auc: 0.958301	train's auc: 0.978653
[4000]	valid's auc: 0.959412	train's auc: 0.979989
[4200]	valid's auc: 0.960601	train's auc: 0.981302
[4400]	valid's auc: 0.961791	train's auc: 0.982629
[4600]	valid's auc: 0.962844	train's auc: 0.983768
[4800]	valid's auc: 0.963909	train's auc: 0.984856
[5000]	valid's auc: 0.964882	train's auc: 0.985816
[5200]	valid's auc: 0.965716	train's 

[200]	valid's auc: 0.624199	train's auc: 0.732056
Early stopping, best iteration is:
[137]	valid's auc: 0.625451	train's auc: 0.705626
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[68]	valid's auc: 0.617607	train's auc: 0.672786
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.606953	train's auc: 0.732045
Early stopping, best iteration is:
[128]	valid's auc: 0.608358	train's auc: 0.702488
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.63116	train's auc: 0.731104
Early stopping, best iteration is:
[266]	valid's auc: 0.631938	train's auc: 0.754708
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.621315	train's auc: 0.73385
Early stopping, best iteration is:
[272]	valid's auc: 0.622932	train's auc: 0.758955
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.614512	train's auc: 0.731293
Early stoppin