In [61]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# import models and scoring functions
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

['application_test.csv', '.DS_Store', 'POS_CASH_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'application_train.csv', 'bureau.csv', 'previous_application.csv', 'bureau_balance.csv']


# Read in Training Data

In [36]:
# Training data
train = pd.read_csv('../input/application_train.csv')
print('Training data shape: ', train.shape)
train.head()

Training data shape:  (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Read in Testing Data

In [37]:
# Testing data features
test = pd.read_csv('../input/application_test.csv')
print('Testing data shape: ', test.shape)
test.head()

Testing data shape:  (48744, 121)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


## One-Hot Encoding

In [38]:
# one-hot encoding of categorical variables
train = pd.get_dummies(train)
test = pd.get_dummies(test)

print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (307511, 246)
Testing Features shape:  (48744, 242)


## Handle outliers and awkward data

In [39]:
train['DAYS_BIRTH'] = abs(train['DAYS_BIRTH'])
test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace = True)

## Feature Engineering: Create New Features

In [40]:
# CREDIT_INCOME_PERCENT: the percentage of the credit amount relative to a client's income
train['CREDIT_INCOME_PERCENT'] = train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL']

# ANNUITY_INCOME_PERCENT: the percentage of the loan annuity relative to a client's income
train['ANNUITY_INCOME_PERCENT'] = train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']

# CREDIT_TERM: the length of the payment in months (since the annuity is the monthly amount due
train['CREDIT_TERM'] = train['AMT_ANNUITY'] / train['AMT_CREDIT']

# DAYS_EMPLOYED_PERCENT: the percentage of the days employed relative to the client's age
train['DAYS_EMPLOYED_PERCENT'] = train['DAYS_EMPLOYED'] / train['DAYS_BIRTH']

## Align Training and Testing Data

In [41]:
train_labels = train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
train, test = train.align(test, join = 'inner', axis = 1)

# Add the target back in
train['TARGET'] = train_labels

print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (307511, 243)
Testing Features shape:  (48744, 242)


## Missing Values
Remove columns from training and testing datasets with greater than 80 percent of values missing.

In [42]:
# Function to calculate missing values by column# Funct 
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [18]:
# Missing values statistics
missing_train = missing_values_table(train)
missing_train.head(10)

Your selected dataframe has 243 columns.
There are 61 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
COMMONAREA_MODE,214865,69.9
COMMONAREA_MEDI,214865,69.9
COMMONAREA_AVG,214865,69.9
NONLIVINGAPARTMENTS_MODE,213514,69.4
NONLIVINGAPARTMENTS_AVG,213514,69.4
NONLIVINGAPARTMENTS_MEDI,213514,69.4
LIVINGAPARTMENTS_MEDI,210199,68.4
LIVINGAPARTMENTS_AVG,210199,68.4
LIVINGAPARTMENTS_MODE,210199,68.4
FLOORSMIN_MEDI,208642,67.8


In [43]:
missing_train_vars = list(missing_train.index[missing_train['% of Total Values'] > 80])
len(missing_train_vars)

0

In [44]:
missing_test = missing_values_table(test)
missing_test.head(10)

Your selected dataframe has 242 columns.
There are 59 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
COMMONAREA_MODE,33495,68.7
COMMONAREA_AVG,33495,68.7
COMMONAREA_MEDI,33495,68.7
NONLIVINGAPARTMENTS_MODE,33347,68.4
NONLIVINGAPARTMENTS_MEDI,33347,68.4
NONLIVINGAPARTMENTS_AVG,33347,68.4
LIVINGAPARTMENTS_AVG,32780,67.2
LIVINGAPARTMENTS_MODE,32780,67.2
LIVINGAPARTMENTS_MEDI,32780,67.2
FLOORSMIN_MODE,32466,66.6


In [45]:
missing_test_vars = list(missing_test.index[missing_test['% of Total Values'] > 80])
len(missing_test_vars)

0

In [46]:
missing_columns = list(set(missing_test_vars + missing_train_vars))
print('There are %d columns with more than 80%% missing in either the training or testing data.' % len(missing_columns))

There are 0 columns with more than 80% missing in either the training or testing data.


In [47]:
# Drop the missing columns
train = train.drop(columns = missing_columns)
test = test.drop(columns = missing_columns)

## Correlations
Examine the correlations of the variables with the target. We can see in any of the variables we created have a greater correlation than those already present in the training data.

In [48]:
# Calculate all correlations in dataframe
corrs = train.corr()

corrs = corrs.sort_values('TARGET', ascending = False)

In [49]:
# Ten most positive correlations
pd.DataFrame(corrs['TARGET'].head(10))

Unnamed: 0,TARGET
TARGET,1.0
REGION_RATING_CLIENT_W_CITY,0.060893
REGION_RATING_CLIENT,0.058899
NAME_INCOME_TYPE_Working,0.057481
DAYS_LAST_PHONE_CHANGE,0.055218
CODE_GENDER_M,0.054713
DAYS_ID_PUBLISH,0.051457
REG_CITY_NOT_WORK_CITY,0.050994
NAME_EDUCATION_TYPE_Secondary / secondary special,0.049824
FLAG_EMP_PHONE,0.045982


In [50]:
# Ten most negative correlations
pd.DataFrame(corrs['TARGET'].dropna().tail(10))

Unnamed: 0,TARGET
FLOORSMAX_AVG,-0.044003
DAYS_EMPLOYED,-0.044932
ORGANIZATION_TYPE_XNA,-0.045987
NAME_INCOME_TYPE_Pensioner,-0.046209
CODE_GENDER_F,-0.054704
NAME_EDUCATION_TYPE_Higher education,-0.056593
DAYS_BIRTH,-0.078239
EXT_SOURCE_1,-0.155317
EXT_SOURCE_2,-0.160472
EXT_SOURCE_3,-0.178919


## Colinear Variables
Calculate  the correlation of each variable with every other variable. This will allow us to see if there are highly collinear variables that should perhaps be removed from the data. Look for any variables that have a greather than 0.8 correlation with other variables.

In [51]:
# Set the threshold
threshold = 0.8

# Empty dictionary to hold correlated variables
above_threshold_vars = {}

# For each column, record the variables that are above the threshold
for col in corrs:
    above_threshold_vars[col] = list(corrs.index[corrs[col] > threshold])
    
# Track columns to remove and columns already examined
cols_to_remove = []
cols_seen = []
cols_to_remove_pair = []

# Iterate through columns and correlated columns
for key, value in above_threshold_vars.items():
    # Keep track of columns already examined
    cols_seen.append(key)
    for x in value:
        if x == key:
            next
        else:
            # Only want to remove one in a pair
            if x not in cols_seen:
                cols_to_remove.append(x)
                cols_to_remove_pair.append(key)
            
cols_to_remove = list(set(cols_to_remove))
print('Number of columns to remove: ', len(cols_to_remove))

Number of columns to remove:  42


In [52]:
# remove columns from training and testing sets
train = train.drop(columns = cols_to_remove)
test = test.drop(columns = cols_to_remove)

print('Training Corrs Removed Shape: ', train.shape)
print('Testing Corrs Removed Shape: ', test.shape)

Training Corrs Removed Shape:  (307511, 201)
Testing Corrs Removed Shape:  (48744, 200)


## Impute Missing Values and Scale Features

In [53]:
from sklearn.preprocessing import MinMaxScaler, Imputer

# Extract the labels for training
labels = train['TARGET']

# Remove the ids and target
features = train.drop(columns = ['SK_ID_CURR', 'TARGET'])
test_features = test.drop(columns = ['SK_ID_CURR'])

print('Training Data Shape: ', features.shape)
print('Testing Data Shape: ', test_features.shape)

# Median imputation of missing values
imputer = Imputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data
imputer.fit(features)

# Transform both training and testing data
features = imputer.transform(features)
test_features = imputer.transform(test_features)

# Repeat with the scaler
scaler.fit(features)
features = scaler.transform(features)
test_features = scaler.transform(test_features)


# Convert to np arrays
X_train = np.array(features)
X_test = np.array(test_features)

Training Data Shape:  (307511, 199)
Testing Data Shape:  (48744, 199)


# Build Models

## Training and Cross-Validation

### Logistic Regression: Initial Attempt (Benchmark)

In [55]:
name = 'Logistic Regression'

# initialize model
lr_model = LogisticRegression(C=0.0001)

# run basic model
kfold = KFold(n_splits=2, random_state=42)
cv_results = cross_val_score(lr_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message) # results

Decision Tree Classifier (CART): 0.667460 (0.002645)


### Logistic Regression: Improved with Parameter Tuning

In [56]:
param_grid = [
    {'C': [0.0001,0.001,0.01,0.1,1]}  
]

# initialize model
lr_model = LogisticRegression()

# find best hyper params
grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, labels)

# reset to improved version after grid search
lr_model = grid_search.best_estimator_

kfold = KFold(n_splits=5, random_state=42)
cv_results = cross_val_score(lr_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message) # results
print() # line break
print(lr_model) # best model
print()

Decision Tree Classifier (CART): 0.742304 (0.003446)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



### Decision Tree Classifier (CART): Initial Attempt

In [58]:
name = 'Decision Tree Classifier (CART)'

# initialize model
cart_model = DecisionTreeClassifier()

# run basic model
kfold = KFold(n_splits=2, random_state=42)
cv_results = cross_val_score(cart_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message)

Decision Tree Classifier (CART): 0.535586 (0.001748)


### Decision Tree Classifier (CART): Improved with Parameter Tuning

In [59]:
param_grid = [
    {'criterion':['gini','entropy'],'max_depth':[4,10,16]}
]

# initialize model
cart_model = DecisionTreeClassifier()

# find best hyper params
grid_search = GridSearchCV(cart_model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, labels)

# reset to improved version after grid search
cart_model = grid_search.best_estimator_

# run cross validation and get score
kfold = KFold(n_splits=5, random_state=42)
cv_results = cross_val_score(cart_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message)
print() # line break
print(cart_model) # best model
print()

Decision Tree Classifier (CART): 0.710788 (0.002829)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')



### Random Forest Classifier: Initial Attempt

In [62]:
name = 'Random Forest Classifier'

# initialize model
rf_model = RandomForestClassifier()

# run cross validation and get score
kfold = KFold(n_splits=2, random_state=42)
cv_results = cross_val_score(rf_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message)

Random Forest Classifier: 0.627652 (0.000847)


### Random Forest Classifier: Improved with Parameter Tuning

In [65]:
param_grid = [
    {'n_estimators' : [3, 10],'max_features':[1,3,6,10]},
]

# initialize model
rf_model = RandomForestClassifier()

# find best hyper params
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, labels)

# reset to improved version after grid search
rf_model = grid_search.best_estimator_

# run cross validation and get score
kfold = KFold(n_splits=5, random_state=42)
cv_results = cross_val_score(rf_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message)
print() # line break
print(rf_model) # best model
print()

Random Forest Classifier: 0.624488 (0.002187)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



### Gaussian Naive Bayes: Single Attempt (no parameter tuning)

In [66]:
name = 'Gaussian Naive Bayes'
nb_model = GaussianNB()
kfold = KFold(n_splits=5, random_state=42)
cv_results = cross_val_score(nb_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message)
print() # line break
print(nb_model) # best model
print()

Gaussian Naive Bayes: 0.597065 (0.010164)

GaussianNB(priors=None)



### AdaBoost Classifier: Initial Attempt

In [67]:
name = 'AdaBoost Classifier'

# initialize model
ab_model = AdaBoostClassifier()

# run cross validation and get score
kfold = KFold(n_splits=2, random_state=42)
cv_results = cross_val_score(ab_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message)

AdaBoost Classifier: 0.741354 (0.002857)


### AdaBoost Classifier: Improved with Parameter Tuning

In [71]:
param_grid = {'n_estimators': [50] }

# initialize model
ab_model = AdaBoostClassifier()

# find best hyper params
grid_search = GridSearchCV(ab_model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, labels)

# reset to improved version after grid search
ab_model = grid_search.best_estimator_

# run cross validation and get score
kfold = KFold(n_splits=5, random_state=42)
cv_results = cross_val_score(ab_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message)
print() # line break
print(ab_model) # best model
print()

KeyboardInterrupt: 

### Ensemble Learning: Voting Classifier (Initial Attempt)

In [35]:
name = 'Voting Classifier'

voting_model = VotingClassifier(estimators=[('lr', lr_model), ('rfc', rf_model), ('cart', cart_model), ('nm', nb_model), ('abc', ab_model)], 
                          voting='soft')

kfold = KFold(n_splits=2, random_state=42)
cv_results = cross_val_score(voting_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message)

ABC: 0.688264 (0.003476)


### Ensemble Learning: Voting Classifier (Improved with Parameter Tuning)

In [None]:
params = {'weights':[[1,2,3,4,5],[5,4,3,2,1]}

voting_model = VotingClassifier(estimators=[('lr', lr_model), ('rfc', rf_model), ('cart', cart_model), ('nm', nb_model), ('abc', ab_model)], 
                          voting='soft')

# find best hyper params
grid_search = GridSearchCV(voting_model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, labels)

# reset to improved version after grid search
voting_model = grid_search.best_estimator_

kfold = KFold(n_splits=5, random_state=42)
cv_results = cross_val_score(voting_model, X_train, labels, cv=kfold, scoring='roc_auc')
message = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(message)

## Evaluation on Test Set

### Logistic Regression

In [None]:
predictions = lr_model.predict_proba(X_test)
score = roc_auc_score(labels, predictions)
message = "Logistic Regression: " + str(score)
print(message)

### Decision Tree Classifier (CART)

In [None]:
predictions = cart_model.predict_proba(X_test)
score = roc_auc_score(labels, predictions)
message = "Decision Tree Classifier (CART): " + str(score)
print(message)

### Random Forest Classifier

In [None]:
predictions = rf_model.predict_proba(X_test)
score = roc_auc_score(labels, predictions)
message = "Random Forest Classifier: " + str(score)
print(message)

### Gaussian Naive Bayes

In [None]:
predictions = nb_model.predict_proba(X_test)
score = roc_auc_score(labels, predictions)
message = "Gaussian Naive Bayes: " + str(score)
print(message)

### AdaBoost Classifier

In [None]:
predictions = ab_model.predict_proba(X_test)
score = roc_auc_score(labels, predictions)
message = "AdaBoost: " + str(score)
print(message)

### Voting Model (Ensemble)

In [None]:
predictions = voting_model.predict_proba(X_test)
score = roc_auc_score(labels, predictions)
message = "AdaBoost: " + str(score)
print(message)