In [41]:
# Import libraries
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

In [51]:
# Hold-out mode: read the labelled training file and set 20% of it aside as a
# local test set. To produce a real submission instead, swap the commented
# lines: read application_train.csv straight into app_train and
# application_test.csv into app_test.
#app_train = pd.read_csv('../Input/application_train.csv')
data = pd.read_csv('../Input/application_train.csv')
app_train, app_test = train_test_split(
    data,
    test_size=0.2,
    random_state=12,  # fixed seed so the split is the same on every run
)
# Testing data
#app_test = pd.read_csv('../Input/application_test.csv')

In [52]:
# Label encoding: object-typed columns with at most two distinct values are
# replaced by integer codes. The encoder is fitted on the training column only
# and the same mapping is then applied to the test column.

le = LabelEncoder()
le_count = 0

for col in app_train.columns:
    column_values = app_train[col]

    # Only object (string) columns are candidates
    if column_values.dtype != 'object':
        continue

    # Columns with more than two distinct values are left untouched here
    if len(column_values.unique()) > 2:
        continue

    # Learn the mapping from the training data ...
    le.fit(column_values)
    # ... and apply it to both frames
    app_train[col] = le.transform(app_train[col])
    app_test[col] = le.transform(app_test[col])

    # One more column encoded
    le_count += 1

print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [53]:
# One-hot encode the categorical variables.
# Each frame is expanded on its own, so the resulting column counts need not
# match (compare the two shapes printed below).
app_train, app_test = pd.get_dummies(app_train), pd.get_dummies(app_test)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (246008, 243)
Testing Features shape:  (61503, 240)


In [54]:
# Keep the training labels: the inner join below drops TARGET from app_train
# once the column is no longer present in app_test.
train_labels = app_train['TARGET']
#comment out next 2 lines when creating submission
test_labels = app_test['TARGET']
app_test = app_test.drop(columns='TARGET')

# Inner join along the column axis: only columns found in both frames survive
app_train, app_test = app_train.align(app_test, join='inner', axis=1)

# Restore the target on the training frame
app_train['TARGET'] = train_labels

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (246008, 240)
Testing Features shape:  (61503, 239)


In [55]:
# Flag rows whose DAYS_EMPLOYED holds the anomalous value 365243, then blank
# that value out so it is handled like any other missing entry downstream.
#
# The replacement is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `frame['DAYS_EMPLOYED']`: an in-place call on
# a column selection is a chained mutation, and under pandas Copy-on-Write it
# modifies a temporary object and leaves the frame unchanged.

# Training data: flag first (the flag needs the original values), then replace
app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

# Testing data: same two steps
app_test['DAYS_EMPLOYED_ANOM'] = app_test['DAYS_EMPLOYED'] == 365243
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})

In [56]:
from sklearn.preprocessing import MinMaxScaler

# The imputer class moved between scikit-learn releases: current versions
# provide sklearn.impute.SimpleImputer, older ones sklearn.preprocessing.Imputer.
# Try the current location first and fall back so the cell runs on either.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Drop the target from the training data
if 'TARGET' in app_train:
    train = app_train.drop(columns = ['TARGET'])
else:
    train = app_train.copy()

# Feature names, in the column order the imputer and scaler are fitted on
features = list(train.columns)

# Median imputation of missing values
imputer = Imputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0,1))

# Fit on the training data only, so no test statistics leak into the transform
imputer.fit(train)

# Transform both the training and testing data.
# The test columns are selected by name so they are guaranteed to be in the
# same order as the training columns the imputer was fitted on.
train = imputer.transform(train)
test = imputer.transform(app_test[features])

# Repeat with the scaler
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (246008, 240)
Testing data shape:  (61503, 240)


In [57]:
import xgboost as xgb

In [70]:
# Wrap the arrays for XGBoost; only the training matrix carries labels
dtrain = xgb.DMatrix(train, label=train_labels)
dtest = xgb.DMatrix(test)

# Booster settings
# NOTE(review): 'silent' may be superseded by 'verbosity' in newer XGBoost
# releases — confirm against the installed version.
param = {
    'max_depth': 6,
    'eta': .1,
    'silent': 1,
    'objective': 'binary:logistic',
}
num_round = 100  # number of boosting rounds

bst = xgb.train(param, dtrain, num_boost_round=num_round)

# Predictions for the test matrix
preds = bst.predict(dtest)

In [71]:
# ROC AUC of the predictions against the held-out labels
from sklearn.metrics import roc_auc_score

score = roc_auc_score(y_true = test_labels, y_score = preds)
print(score)

0.760516029142


In [50]:
# Make a submission dataframe: the applicant id next to its prediction.
# The id column is copied explicitly so that adding TARGET writes to an
# independent frame rather than to a slice of app_test.
# Note: this is only a real submission when app_test was loaded from
# application_test.csv; in hold-out mode it contains the hold-out rows.
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = preds

# Save to a csv file, creating the output directory first if it is missing
os.makedirs('../Submissions', exist_ok = True)
submit.to_csv('../Submissions/XGBoost.csv', index = False)