Goal of this notebook is to explore some of the secondary files in the dataset and see what value they might add

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import KFold
%matplotlib inline



In [3]:
#Read the credit and application data sets
credit_card_balance_data = pd.read_csv('../Data/credit_card_balance.csv')
application_data = pd.read_csv('../Data/application_train.csv')

In [4]:
#Merge the data
merged_data = application_data.join(credit_card_balance_data, how='outer', on='SK_ID_CURR', lsuffix='application', rsuffix='cc_balance')

In [5]:
merged_data.head(10)

Unnamed: 0,SK_ID_CURR,SK_ID_CURRapplication,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,100002,100002.0,1.0,Cash loans,M,N,Y,0.0,202500.0,406597.5,...,0.0,0.0,0.0,0,0.0,0.0,17.0,Active,0,0
1,100003,100003.0,0.0,Cash loans,F,N,N,0.0,270000.0,1293502.5,...,0.0,0.0,,0,,,0.0,Active,0,0
2,100004,100004.0,0.0,Revolving loans,M,Y,Y,0.0,67500.0,135000.0,...,0.0,0.0,,0,,,0.0,Active,0,0
3,100006,100006.0,0.0,Cash loans,F,N,Y,0.0,135000.0,312682.5,...,0.0,0.0,0.0,0,0.0,0.0,36.0,Active,0,0
4,100007,100007.0,0.0,Cash loans,M,N,Y,0.0,121500.0,513000.0,...,0.0,0.0,0.0,0,0.0,0.0,65.0,Active,0,0
5,100008,100008.0,0.0,Cash loans,M,N,Y,0.0,99000.0,490495.5,...,265.5,265.5,0.0,0,0.0,0.0,50.0,Active,0,0
6,100009,100009.0,0.0,Cash loans,F,Y,Y,1.0,171000.0,1560726.0,...,85591.44,85591.44,0.0,0,0.0,0.0,88.0,Active,0,0
7,100010,100010.0,0.0,Cash loans,M,Y,Y,0.0,360000.0,1530000.0,...,0.0,0.0,0.0,0,0.0,0.0,68.0,Active,0,0
8,100011,100011.0,0.0,Cash loans,F,N,Y,0.0,112500.0,1019610.0,...,0.0,0.0,0.0,0,0.0,0.0,28.0,Active,0,0
9,100012,100012.0,0.0,Revolving loans,M,N,Y,0.0,135000.0,405000.0,...,0.0,0.0,,0,,,0.0,Active,0,0


In [8]:
y, X = dmatrices('TARGET ~ NAME_CONTRACT_TYPE + CODE_GENDER+ \
                  FLAG_OWN_CAR+ FLAG_OWN_REALTY+ CNT_CHILDREN+\
                  AMT_INCOME_TOTAL+ AMT_CREDIT+ AMT_ANNUITY+\
                  AMT_GOODS_PRICE+ NAME_TYPE_SUITE+ NAME_INCOME_TYPE+\
                  NAME_EDUCATION_TYPE+NAME_FAMILY_STATUS+ NAME_HOUSING_TYPE+\
                  REGION_POPULATION_RELATIVE+ DAYS_BIRTH+ DAYS_EMPLOYED+\
                  DAYS_REGISTRATION+ DAYS_ID_PUBLISH+ OWN_CAR_AGE+ FLAG_MOBIL+\
                  OCCUPATION_TYPE+ REGION_RATING_CLIENT+\
                  REGION_RATING_CLIENT_W_CITY+ WEEKDAY_APPR_PROCESS_START+ \
                  ORGANIZATION_TYPE+ DAYS_LAST_PHONE_CHANGE + MONTHS_BALANCE + \
                  AMT_BALANCE + AMT_CREDIT_LIMIT_ACTUAL + AMT_DRAWINGS_ATM_CURRENT + \
                  AMT_DRAWINGS_CURRENT + AMT_DRAWINGS_OTHER_CURRENT + AMT_DRAWINGS_POS_CURRENT + \
                  AMT_INST_MIN_REGULARITY + AMT_PAYMENT_CURRENT + AMT_PAYMENT_TOTAL_CURRENT + \
                  AMT_RECEIVABLE_PRINCIPAL + AMT_TOTAL_RECEIVABLE + \
                  NAME_CONTRACT_STATUS',
                 merged_data, 
                 return_type = 'dataframe')

In [9]:
y = np.ravel(y)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 32588)

In [12]:
sm = SMOTE(random_state = 32588, ratio = 'minority', k_neighbors = 3)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [13]:
clf = RandomForestClassifier(n_estimators =20, random_state = 32588)
clf.fit(X_train_res, y_train_res)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=32588, verbose=0,
            warm_start=False)

In [14]:
clf.score(X_train_res, y_train_res)

0.9970376686124046

In [15]:
predicted = clf.predict(X_test)
probs = clf.predict_proba(X_test)

In [17]:
pd.crosstab(y_test, predicted, rownames = ['Difficulty or not'], colnames= ['Prediction'])

Prediction,0.0,1.0
Difficulty or not,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,11438,2
1.0,941,0
