In [148]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
try:
    # train_test_split lives in model_selection since scikit-learn 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # fallback for old installs; cross_validation was removed in 0.20
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

%matplotlib inline

In [149]:
# Read the training applications table straight from the zipped CSV.
train_path = '../input/application_train.csv.zip'
train_csv = pd.read_csv(train_path, compression='infer')

In [150]:
# Drop the handful of rows that belong to very rare categories
# (original notes: 5 'Maternity leave', 4 'Unknown' family status, 2 'XNA' gender).
rare_category_mask = (
    (train_csv['NAME_INCOME_TYPE'] == 'Maternity leave')
    | (train_csv['NAME_FAMILY_STATUS'] == 'Unknown')
    | (train_csv['CODE_GENDER'] == 'XNA')
)
train_csv = train_csv[~rare_category_mask]

In [151]:
# Read the three history tables, in the same order as the names they are bound to.
history_paths = (
    '../input/installments_payments.csv.zip',
    '../input/previous_application.csv.zip',
    '../input/bureau.csv.zip',
)
installments, previous, bureau = (
    pd.read_csv(path, compression='infer') for path in history_paths
)

In [152]:
# Read the three balance tables, in the same order as the names they are bound to.
balance_paths = (
    '../input/credit_card_balance.csv.zip',
    '../input/bureau_balance.csv.zip',
    '../input/POS_CASH_balance.csv.zip',
)
credit_card_balance, bureau_balance, cash_balance = (
    pd.read_csv(path, compression='infer') for path in balance_paths
)

In [153]:
def aggregate_df(df, value, func, labels, by='SK_ID_CURR', column=None):
    """Aggregate ``value`` per ``by`` key and return the result as a flat frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to aggregate.
    value : str
        Name of the column whose values are aggregated.
    func : str or callable
        Aggregation handed to ``DataFrame.pivot_table`` as ``aggfunc``.
    labels : dict
        ``{old_name: new_name}`` mapping applied to the columns of the result.
    by : str, default 'SK_ID_CURR'
        Column whose distinct values become the rows of the result.
    column : str, optional
        When given, one output column is produced per distinct value of this
        column; otherwise a single aggregated column is produced.

    Returns
    -------
    pandas.DataFrame
        Frame with a default integer index in which ``by`` is an ordinary
        column. Empty cells of the pivot are filled with 0.
    """
    # ``columns=None`` is pivot_table's own default, so one call covers both
    # the "split by column" and the "single aggregate" case.
    pivoted = df.pivot_table(index=by,
                             columns=column,
                             values=value,
                             aggfunc=func,
                             fill_value=0,
                             dropna=False)

    # Round-tripping through records turns the ``by`` index into a regular
    # column and leaves a plain RangeIndex.
    flat = pd.DataFrame(pivoted.to_records())

    # Return a renamed copy rather than mutating in place.
    return flat.rename(columns=labels)

def min_row(row):
    """Return the smallest of the row's three EXT_SOURCE_* values, ignoring NaNs."""
    ext_sources = [row[name] for name in ('EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3')]
    return np.nanmin(ext_sources)

In [154]:
# Build the modelling frame indexed by SK_ID_CURR.
# set_index() without inplace=True returns a new frame, so train_csv is left
# untouched and this cell can be re-run without a KeyError.
X = train_csv.set_index('SK_ID_CURR')
# converting the days_birth into years
# (disabled experiments; they read from X so row alignment stays on SK_ID_CURR)
#X['AGE'] = -1*X['DAYS_BIRTH']/365.0
#X['MONTHS_EMPLOYED'] = -1*X['DAYS_EMPLOYED']/30.0

#X['EXT_SCORE_MIN'] = X.apply(lambda x: min_row(x), axis=1)

In [155]:
# Every frame built below has one row per SK_ID_CURR, with SK_ID_CURR as an
# ordinary column (see aggregate_df).

# Number of bureau records per CREDIT_ACTIVE status.
labels = {
          'Active'   : 'CNT_ACTIVE_LOANS',
          'Bad debt' : 'CNT_BAD_DEBT',
          'Closed'   : 'CNT_CLOSED_DEBT',
          'Sold'     : 'CNT_SOLD_DEBT'
         }

cnt_loans = aggregate_df(df=bureau, value='CREDIT_TYPE', column='CREDIT_ACTIVE', func='count', labels=labels)

# Summed AMT_CREDIT_SUM of bureau records per CREDIT_ACTIVE status.
labels = {
          'Active'   : 'AMT_ACTIVE_LOANS',
          'Bad debt' : 'AMT_BAD_DEBT',
          'Closed'   : 'AMT_CLOSED_DEBT',
          'Sold'     : 'AMT_SOLD_DEBT'
         }

amt_loans = aggregate_df(df=bureau, value='AMT_CREDIT_SUM', column='CREDIT_ACTIVE', func='sum', labels=labels)

# Largest difference between the payment entry day and the instalment day.
labels = {
          'OVERDUE'   : 'MAX_OVERDUE'
         }

installments['OVERDUE'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']
max_overdue = aggregate_df(df=installments, value='OVERDUE', func='max', labels=labels)

# Sum of CNT_INSTALMENT_FUTURE over all POS/cash balance rows.
labels = {
          'CNT_INSTALMENT_FUTURE'   : 'CNT_FUTURE_INSTALLMENTS'
         }

cnt_future_installments = aggregate_df(df=cash_balance, value='CNT_INSTALMENT_FUTURE', func='sum', labels=labels)

# Number of previous applications per NAME_CONTRACT_STATUS.
labels = {
          'Approved'     : 'CNT_LOANS_APPROVED',
          'Canceled'     : 'CNT_LOANS_CANCELLED',
          'Refused'      : 'CNT_LOANS_REFUSED',
          'Unused offer' : 'CNT_UNUSED_OFFERS'
         }

cnt_prev_loans = aggregate_df(df=previous,
                              value='NAME_CONTRACT_TYPE', column='NAME_CONTRACT_STATUS', func='count', labels=labels)

# Total CNT_DRAWINGS_CURRENT over all credit card balance rows.
labels = {
          'CNT_DRAWINGS_CURRENT'   : 'CNT_CC_DRAWINGS'
         }

cc_draw_cnt = aggregate_df(df=credit_card_balance,
                          value='CNT_DRAWINGS_CURRENT', func='sum', labels=labels)

# Mean AMT_PAYMENT_TOTAL_CURRENT over the rows with MONTHS_BALANCE == -1.
# Filtering first avoids pivoting every month into a wide table and avoids
# relying on the stringified MultiIndex label that to_records() generates.
cc_balance = (credit_card_balance
              .loc[credit_card_balance['MONTHS_BALANCE'] == -1]
              .groupby('SK_ID_CURR')['AMT_PAYMENT_TOTAL_CURRENT']
              .mean()
              .rename('CREDIT_CARD_BALANCE')
              .reset_index())

# Attach every feature frame to X by SK_ID_CURR. X is indexed by SK_ID_CURR
# while the feature frames carry it as a column, so each frame is re-indexed
# on the key before a left join; concatenating on the raw indexes would match
# row positions against ids and attach features to the wrong rows.
for features in (cnt_loans, amt_loans, max_overdue, cnt_future_installments,
                 cnt_prev_loans, cc_draw_cnt, cc_balance):
    X = X.join(features.set_index('SK_ID_CURR'), how='left')

# The following cell drops an 'SK_ID_CURR' column, so expose the id as a
# column in addition to the index.
X['SK_ID_CURR'] = X.index

# Release the large intermediates; the loop variable would otherwise keep the
# last feature frame alive.
del features
del cnt_loans, amt_loans, max_overdue, cnt_future_installments, cnt_prev_loans, cc_balance, cc_draw_cnt
del previous, cash_balance, installments, bureau, credit_card_balance

In [156]:
# Separate the label from the features, then one-hot encode the features.
non_feature_columns = ['TARGET', 'SK_ID_CURR']
y = X['TARGET']
# dummy_na=True adds an explicit NaN indicator column for each encoded variable.
X = pd.get_dummies(X.drop(labels=non_feature_columns, axis=1), dummy_na=True)

In [157]:
# Hold out 33% of the rows for evaluation. The fixed random_state makes the
# split reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [158]:
# Baseline gradient boosting model; random_state is fixed so repeated fits on
# the same data give the same model.
gboost_baseline = GradientBoostingClassifier(n_estimators=200, random_state=42)

In [None]:
# Fit on the training split with missing values replaced by 0.
gboost_baseline.fit(
    X=X_train.fillna(value=0),
    y=y_train,
)

In [None]:
# Score the held-out split (missing values replaced by 0, as for training).
X_test_filled = X_test.fillna(0)
predictions = gboost_baseline.predict(X_test_filled)      # hard class labels
pred_prob = gboost_baseline.predict_proba(X_test_filled)  # per-class probabilities

In [None]:
# Text summary of the classification metrics on the held-out split.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

In [None]:
# Confusion matrix of true vs. predicted labels on the held-out split.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
print(conf_matrix)

In [None]:
# ROC AUC from the second predict_proba column (presumably TARGET == 1 -- confirm via gboost_baseline.classes_).
roc_auc_score(y_true=y_test, y_score=pred_prob[:, 1])

In [None]:
# Points of the ROC curve, computed from the second predict_proba column.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=pred_prob[:, 1])

In [None]:
# Plot the ROC curve, with the AUC shown in the legend.
roc_fig, roc_ax = plt.subplots(figsize=(16, 8))
auc_label = 'AUC = %0.4f' % roc_auc_score(y_test, pred_prob[:, 1])
roc_ax.plot(fpr, tpr, label=auc_label)
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.legend(loc='upper left')

In [None]:
# Feature importances of the fitted model, labelled with the feature names.
summary = pd.Series(gboost_baseline.feature_importances_, index=X.columns)

In [None]:
# Order the importances from largest to smallest.
summary = summary.sort_values(ascending=False)

In [None]:
# Horizontal bar chart of the importances; the tall figure leaves room for one bar per feature.
importance_fig, importance_ax = plt.subplots(figsize=(10, 55))
sns.barplot(x=summary.values, y=summary.index.values, ax=importance_ax)

In [None]:
# Histogram of EXT_SOURCE_3, one panel per TARGET value.
# FacetGrid creates its own figure, so that figure is sized directly; a
# separate plt.figure(figsize=...) call would only open an extra empty figure.
g = sns.FacetGrid(train_csv, col='TARGET')
g.fig.set_size_inches(20, 10)
g.map(plt.hist, "EXT_SOURCE_3", color="steelblue")

#sns.distplot(train_csv['EXT_SOURCE_2'].fillna(0), kde=False)