In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22
# (superseded by sklearn.impute.SimpleImputer); confirm against the installed version.
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [2]:
# Directory holding the input CSVs. Cells that build file names with `path + '<name>'`
# rely on the trailing slash.
path = 'Data/'

In [3]:
def label_df(train_df, test_df):
    """Label-encode object columns that have at most 2 unique values.

    Each qualifying column is fitted on ``train_df`` only and then replaced,
    in place, by its integer codes in ``train_df`` and (when the column is
    present there) in ``test_df``. The number of encoded columns is printed.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame that decides which columns qualify and supplies the categories.
        Modified in place.
    test_df : pandas.DataFrame
        Frame encoded with the categories learned from ``train_df``.
        Modified in place.

    Returns
    -------
    None

    Notes
    -----
    ``LabelEncoder.transform`` raises if ``test_df`` holds a value that was
    not seen in the corresponding ``train_df`` column.
    """
    le_count = 0

    for col in train_df:
        # Only string-like (object) columns are candidates
        if train_df[col].dtype != 'object':
            continue
        # Columns with more than 2 categories are left untouched
        if len(train_df[col].unique()) > 2:
            continue

        # A fresh encoder per column keeps the fitted state from leaking
        # between columns
        le = LabelEncoder()
        le.fit(train_df[col])

        train_df[col] = le.transform(train_df[col])
        # The test frame may lack columns that exist only in training data;
        # skip those instead of failing with a KeyError
        if col in test_df:
            test_df[col] = le.transform(test_df[col])

        le_count += 1

    print('%d columns were label encoded.' % le_count)

In [4]:
# Load the training applications table.
# NOTE(review): the file is named 'test.csv', yet the recorded output shows 307511 rows
# and a TARGET column -- confirm this is the intended training file.
app_train = pd.read_csv(path + 'test.csv')
print("Training data shape: ", app_train.shape)
# NOTE(review): the preview shows an 'Unnamed: 0' column holding 0, 1, 2, ... --
# presumably a row index saved with the CSV; confirm it should not be used as a feature.
app_train.head()

Training data shape:  (307511, 241)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET,DAYS_EMPLOYED_ANOM
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0,0,0,0,1,0,1,0,1,False
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0,0,0,0,0,0,1,0,0,False
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0,0,0,0,0,0,0,0,0,False
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0,0,0,0,0,0,0,0,0,False
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0,0,0,0,0,0,0,0,0,False


In [5]:
# Load the applications table used for testing (the recorded output shows 240 columns).
app_test = pd.read_csv(path + 'app_test_smooth.csv')
print("Test data shape: ", app_test.shape)
# Preview the first rows; like the training frame it carries an 'Unnamed: 0' column.
app_test.head()

Test data shape:  (48744, 240)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,DAYS_EMPLOYED_ANOM
0,100001,0,0,1,0,135000.0,568800.0,20560.5,450000.0,0.01885,...,0,0,0,0,0,1,0,1,0,False
1,100005,0,0,1,0,99000.0,222768.0,17370.0,180000.0,0.035792,...,0,0,0,0,0,0,0,0,0,False
2,100013,0,1,1,0,202500.0,663264.0,69777.0,630000.0,0.019101,...,0,0,0,0,0,0,0,0,0,False
3,100028,0,0,1,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,...,0,0,0,0,1,0,0,1,0,False
4,100038,0,1,0,1,180000.0,625500.0,32067.0,625500.0,0.010032,...,0,0,0,0,0,0,0,0,0,False


In [6]:
# The inner column alignment below keeps only columns shared by both frames,
# so hold on to the labels before it can drop them.
train_labels = app_train['TARGET']

# Reduce both frames to the columns they have in common
app_train, app_test = app_train.align(app_test, axis = 1, join = 'inner')

# Re-attach the labels as the last column of the training frame
app_train = app_train.assign(TARGET = train_labels)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (307511, 241)
Testing Features shape:  (48744, 240)


### Merge with trimmed DFs

In [7]:
def load_csv(path, csvf):
    """Read a CSV file from a directory and report the shape of the result.

    Parameters
    ----------
    path : str or os.PathLike
        Directory that contains the file. A trailing separator is optional.
    csvf : str
        File name of the CSV inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, exactly as ``pd.read_csv`` returns it.
    """
    # os.path.join inserts the separator only when it is missing, so both
    # 'Data/' and 'Data' resolve to the same file
    df = pd.read_csv(os.path.join(path, csvf))
    print("Shape of loaded df: ", df.shape)
    return df

In [8]:
def merge_with_trimmed(df, df_trim):
    """Inner-join ``df_trim`` onto ``df`` using the ``SK_ID_CURR`` key.

    Parameters
    ----------
    df : pandas.DataFrame
        Left frame; must contain ``SK_ID_CURR``.
    df_trim : pandas.DataFrame
        Right frame; must contain ``SK_ID_CURR``. Not modified.

    Returns
    -------
    pandas.DataFrame
        One row per matching (``df`` row, ``df_trim`` row) pair. Rows of
        ``df`` whose key does not occur in ``df_trim`` are dropped, and a key
        that occurs k times in ``df_trim`` yields k output rows for every
        matching row of ``df``.

    Notes
    -----
    Columns of ``df_trim`` named ``'Unnamed: ...'`` (presumably a row index
    written into the CSV) are dropped before merging. Otherwise each merge
    adds another copy, and on repeated merges the suffixed names
    ``'Unnamed: 0_x'`` / ``'Unnamed: 0_y'`` collide with ones created by an
    earlier merge, which recent pandas versions reject.
    """
    # Discard leftover CSV index columns from the right-hand frame only
    index_artifacts = [c for c in df_trim.columns if str(c).startswith('Unnamed:')]
    df_trim = df_trim.drop(columns=index_artifacts)

    # how='inner' already restricts the result to keys present in both frames,
    # so no separate pre-filter of df_trim is needed
    df = df.merge(df_trim, on='SK_ID_CURR', how='inner')
    print("The shape of the df merged with the trimmed df is: ", df.shape)
    return df

In [9]:
# Load the trimmed bureau table.
# NOTE(review): the preview shows SK_ID_CURR 215354 on several rows, so this table
# holds multiple rows per applicant.
bureau_trim = load_csv(path, 'bureau_trim.csv')
# Preview the first rows
bureau_trim.head()

Shape of loaded df:  (1716428, 6)


Unnamed: 0,SK_ID_CURR,DAYS_CREDIT,CREDIT_ACTIVE,DAYS_CREDIT_UPDATE,DAYS_ENDDATE_FACT,CREDIT_TYPE
0,215354,-497,Closed,-131,-153.0,Consumer credit
1,215354,-208,Active,-20,,Credit card
2,215354,-203,Active,-16,,Consumer credit
3,215354,-203,Active,-16,,Credit card
4,215354,-629,Active,-21,,Consumer credit


In [10]:
# Join the bureau rows onto the training applications (inner join on SK_ID_CURR).
# NOTE(review): the recorded outputs show the row count growing from 307511 to 1465325,
# so each application is repeated once per matching bureau row -- confirm this is
# intended rather than aggregating bureau_trim per SK_ID_CURR first.
# Rebinding app_train also means re-running this cell merges a second time.
app_train = merge_with_trimmed(app_train, bureau_trim)

The shape of the df merged with the trimmed df is:  (1465325, 246)


In [11]:
# Join the bureau rows onto the test applications (inner join on SK_ID_CURR).
# NOTE(review): an inner join drops applicants without any bureau row and repeats the
# others (48744 -> 251103 rows in the recorded outputs); confirm no test applicant
# that needs a prediction is lost here.
app_test = merge_with_trimmed(app_test, bureau_trim)

The shape of the df merged with the trimmed df is:  (251103, 245)


In [12]:
# Load the trimmed credit-card balance table (keyed by SK_ID_CURR).
credit_card_trim = load_csv(path, 'credit_card_trim.csv')
# Preview the first rows
credit_card_trim.head()

Shape of loaded df:  (3840312, 10)


Unnamed: 0,SK_ID_CURR,AMT_BALANCE,AMT_TOTAL_RECEIVABLE,AMT_RECIVABLE,AMT_RECEIVABLE_PRINCIPAL,AMT_INST_MIN_REGULARITY,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,MONTHS_BALANCE,CNT_DRAWINGS_POS_CURRENT
0,378907,56.97,0.0,0.0,0.0,1700.325,0.0,1,-6,1.0
1,363914,63975.555,64875.555,64875.555,60175.08,2250.0,1.0,1,-1,0.0
2,371185,31815.225,31460.085,31460.085,26926.425,2250.0,0.0,0,-7,0.0
3,337855,236572.11,233048.97,233048.97,224949.285,11795.76,1.0,1,-4,0.0
4,126868,453919.455,453919.455,453919.455,443044.395,22924.89,0.0,1,-1,1.0


In [13]:
# Join the credit-card rows onto the (already bureau-merged) training frame.
# NOTE(review): the recorded outputs show 1465325 -> 18655456 rows, i.e. every
# bureau row is paired with every credit-card row of the same applicant; confirm
# this cross-product is intended and fits in memory.
app_train = merge_with_trimmed(app_train, credit_card_trim)

The shape of the df merged with the trimmed df is:  (18655456, 255)


In [14]:
# Join the credit-card rows onto the (already bureau-merged) test frame.
# NOTE(review): the recorded outputs show 251103 -> 3726412 rows; as with the training
# frame, rows are multiplied per applicant and applicants without a match are dropped.
app_test = merge_with_trimmed(app_test, credit_card_trim)

The shape of the df merged with the trimmed df is:  (3726412, 254)


In [16]:
# Load the trimmed previous-applications table (keyed by SK_ID_CURR).
prev_app_trim = load_csv(path, 'prev_app_trim.csv')
# Preview the first rows
prev_app_trim.head()

Shape of loaded df:  (1670214, 8)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_STATUS,NAME_PRODUCT_TYPE,DAYS_DECISION,CODE_REJECT_REASON,CNT_PAYMENT,NAME_PORTFOLIO,DAYS_FIRST_DRAWING
0,271877,Approved,XNA,-73,XAP,12.0,POS,365243.0
1,108129,Approved,x-sell,-164,XAP,36.0,Cash,365243.0
2,122040,Approved,x-sell,-301,XAP,12.0,Cash,365243.0
3,176158,Approved,x-sell,-512,XAP,12.0,Cash,365243.0
4,202054,Refused,walk-in,-781,HC,24.0,Cash,


In [None]:
# Join the previous-application rows onto the training frame.
# NOTE(review): this cell has no recorded execution. app_train already holds
# 18655456 rows per the last recorded output, and another inner merge multiplies
# rows again wherever prev_app_trim has several rows per SK_ID_CURR -- confirm
# this is feasible before running.
app_train = merge_with_trimmed(app_train, prev_app_trim)

In [None]:
# Join the previous-application rows onto the test frame.
# NOTE(review): this cell has no recorded execution; the same row-multiplication
# caveat as for the training frame applies (app_test is at 3726412 rows per the
# last recorded output).
app_test = merge_with_trimmed(app_test, prev_app_trim)