In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('Data/train_indessa.csv')

In [3]:
df.drop('batch_enrolled', axis=1, inplace=True)
df.drop('emp_title', axis=1, inplace=True)
df.drop('desc', axis=1, inplace=True)
df.drop('member_id', axis=1, inplace=True)

In [4]:
pd.set_option('display.max_columns', 45)

# Handling Missing Values

In [5]:
df['emp_length'].fillna('10+ years', inplace=True)

In [6]:
df['annual_inc'].fillna(6500, inplace=True)

In [7]:
df['title'].fillna('Debt consolidation', inplace=True)
df.dropna(axis='index', how='all', subset=['delinq_2yrs'], inplace=True)

In [8]:
df['revol_util'].fillna(24, inplace=True)
df['collections_12_mths_ex_med'].fillna(0.0, inplace=True)

In [9]:
def impute_mths_since_derog(cols):
    mths_since_derog = cols[0]
    delinq_2yrs = cols[1]
    
    if pd.isnull(mths_since_derog):
        if delinq_2yrs==0.0:
            return 52
        else:
            return 18
    
    else:
        return mths_since_derog

In [10]:
df['mths_since_last_major_derog'] = df[['mths_since_last_major_derog', 'delinq_2yrs']].apply(impute_mths_since_derog, axis=1)

In [11]:
df.drop('verification_status_joint', axis=1, inplace=True)

In [12]:
df['tot_coll_amt'].fillna(0.0, inplace=True)
df['tot_cur_bal'].fillna(0.0, inplace=True)
df['total_rev_hi_lim'].fillna(0.0, inplace=True)

In [13]:
df.drop('mths_since_last_record', axis=1, inplace=True)

In [14]:
df.drop('mths_since_last_delinq', axis=1, inplace=True)

# FEATURE ENGINEERING

In [15]:
def clean_term(term):
    ls = term.split(' ')
    return np.int64(ls[0])

df['term'] = df['term'].apply(clean_term)

In [16]:
def clean_emp_length(length):
    if length=='10+ years':
        x = length.split('+')
        return np.int64(x[0])
    elif length=='< 1 year':
        x = length.split(' ')
        return np.int64(x[1])
    else:
        x = length.split(' ')
        return np.int64(x[0])
    
df['emp_length'] = df['emp_length'].apply(clean_emp_length)

In [17]:
df.drop('title', axis=1, inplace=True)

In [18]:
def clean_zip(zip):
    x = zip.split('x')
    return np.int64(x[0])

df['zip_code'] = df['zip_code'].apply(clean_zip)

In [19]:
df.drop('addr_state', axis=1, inplace=True)

In [20]:
def clean_last_week_pay(week):
    x = week.split('th')
    if x[0] == 'NA':
        return 0
    else:
        return np.int64(x[0])

df['last_week_pay'] = df['last_week_pay'].apply(clean_last_week_pay)

In [21]:
df_clean = pd.get_dummies(df, drop_first=True)

In [22]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

In [23]:
X = df_clean.drop('loan_status', axis=1)
y = df_clean['loan_status']

In [24]:
X2D = pca.fit_transform(X)

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier()

In [27]:
forest_clf.fit(X_train, y_train)

RandomForestClassifier()

In [28]:
y_pred = forest_clf.predict(X_test)

In [29]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[80917,   640],
       [15889,  9037]])

In [30]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8447733441018754

# TEST DATA PREDICTION

In [31]:
df_test = pd.read_csv('Data/test_indessa.csv')

In [32]:
df_test.drop('batch_enrolled', axis=1, inplace=True)
df_test.drop('emp_title', axis=1, inplace=True)
df_test.drop('desc', axis=1, inplace=True)
df_test.drop('member_id', axis=1, inplace=True)
df_test.drop('verification_status_joint', axis=1, inplace=True)
df_test.drop('mths_since_last_record', axis=1, inplace=True)
df_test.drop('mths_since_last_delinq', axis=1, inplace=True)
df_test.drop('title', axis=1, inplace=True)
df_test.drop('addr_state', axis=1, inplace=True)

In [33]:
df_test['emp_length'].fillna('10+ years', inplace=True)
df_test['annual_inc'].fillna(6500, inplace=True)
df_test['revol_util'].fillna(24, inplace=True)
df_test['collections_12_mths_ex_med'].fillna(0.0, inplace=True)
df_test['mths_since_last_major_derog'] = df_test[['mths_since_last_major_derog', 'delinq_2yrs']].apply(impute_mths_since_derog, axis=1)

In [34]:
df_test['tot_coll_amt'].fillna(0.0, inplace=True)
df_test['tot_cur_bal'].fillna(0.0, inplace=True)
df_test['total_rev_hi_lim'].fillna(0.0, inplace=True)

In [35]:
df_test.describe()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,collections_12_mths_ex_med,mths_since_last_major_derog,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
count,354951.0,354951.0,354951.0,354951.0,354951.0,354951.0,354938.0,354938.0,354938.0,354938.0,354951.0,354951.0,354938.0,354951.0,354951.0,354951.0,354951.0,354951.0,354951.0,354938.0,354951.0,354951.0,354951.0
mean,14751.76792,14738.287116,14698.770903,13.252396,75024.01,18.184447,0.314432,0.694654,11.552781,0.195981,16920.05,55.064623,25.26903,1756.867641,0.399299,46.221362,4.913062,0.014498,46.566966,0.004956,224.4993,128217.8,29497.83
std,8437.019324,8431.045701,8443.341658,4.38525,63938.71,25.17405,0.865533,1.000579,5.326112,0.579484,22432.19,23.810774,11.836602,2098.587607,4.082242,409.763865,63.128236,0.13594,15.041033,0.075333,15474.64,152016.1,36214.87
min,500.0,500.0,0.0,5.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8000.0,8000.0,8000.0,9.99,45000.0,11.89,0.0,0.0,8.0,0.0,6441.0,37.6,17.0,441.345,0.0,0.0,0.0,0.0,50.0,0.0,0.0,23111.0,11600.0
50%,13000.0,13000.0,13000.0,12.99,65000.0,17.65,0.0,0.0,11.0,0.0,11873.0,56.0,24.0,1074.12,0.0,0.0,0.0,0.0,52.0,0.0,0.0,65101.0,21800.0
75%,20000.0,20000.0,20000.0,16.2,90000.0,23.94,0.0,1.0,14.0,0.0,20811.0,73.5,32.0,2243.075,0.0,0.0,0.0,0.0,52.0,0.0,0.0,195431.5,37700.0
max,35000.0,35000.0,35000.0,28.99,9000000.0,9999.0,39.0,33.0,76.0,54.0,2904836.0,182.8,169.0,23062.45,286.747566,29282.07,5569.92,20.0,188.0,5.0,9152545.0,4447397.0,9999999.0


In [36]:
df_test['delinq_2yrs'].fillna(0, inplace=True)
df_test['inq_last_6mths'].fillna(1, inplace=True)
df_test['open_acc'].fillna(6, inplace=True)
df_test['pub_rec'].fillna(0, inplace=True)
df_test['total_acc'].fillna(15, inplace=True)
df_test['acc_now_delinq'].fillna(0, inplace=True)

In [37]:
df_test['term'] = df_test['term'].apply(clean_term)
df_test['emp_length'] = df_test['emp_length'].apply(clean_emp_length)
df_test['zip_code'] = df_test['zip_code'].apply(clean_zip)
df_test['last_week_pay'] = df_test['last_week_pay'].apply(clean_last_week_pay)

In [38]:
df_test_clean = pd.get_dummies(df_test, drop_first=True)

In [39]:
pd.set_option('max_column', 90)

In [40]:
df_test_clean['home_ownership_mortage'] = 0

In [41]:
test_pred = forest_clf.predict(df_test_clean)

In [44]:
df_test_copy = pd.read_csv('Data/test_indessa.csv')

In [47]:
test_pred_proba = forest_clf.predict_proba(df_test_clean)

In [58]:
temp = pd.read_csv('Data/submission.csv')

In [67]:
temp.loc[temp['loan_status']==1, 'loan_status'] = 0.99
temp.loc[temp['loan_status']==0, 'loan_status'] = 0.01

In [68]:
temp.to_csv('Data/submission.csv', index=False)

In [69]:
temp.loc[temp['member_id']==1004221]

Unnamed: 0,member_id,loan_status
86662,1004221,0.01
