In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Read the training CSV into the working frame used by every cell below.
df = pd.read_csv(filepath_or_buffer='Data/train_indessa.csv')

In [3]:
# Remove four columns from the frame in a single in-place call.
df.drop(
    columns=['batch_enrolled', 'emp_title', 'desc', 'member_id'],
    inplace=True,
)

In [4]:
# Allow up to 45 columns to be rendered when a frame is displayed.
pd.options.display.max_columns = 45

# Handling Missing Values

In [5]:
# Fill missing emp_length values with the '10+ years' label.
# Assigning the result back (instead of calling fillna(inplace=True) on the
# selected column) guarantees the frame itself is updated: the chained
# in-place form acts on a temporary under pandas Copy-on-Write and then
# leaves `df` unchanged.
# NOTE(review): presumably '10+ years' is the most frequent category — confirm.
df['emp_length'] = df['emp_length'].fillna('10+ years')

In [6]:
# Fill missing annual_inc values with a fixed constant.
# The result is assigned back because fillna(inplace=True) on a selected
# column operates on a temporary under pandas Copy-on-Write and would not
# modify `df`.
# NOTE(review): 6500 is a hard-coded fill value; verify it is the intended
# statistic for this column (it is not derived from the data here).
df['annual_inc'] = df['annual_inc'].fillna(6500)

In [7]:
# Fill missing title values with 'Debt consolidation'. Assigned back rather
# than using fillna(inplace=True) on the selected column, which would act on
# a temporary under pandas Copy-on-Write and leave `df` untouched.
df['title'] = df['title'].fillna('Debt consolidation')

# Discard every row whose delinq_2yrs value is missing.
df.dropna(axis='index', how='all', subset=['delinq_2yrs'], inplace=True)

In [8]:
# Fill missing values with fixed constants and assign the results back.
# fillna(inplace=True) on a selected column works on a temporary under
# pandas Copy-on-Write, so the original form could leave `df` unchanged.
# NOTE(review): 24 is a hard-coded fill value for revol_util — confirm it is
# the intended statistic.
df['revol_util'] = df['revol_util'].fillna(24)
df['collections_12_mths_ex_med'] = df['collections_12_mths_ex_med'].fillna(0.0)

In [9]:
def impute_mths_since_derog(cols):
    """Return an imputed value for a missing months-since-derog entry.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two positional values: the months-since-derog value first and the
        delinq_2yrs value second. When used with ``DataFrame.apply(axis=1)``
        this is one row of the two selected columns.

    Returns
    -------
    The first value unchanged when it is not null; otherwise 52 when the
    second value equals 0.0 and 18 in every other case.

    Notes
    -----
    NOTE(review): 52 and 18 are hard-coded fill values; their origin is not
    shown in this notebook — confirm.
    """
    # A row Series produced by apply(axis=1) is labelled by column name, so
    # integer keys such as cols[0] rely on deprecated positional fallback.
    # Use .iloc when it exists; plain lists/tuples keep ordinary indexing.
    positional = cols.iloc if hasattr(cols, 'iloc') else cols
    mths_since_derog = positional[0]
    delinq_2yrs = positional[1]

    if not pd.isnull(mths_since_derog):
        return mths_since_derog

    return 52 if delinq_2yrs == 0.0 else 18

In [10]:
# Row-wise imputation: each row of the two selected columns is passed to
# impute_mths_since_derog and the result replaces the original column.
derog_inputs = df[['mths_since_last_major_derog', 'delinq_2yrs']]
df['mths_since_last_major_derog'] = derog_inputs.apply(impute_mths_since_derog, axis=1)

In [11]:
# Remove the verification_status_joint column in place.
df.drop(columns=['verification_status_joint'], inplace=True)

In [12]:
# Replace missing values in the three columns below with 0.0.
# The filled columns are assigned back to the frame: calling
# fillna(inplace=True) on a selected column acts on a temporary under pandas
# Copy-on-Write and would leave `df` unchanged.
zero_fill_cols = ['tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']
df[zero_fill_cols] = df[zero_fill_cols].fillna(0.0)

In [13]:
# Remove the mths_since_last_record column in place.
df.drop(columns=['mths_since_last_record'], inplace=True)

In [14]:
# Remove the mths_since_last_delinq column in place.
df.drop(columns=['mths_since_last_delinq'], inplace=True)

# FEATURE ENGINEERING

In [15]:
def clean_term(term):
    """Extract the leading integer from a term string such as '36 months'.

    Parameters
    ----------
    term : str
        Text whose first whitespace-separated token is an integer.

    Returns
    -------
    numpy.int64
        The integer value of the first token.
    """
    # split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so ' 36 months' yields '36' first.
    # split(' ') would yield '' for that input and the conversion would fail.
    first_token = term.split()[0]
    return np.int64(first_token)

# Convert every term string to its integer value, element by element.
df['term'] = df['term'].map(clean_term)

In [16]:
def clean_emp_length(length):
    """Convert an employment-length label to an integer.

    '10+ years' maps to 10 and '< 1 year' maps to 1; any other label is
    converted from its first space-separated token (e.g. '3 years' -> 3).

    Returns
    -------
    numpy.int64
    """
    # The two labels that do not start with a plain number are looked up
    # directly; everything else uses the text before the first space.
    special_labels = {'10+ years': '10', '< 1 year': '1'}
    digits = special_labels.get(length)
    if digits is None:
        digits = length.split(' ')[0]
    return np.int64(digits)
    
# Convert every employment-length label to its integer value.
df['emp_length'] = df['emp_length'].map(clean_emp_length)

In [17]:
# Remove the title column in place.
df.drop(columns=['title'], inplace=True)

In [18]:
def clean_zip(zip):
    """Return the part of a zip string before the first 'x' as an integer.

    For example '940xx' gives 940. A string without 'x' is converted whole.

    Returns
    -------
    numpy.int64
    """
    # partition keeps everything ahead of the first 'x' (or the full string
    # when no 'x' is present).
    prefix, _, _ = zip.partition('x')
    return np.int64(prefix)

# Convert every zip string to the integer formed by its leading digits.
df['zip_code'] = df['zip_code'].map(clean_zip)

In [19]:
# Remove the addr_state column in place.
df.drop(columns=['addr_state'], inplace=True)

In [20]:
def clean_last_week_pay(week):
    """Return the number that precedes 'th' in a week label.

    '26th week' gives 26. When the text before 'th' is 'NA' the result is 0.

    Returns
    -------
    int or numpy.int64
        Plain 0 for the 'NA' case, numpy.int64 otherwise.
    """
    # Everything ahead of the first 'th' is the week number (or 'NA').
    number_text = week.partition('th')[0]
    if number_text == 'NA':
        return 0
    return np.int64(number_text)

# Convert every week label to its numeric value ('NA' labels become 0).
df['last_week_pay'] = df['last_week_pay'].map(clean_last_week_pay)

In [21]:
# One-hot encode the frame; drop_first=True omits the first level of each
# encoded column.
df_clean = pd.get_dummies(data=df, drop_first=True)

In [22]:
from sklearn.decomposition import PCA

# Number of principal components kept by the projection.
N_PCA_COMPONENTS = 2
pca = PCA(n_components=N_PCA_COMPONENTS)

In [23]:
# Separate the target column (y) from the remaining feature columns (X).
TARGET_COL = 'loan_status'
y = df_clean[TARGET_COL]
X = df_clean.drop(columns=[TARGET_COL])

In [24]:
# Fit the PCA on the feature matrix and project it onto the fitted components.
X2D = pca.fit_transform(X=X)

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every metric computed from
# it) is the same on each Restart & Run All; stratify=y keeps the class
# proportions of the target equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with otherwise default hyper-parameters.
# random_state makes the bootstrap samples and feature sub-sampling
# repeatable between runs; n_jobs=-1 trains the trees on all available cores
# without changing the fitted model.
forest_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [27]:
# Train the classifier on the training split.
forest_clf.fit(X=X_train, y=y_train)

RandomForestClassifier()

In [28]:
# Predict labels for the held-out rows.
y_pred = forest_clf.predict(X=X_test)

In [29]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[80917,   640],
       [15889,  9037]])

In [30]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8447733441018754

# TEST DATA PREDICTION