In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer # The data should not have any missing values but this is added to be safe
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split


In [2]:
# Load the main data set from the project's data directory
df_main = pd.read_csv(filepath_or_buffer="data/main_data.csv")

In [3]:
# Peek at the first three rows as a quick sanity check of the load
df_main.head(n=3)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,perc_princ_paid,credit_length_years,secondary_app_credit_length_years,has_desc
0,5000.0,3.0,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.172631,26,0,1
1,2500.0,5.0,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.405812,12,0,1
2,2400.0,3.0,0.1596,84.33,C,C5,Unknown,10,RENT,12252.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.252361,10,0,0


In [4]:
# Confirm that 'grade' is stored with the object dtype (code 'O')
df_main.dtypes['grade'] == 'O'

True

In [5]:
# Summarise the frame: column names, dtypes and non-null counts
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1517416 entries, 0 to 1517415
Data columns (total 88 columns):
loan_amnt                            1517416 non-null float64
term                                 1517416 non-null float64
int_rate                             1517416 non-null float64
installment                          1517416 non-null float64
grade                                1517416 non-null object
sub_grade                            1517416 non-null object
emp_title                            1517416 non-null object
emp_length                           1517416 non-null int64
home_ownership                       1517416 non-null object
annual_inc                           1517416 non-null float64
verification_status                  1517416 non-null object
issue_d                              1517416 non-null object
loan_status                          1517416 non-null int64
pymnt_plan                           1517416 non-null object
purpose                       

In [6]:
# Report how many distinct values each object-typed (categorical) column holds
for col_name, col_data_type in df_main.dtypes.items():
    if col_data_type != 'O':
        continue  # only object columns are of interest here
    uniq_entries = len(df_main[col_name].unique())
    print(col_name, "has", uniq_entries, "unique entries")

grade has 7 unique entries
sub_grade has 35 unique entries
emp_title has 406938 unique entries
home_ownership has 6 unique entries
verification_status has 3 unique entries
issue_d has 145 unique entries
pymnt_plan has 2 unique entries
purpose has 14 unique entries
title has 63120 unique entries
zip_code has 949 unique entries
addr_state has 51 unique entries


In [7]:
# Categorical columns that are removed rather than one-hot encoded
cols_to_be_dropped = ['grade', 'sub_grade', 'emp_title', 'pymnt_plan', 'title', 'zip_code']

# Feature frame for the random forest: df_main without the columns above
# (drop returns a new frame, so df_main itself is left untouched)
df_main_rf = df_main.drop(columns=cols_to_be_dropped)

In [8]:
# Convert issue_d to just the issue year.
# The dates are parsed from the untouched df_main column rather than from
# df_main_rf itself, so this cell is safe to re-run: re-parsing the already
# converted integer years would make pd.to_datetime treat them as epoch
# offsets and silently turn every value into 1970.
# df_main_rf was derived from df_main by dropping columns only, so the two
# frames share the same row index and the assignment aligns row for row.
df_main_rf['issue_d'] = pd.to_datetime(df_main['issue_d']).dt.year

In [9]:
# Names of the object-typed columns left in df_main_rf; these are the ones
# handed to the categorical (one-hot) branch of the preprocessor
categorical_data = [
    name for name, dtype in df_main_rf.dtypes.items() if dtype == 'O'
]

# Echo the selection so it can be checked by eye
for name in categorical_data:
    print(name)

home_ownership
verification_status
purpose
addr_state


In [10]:
# Collect the names of all non-object (numeric) columns, echoing each one
numerical_data = []

for name, dtype in df_main_rf.dtypes.items():
    if dtype != 'O':  # anything that is NOT of object class
        numerical_data.append(name)
        print(name)

# Take out columns that must not be used as model inputs:
#  - loan_status is the prediction target
#  - perc_princ_paid: presumably only known once the loan has played out, so
#    it would leak the outcome -- TODO confirm
for excluded in ('loan_status', 'perc_princ_paid'):
    numerical_data.remove(excluded)

loan_amnt
term
int_rate
installment
emp_length
annual_inc
issue_d
loan_status
dti
delinq_2yrs
inq_last_6mths
mths_since_last_delinq
mths_since_last_record
open_acc
pub_rec
revol_bal
revol_util
total_acc
mths_since_last_major_derog
acc_now_delinq
tot_coll_amt
tot_cur_bal
open_acc_6m
open_act_il
open_il_12m
open_il_24m
mths_since_rcnt_il
total_bal_il
il_util
open_rv_12m
open_rv_24m
max_bal_bc
all_util
total_rev_hi_lim
inq_fi
total_cu_tl
inq_last_12m
acc_open_past_24mths
avg_cur_bal
bc_open_to_buy
bc_util
chargeoff_within_12_mths
delinq_amnt
mo_sin_old_il_acct
mo_sin_old_rev_tl_op
mo_sin_rcnt_rev_tl_op
mo_sin_rcnt_tl
mort_acc
mths_since_recent_bc
mths_since_recent_bc_dlq
mths_since_recent_inq
mths_since_recent_revol_delinq
num_accts_ever_120_pd
num_actv_bc_tl
num_actv_rev_tl
num_bc_sats
num_bc_tl
num_il_tl
num_op_rev_tl
num_rev_accts
num_rev_tl_bal_gt_0
num_sats
num_tl_120dpd_2m
num_tl_30dpd
num_tl_90g_dpd_24m
num_tl_op_past_12m
pct_tl_nvr_dlq
percent_bc_gt_75
pub_rec_bankruptcies
tax_lie

In [11]:
# Numerical preprocessing: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical preprocessing: fill gaps with the literal label 'missing', then
# one-hot encode (handle_unknown='ignore' keeps unseen categories from raising)
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [12]:
# Column-wise preprocessing: each column group goes through its own pipeline.
# Columns listed in neither group are dropped; that is ColumnTransformer's
# default behaviour, spelled out here so the omission is visible.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_data),
        ('cat', categorical_transformer, categorical_data),
    ],
    remainder='drop',
)

In [13]:
# Full modelling pipeline: preprocessing followed by a random forest classifier.
# The forest is seeded (same value as the train/test split) so that repeated
# runs of the grid search produce the same scores and the same fitted model;
# an unseeded forest draws fresh randomness on every fit.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [14]:
# Separate the target (loan_status) from the candidate feature columns
y = df_main_rf['loan_status']
X = df_main_rf.drop(columns=['loan_status'])

# Hold out 33% of the rows as a test set; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [15]:
# Hyperparameter grid to search within. Each key is prefixed with
# 'classifier__' so it is routed to the pipeline's 'classifier' step.
param_grid = [
    dict(
        classifier__bootstrap=[False, True],
        classifier__n_estimators=[90, 100, 110],
        classifier__max_features=[0.7, 0.73, 0.75],
        # Currently not searched:
        # classifier__min_samples_leaf=[10, 12, 14],
        # classifier__min_samples_split=[3, 5, 7],
    ),
]

In [16]:
# Exhaustive grid search over the pipeline's hyperparameters, scored by ROC AUC
# with 5-fold cross-validation. refit=True retrains the best candidate once the
# search is done; the work is spread over 8 parallel jobs.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    refit=True,
    n_jobs=8,
    verbose=2,
)

In [None]:
# Run the grid search on the training split only; the test split stays held out
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
Don't run past this point yet.

In [None]:
# Get the best model: the pipeline refit with the best-scoring hyperparameters
# (best_estimator_ is available because the grid search was built with refit=True)
final_model = grid_search.best_estimator_