In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN, SMOTE

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

### Load the Saved Test Data

In [2]:
# Read the two saved test-set CSVs; in both, the first column becomes the row index.
df_test, df_test_pd = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('df_test_m11.csv', 'df_test_m12.csv')
)

### Predefined Columns

In [3]:
# Partition df_test's columns by dtype: 'object' columns vs. everything else.
column_dtypes = df_test.dtypes
is_object_col = column_dtypes == 'object'

features_numerical = column_dtypes[~is_object_col].index.tolist()

# Object columns, minus the free-text fields and the target column.
features_nonnumerc = column_dtypes[is_object_col].index.tolist()
for excluded_col in ('title', 'emp_title', 'desc', 'loan_status'):
    features_nonnumerc.remove(excluded_col)

# Free-text columns, each handled separately further down.
features_nlp = ['title', 'emp_title', 'desc']

In [4]:
# Target: the raw loan_status column of the test frame.
y_test = df_test.loc[:, "loan_status"]

# Inputs, one per model:
#   m11 - every non-object column of df_test
#   m12 - the whole frame read from df_test_m12.csv
#   m13 / m14 / m15 - the 'title', 'desc' and 'emp_title' text columns
x_test_m11 = df_test.loc[:, features_numerical]
x_test_m12 = df_test_pd
x_test_m13 = df_test.loc[:, 'title']
x_test_m14 = df_test.loc[:, 'desc']
x_test_m15 = df_test.loc[:, 'emp_title']

### Sample Test Data (3 hand-picked records)

In [5]:
# Index labels of the test rows that are scored in the rest of the notebook.
sampled_records = [
    28384,
    242322,
    94760,
]
# Disabled alternative: append 3 randomly drawn rows per loan status instead.
# for status in ('Fully Paid', 'Late (31-120 days)', 'Charged Off', 'Default'):
#     sampled_records.extend(df_test[df_test['loan_status'] == status].sample(3).index.values)

In [6]:
# Show the recorded loan status of each sampled row (rows appear in df_test order).
df_test.loc[df_test.index.isin(sampled_records), 'loan_status']

28384     Charged Off
242322     Fully Paid
94760     Charged Off
Name: loan_status, dtype: object

In [7]:
# Keep only the sampled rows in every model input.
# Selecting by label with .loc (rather than an isin() mask) returns the rows in
# the order of `sampled_records` for every input, so rows stay aligned even if
# the two source CSVs are ordered differently, and it raises KeyError when a
# label is missing instead of silently dropping the row.
x_test_m11 = x_test_m11.loc[sampled_records]
x_test_m12 = x_test_m12.loc[sampled_records]
x_test_m13 = x_test_m13.loc[sampled_records]
x_test_m14 = x_test_m14.loc[sampled_records]
x_test_m15 = x_test_m15.loc[sampled_records]

# Binary target: 0 for 'Fully Paid', 1 for every other status.
# Built from df_test instead of from y_test itself so that re-running this cell
# still works after y_test has been replaced by a NumPy array.
y_test = np.where(df_test.loc[sampled_records, 'loan_status'] == 'Fully Paid', 0, 1)

In [8]:
# Display the non-object feature columns for the sampled rows (input of model m11).
x_test_m11

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
28384,3600.0,3600.0,3178.61,14.59,124.08,12996.0,8.13,1.0,0.0,23.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
242322,14400.0,14400.0,14350.0,19.99,381.44,50000.0,27.17,1.0,0.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3500.0,0.0,0.0,0.0
94760,7000.0,7000.0,7000.0,16.29,247.11,36000.0,23.03,1.0,2.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9700.0,0.0,0.0,0.0


In [9]:
# Display the sampled rows of the frame loaded from df_test_m12.csv (input of model m12).
x_test_m12

Unnamed: 0,term_ 36 months,term_ 60 months,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,sub_grade_A1,...,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_f,initial_list_status_w
28384,1,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
242322,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
94760,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
# Display the 'title' text of the sampled rows (input of model m13).
x_test_m13

28384            Matt Rohwer
242322    Debt consolidation
94760                my loan
Name: title, dtype: object

In [11]:
# Display the 'desc' text of the sampled rows (input of model m14).
x_test_m14

28384     Not Specified
242322    Not Specified
94760     Not Specified
Name: desc, dtype: object

In [12]:
# Display the 'emp_title' text of the sampled rows (input of model m15).
x_test_m15

28384              JCPenney Corp
242322                        Rn
94760     Reimbursement Concepts
Name: emp_title, dtype: object

### Load the Models and TFIDF

In [13]:
def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards.

    NOTE(security): ``pickle.load`` can execute arbitrary code while loading.
    Only use this on artifacts that come from a trusted source.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Saved models, one per input (m11 .. m15).
rfcm11 = _load_pickle('rfcm11.sav')
rfcm12 = _load_pickle('rfcm12.sav')
rfcm13 = _load_pickle('rfcm13.sav')
rfcm14 = _load_pickle('rfcm14.sav')
rfcm15 = _load_pickle('rfcm15.sav')

# Saved text transformers for the three free-text inputs (m13 .. m15).
tv13 = _load_pickle('tv13.sav')
tv14 = _load_pickle('tv14.sav')
tv15 = _load_pickle('tv15.sav')

In [14]:
# Replace each raw text input with the output of its loaded transformer.
# NOTE: the inputs are overwritten, so this cell is meant to run once per kernel session.
x_test_m13, x_test_m14, x_test_m15 = (
    transformer.transform(raw_text)
    for transformer, raw_text in (
        (tv13, x_test_m13),
        (tv14, x_test_m14),
        (tv15, x_test_m15),
    )
)

### Run the Test Data against the Model

In [15]:
# Score every input with its matching model; each result is that model's predict_proba output.
y_pred_m11, y_pred_m12, y_pred_m13, y_pred_m14, y_pred_m15 = (
    model.predict_proba(model_input)
    for model, model_input in (
        (rfcm11, x_test_m11),
        (rfcm12, x_test_m12),
        (rfcm13, x_test_m13),
        (rfcm14, x_test_m14),
        (rfcm15, x_test_m15),
    )
)

### Stack the Models Together

In [16]:
# One row per sampled record: the encoded target plus column 1 of every
# model's predict_proba output.
# NOTE(review): column 1 is presumably the probability of class 1 (not 'Fully Paid');
# confirm against each model's classes_.
df_proba_m1 = pd.DataFrame({
    'y_test': y_test,
    'y_proba1': y_pred_m11[:, 1],
    'y_proba2': y_pred_m12[:, 1],
    'y_proba3': y_pred_m13[:, 1],
    'y_proba4': y_pred_m14[:, 1],
    'y_proba5': y_pred_m15[:, 1],
})

### Calculate Final Probability

In [17]:
# Combine the five per-model probabilities into one score:
#   conft = product of p_i        (every model votes for the positive class)
#   confb = product of (1 - p_i)  (every model votes for the negative class)
#   confa = conft / (conft + confb)
# The probabilities are clipped away from exactly 0 and 1 first. Otherwise one
# model returning 0.0 and another returning 1.0 makes both products 0 and the
# final ratio becomes 0/0 = NaN.
proba_eps = 1e-12
proba_cols = ['y_proba1', 'y_proba2', 'y_proba3', 'y_proba4', 'y_proba5']
proba_clipped = df_proba_m1[proba_cols].clip(lower=proba_eps, upper=1 - proba_eps)

# skipna=False keeps a missing probability visible as NaN instead of dropping it from the product.
df_proba_m1['y_proba_conft'] = proba_clipped.prod(axis=1, skipna=False)
df_proba_m1['y_proba_confb'] = (1 - proba_clipped).prod(axis=1, skipna=False)
df_proba_m1['y_proba_confa'] = df_proba_m1['y_proba_conft'] / (
    df_proba_m1['y_proba_conft'] + df_proba_m1['y_proba_confb']
)

In [18]:
# Display the combined probability computed for each sampled record.
df_proba_m1['y_proba_confa']

0    0.692256
1    0.001119
2    0.829062
Name: y_proba_confa, dtype: float64

In [19]:
# Display the full table: target, per-model probabilities and the combined scores.
df_proba_m1

Unnamed: 0,y_test,y_proba1,y_proba2,y_proba3,y_proba4,y_proba5,y_proba_conft,y_proba_confb,y_proba_confa
0,1,0.995,0.274509,0.22423,0.237606,0.249044,0.003624,0.001611,0.692256
1,0,0.01,0.763022,0.249882,0.237606,0.249044,0.000113,0.100755,0.001119
2,1,0.995,0.703928,0.22423,0.237606,0.102169,0.003813,0.000786,0.829062
