In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN, SMOTE

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [2]:
# Load the saved dataframe for modeling purposes
df_post_eda = pd.read_csv('loan_post_eda.csv', low_memory=False)

In [3]:
# Split the Features into Numerical and Non Numerical for charting purposes
features_numerical = list(df_post_eda.dtypes[df_post_eda.dtypes != 'object'].index)

features_nonnumerc = list(df_post_eda.dtypes[df_post_eda.dtypes == 'object'].index)
features_nonnumerc.remove('title')
features_nonnumerc.remove('emp_title')
features_nonnumerc.remove('desc')
features_nonnumerc.remove('loan_status')

features_nlp = ['title', 'emp_title', 'desc']

# Model 1 - Fully Paid vs Risky Loans (Late, Charged Off/Default)

In [4]:
# Model 1 Parameters
TEST_SIZE = 0.25 
RANDOM_STATE = 42
N_JOBS = -1
N_ESTIMATORS = 200

In [5]:
# Split the newly combined data frame
df_train, df_test = train_test_split(df_post_eda, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# x train and test to be defined separately
# Set the target variable against all other statuses
y_train = np.where(df_train['loan_status'] == 'Fully Paid', 0, 1)
y_test = np.where(df_test['loan_status'] == 'Fully Paid', 0, 1)

df_test['loan_status'].value_counts()

Fully Paid            52124
Charged Off           11106
Late (31-120 days)     2902
Default                 314
Name: loan_status, dtype: int64

In [6]:
df_train['loan_status'].value_counts()

Fully Paid            155599
Charged Off            34142
Late (31-120 days)      8689
Default                  905
Name: loan_status, dtype: int64

## Model 1.1 - Numerical Features

In [7]:
x_train_m11 = df_train[features_numerical]
y_train_m11 = y_train

x_test_m11 = df_test[features_numerical]
y_test_m11 = y_test

In [8]:
rfcm11 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [9]:
rfcm11.fit(x_train_m11, y_train_m11)
rfcm11.score(x_test_m11, y_test_m11)

0.9971254853565301

In [10]:
y_pred_m11 = rfcm11.predict_proba(x_test_m11)

In [11]:
confusion_matrix(y_test_m11, rfcm11.predict(x_test_m11), labels=[1,0])

array([[14133,   189],
       [    2, 52122]], dtype=int64)

In [12]:
print(classification_report(y_test_m11, rfcm11.predict(x_test_m11)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     52124
           1       1.00      0.99      0.99     14322

   micro avg       1.00      1.00      1.00     66446
   macro avg       1.00      0.99      1.00     66446
weighted avg       1.00      1.00      1.00     66446



## Model 1.2 - Non-Numerical Features

In [13]:
df_m12 = df_post_eda[features_nonnumerc]
df_m12 = pd.get_dummies(df_m12)

In [14]:
# Split the newly combined data frame
df_train_m12, df_test_m12 = train_test_split(df_m12, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [15]:
x_train_m12 = df_train_m12
y_train_m12 = y_train

x_test_m12 = df_test_m12
y_test_m12 = y_test

In [16]:
# Undersample the Majority
from imblearn.under_sampling import TomekLinks
t12 = TomekLinks(ratio='majority', random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [17]:
x_train12_tl, y_train12_tl = t12.fit_sample(x_train_m12, y_train_m12)

In [18]:
# Oversample the Minority
from imblearn.over_sampling import ADASYN
ads12 = ADASYN(ratio='minority', random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [19]:
x_train12_ads, y_train12_ads = ads12.fit_sample(x_train12_tl, y_train12_tl)

In [20]:
x_train_m12 = x_train12_ads
y_train_m12 = y_train12_ads

In [21]:
rfcm12 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [22]:
rfcm12.fit(x_train_m12, y_train_m12)
rfcm12.score(x_test_m12, y_test_m12)

0.6466303464467387

In [23]:
y_pred_m12 = rfcm12.predict_proba(x_test_m12)

In [24]:
confusion_matrix(y_test, rfcm12.predict(x_test_m12), labels=[1,0])

array([[ 7141,  7181],
       [16299, 35825]], dtype=int64)

In [25]:
print(classification_report(y_test_m12, rfcm12.predict(x_test_m12)))

              precision    recall  f1-score   support

           0       0.83      0.69      0.75     52124
           1       0.30      0.50      0.38     14322

   micro avg       0.65      0.65      0.65     66446
   macro avg       0.57      0.59      0.57     66446
weighted avg       0.72      0.65      0.67     66446



## Model 1.3 - NLP, Loan Title

In [26]:
#tv = TfidfVectorizer(stop_words='english', ngram_range=(2,2), lowercase=True)
#x_train_m13 = tv.fit_transform(df_train['title'])
#y_train_m13 = y_train
#
#x_test_m13 = tv.transform(df_test['title'])
#y_test_m13 = y_test

In [62]:
tv13 = TfidfVectorizer(stop_words='english', ngram_range=(2,2), lowercase=True)
x_train_m13 = tv13.fit_transform(df_train['title'])
y_train_m13 = y_train

x_test_m13 = tv13.transform(df_test['title'])
y_test_m13 = y_test

In [27]:
rfcm13 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)
rfcm13.fit(x_train_m13, y_train_m13)
rfcm13.score(x_test_m13, y_test_m13)

0.7801824037564338

In [28]:
y_pred_m13 = rfcm13.predict_proba(x_test_m13)

In [29]:
confusion_matrix(y_test_m13, rfcm13.predict(x_test_m13), labels=[1,0])

array([[   85, 14237],
       [  369, 51755]], dtype=int64)

In [30]:
print(classification_report(y_test_m13, rfcm13.predict(x_test_m13)))

              precision    recall  f1-score   support

           0       0.78      0.99      0.88     52124
           1       0.19      0.01      0.01     14322

   micro avg       0.78      0.78      0.78     66446
   macro avg       0.49      0.50      0.44     66446
weighted avg       0.66      0.78      0.69     66446



## Model 1.4 - NLP, Loan Description

In [64]:
#df_train['desc'] = df_train['desc'].map(lambda x: re.sub('\n', ' ', x))
#df_train['desc'] = df_train['desc'].map(lambda x: re.sub('br', ' ', x))
#df_train['desc'] = df_train['desc'].map(lambda x: re.sub('[^A-Za-z]+', ' ', x))
#
#df_test['desc'] = df_test['desc'].map(lambda x: re.sub('\n', ' ', x))
#df_test['desc'] = df_test['desc'].map(lambda x: re.sub('br', ' ', x))
#df_test['desc'] = df_test['desc'].map(lambda x: re.sub('[^A-Za-z]+', ' ', x))
#
#tv = TfidfVectorizer(stop_words='english', ngram_range=(1,2), lowercase=True)
#x_train_m14 = tv.fit_transform(df_train['desc'])
#x_test_m14 = tv.transform(df_test['desc'])
#
#y_train_m14 = y_train
#y_test_m14 = y_test

In [65]:
df_train['desc'] = df_train['desc'].map(lambda x: re.sub('\n', ' ', x))
df_train['desc'] = df_train['desc'].map(lambda x: re.sub('br', ' ', x))
df_train['desc'] = df_train['desc'].map(lambda x: re.sub('[^A-Za-z]+', ' ', x))

df_test['desc'] = df_test['desc'].map(lambda x: re.sub('\n', ' ', x))
df_test['desc'] = df_test['desc'].map(lambda x: re.sub('br', ' ', x))
df_test['desc'] = df_test['desc'].map(lambda x: re.sub('[^A-Za-z]+', ' ', x))

tv14 = TfidfVectorizer(stop_words='english', ngram_range=(1,2), lowercase=True)
x_train_m14 = tv14.fit_transform(df_train['desc'])
x_test_m14 = tv14.transform(df_test['desc'])

y_train_m14 = y_train
y_test_m14 = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [32]:
rfcm14 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)
rfcm14.fit(x_train_m14, y_train_m14)
rfcm14.score(x_test_m14, y_test_m14)

0.7832224663636638

In [33]:
y_pred_m14 = rfcm14.predict_proba(x_test_m14)

In [34]:
confusion_matrix(y_test_m14, rfcm14.predict(x_test_m14), labels=[1,0])

array([[   38, 14284],
       [  120, 52004]], dtype=int64)

In [35]:
print(classification_report(y_test_m14, rfcm14.predict(x_test_m14)))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88     52124
           1       0.24      0.00      0.01     14322

   micro avg       0.78      0.78      0.78     66446
   macro avg       0.51      0.50      0.44     66446
weighted avg       0.67      0.78      0.69     66446



## Model 1.5 - NLP, Employee Title

In [66]:
#tv = TfidfVectorizer(stop_words='english', ngram_range=(2,2), lowercase=True)
#x_train_m15 = tv.fit_transform(df_train['emp_title'])
#y_train_m15 = y_train
#
#x_test_m15 = tv.transform(df_test['emp_title'])
#y_test_m15 = y_test

In [67]:
tv15 = TfidfVectorizer(stop_words='english', ngram_range=(2,2), lowercase=True)
x_train_m15 = tv15.fit_transform(df_train['emp_title'])
y_train_m15 = y_train

x_test_m15 = tv15.transform(df_test['emp_title'])
y_test_m15 = y_test

In [37]:
rfcm15 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)
rfcm15.fit(x_train_m15, y_train_m15)
rfcm15.score(x_test_m15, y_test_m15)

0.7668783673960811

In [38]:
y_pred_m15 = rfcm15.predict_proba(x_test_m15)

In [39]:
confusion_matrix(y_test_m15, rfcm15.predict(x_test_m15), labels=[1,0])

array([[  700, 13622],
       [ 1868, 50256]], dtype=int64)

In [40]:
print(classification_report(y_test_m15, rfcm15.predict(x_test_m15)))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87     52124
           1       0.27      0.05      0.08     14322

   micro avg       0.77      0.77      0.77     66446
   macro avg       0.53      0.51      0.47     66446
weighted avg       0.68      0.77      0.70     66446



## Model Stacking - Combine all the models by stacking their probabilities

In [41]:
df_proba_m1 = pd.DataFrame()
df_proba_m1['y_test'] = y_test
df_proba_m1['y_proba1'] = y_pred_m11[:,1]
df_proba_m1['y_proba2'] = y_pred_m12[:,1]
df_proba_m1['y_proba3'] = y_pred_m13[:,1]
df_proba_m1['y_proba4'] = y_pred_m14[:,1]
df_proba_m1['y_proba5'] = y_pred_m15[:,1]

In [42]:
df_proba_m1.describe()

Unnamed: 0,y_test,y_proba1,y_proba2,y_proba3,y_proba4,y_proba5
count,66446.0,66446.0,66446.0,66446.0,66446.0,66446.0
mean,0.215543,0.214914,0.399836,0.222053,0.206645,0.22472
std,0.411202,0.400249,0.215417,0.069537,0.065898,0.13419
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.256583,0.222488,0.179167,0.157825
50%,0.0,0.0,0.423122,0.229247,0.237606,0.249044
75%,0.0,0.035,0.554034,0.249882,0.237606,0.249044
max,1.0,1.0,1.0,0.950799,0.947083,1.0


In [43]:
df_proba_m1['y_proba5'].shape, y_test.shape

((66446,), (66446,))

In [44]:
# Save a Copy
df_proba_m1.to_csv('proba_m1.csv', index=False)

In [45]:
# Save a memory copy
df_proba_m1_copy = df_proba_m1.copy()

### Save the Models and TFIDF

In [46]:
pickle.dump(rfcm11, open('rfcm11.sav', 'wb'))
pickle.dump(rfcm12, open('rfcm12.sav', 'wb'))
pickle.dump(rfcm13, open('rfcm13.sav', 'wb'))
pickle.dump(rfcm14, open('rfcm14.sav', 'wb'))
pickle.dump(rfcm15, open('rfcm15.sav', 'wb'))

In [68]:
pickle.dump(tv13, open('tv13.sav', 'wb'))
pickle.dump(tv14, open('tv14.sav', 'wb'))
pickle.dump(tv15, open('tv15.sav', 'wb'))

### Save the Test Data

In [75]:
df_test_m11 = df_test

In [76]:
df_test_m11.to_csv('df_test_m11.csv', index=True)
df_test_m12.to_csv('df_test_m12.csv', index=True)

### Load the Models and TFIDF

In [None]:
rfcm11 = pickle.load(open('rfcm11.sav', 'rb'))
rfcm12 = pickle.load(open('rfcm12.sav', 'rb'))
rfcm13 = pickle.load(open('rfcm13.sav', 'rb'))
rfcm14 = pickle.load(open('rfcm14.sav', 'rb'))
rfcm15 = pickle.load(open('rfcm15.sav', 'rb'))

tv13 = pickle.load(open('tv13.sav', 'rb'))
tv14 = pickle.load(open('tv14.sav', 'rb'))
tv15 = pickle.load(open('tv15.sav', 'rb'))

### Average Proba

In [47]:
df_proba_m1_copy['y_proba_avg'] = df_proba_m1_copy.apply('mean', axis=1)
df_proba_m1_copy['y_proba_avg_pred'] = np.where(df_proba_m1_copy['y_proba_avg'] >= 0.5, 1, 0)

In [48]:
confusion_matrix(y_test, df_proba_m1_copy['y_proba_avg_pred'], labels=[1,0])

array([[10071,  4251],
       [    0, 52124]], dtype=int64)

In [49]:
accuracy_score(y_test, df_proba_m1_copy['y_proba_avg_pred'])

0.936023236914186

In [50]:
print(classification_report(df_proba_m1_copy['y_test'], df_proba_m1_copy['y_proba_avg_pred']))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     52124
           1       1.00      0.70      0.83     14322

   micro avg       0.94      0.94      0.94     66446
   macro avg       0.96      0.85      0.89     66446
weighted avg       0.94      0.94      0.93     66446



### Bayes Theorem (Conflated Probabilities) - FINAL

In [51]:
df_proba_m1_copy['y_proba_conft'] = df_proba_m1_copy['y_proba1']*df_proba_m1_copy['y_proba2']*df_proba_m1_copy['y_proba3']*df_proba_m1_copy['y_proba4']*df_proba_m1_copy['y_proba5']
df_proba_m1_copy['y_proba_confb'] = (1-df_proba_m1_copy['y_proba1'])*(1-df_proba_m1_copy['y_proba2'])*(1-df_proba_m1_copy['y_proba3'])*(1-df_proba_m1_copy['y_proba4'])*(1-df_proba_m1_copy['y_proba5'])
df_proba_m1_copy['y_proba_confa'] = df_proba_m1_copy['y_proba_conft'] / (df_proba_m1_copy['y_proba_conft'] + df_proba_m1_copy['y_proba_confb'])

df_proba_m1_copy['y_proba_confa_pred'] = np.where(df_proba_m1_copy['y_proba_confa'] >= 0.5, 1, 0)

In [52]:
print(confusion_matrix(y_test, df_proba_m1_copy['y_proba_confa_pred']))

[[52116     8]
 [ 3714 10608]]


In [55]:
accuracy_score(y_test, df_proba_m1_copy['y_proba_confa_pred'])

0.9439845889895554

In [56]:
print(classification_report(df_proba_m1_copy['y_test'], df_proba_m1_copy['y_proba_confa_pred']))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97     52124
           1       1.00      0.74      0.85     14322

   micro avg       0.94      0.94      0.94     66446
   macro avg       0.97      0.87      0.91     66446
weighted avg       0.95      0.94      0.94     66446



In [None]:
df_test_copy = df_test.copy()
df_test_copy['loan_status'] = y_test
df_test_copy['loan_status_pred'] = df_proba_m1_copy['y_proba_confa_pred']

In [None]:
c1 = df_test_copy['loan_status'] == 1
c2 = df_test_copy['loan_status'] != df_test_copy['loan_status_pred']
df_f1_mispred = df_test_copy[c1 & c2]
df_f1_mispred.shape

In [None]:
df_test_copy['loan_status'].value_counts()

In [None]:
df_test_copy['loan_status_pred'].value_counts(dropna=False)

In [None]:
df_test_copy.shape

In [None]:
# Check how much money is gained or lost by comparing the amount loaned vs amount received
plt.figure(figsize=(15,7))
ax1 = plt.gca()
x_loan_amnt = np.array([df_test_copy['total_pymnt'].T,
                     df_test_copy['loan_amnt'].T])
ax1.hist(x_loan_amnt, 15, density=False, log=False, histtype='bar')
ax1.set_title('Total Payment Received vs Loan Amount')
ax1.legend(['Total Payment Received', 'Loan Amount'])

ax1.set_xlabel(ax1.get_xlabel(), fontsize=15)
ax1.set_ylabel(ax1.get_ylabel(), fontsize=15)
ax1.set_title(ax1.get_title(), fontsize=20)