In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN, SMOTE

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [2]:
# Load the saved dataframe for modeling purposes
df_post_eda = pd.read_csv('loan_post_eda.csv', low_memory=False)

In [3]:
# Split the Features into Numerical and Non Numerical for charting purposes
features_numerical = list(df_post_eda.dtypes[df_post_eda.dtypes != 'object'].index)

features_nonnumerc = list(df_post_eda.dtypes[df_post_eda.dtypes == 'object'].index)
features_nonnumerc.remove('title')
features_nonnumerc.remove('emp_title')
features_nonnumerc.remove('desc')
features_nonnumerc.remove('loan_status')

features_nlp = ['title', 'emp_title', 'desc']

## Model 2 - Late vs Charged Off/Default

In [4]:
# Model 1 Parameters
TEST_SIZE = 0.25 
RANDOM_STATE = 42
N_JOBS = -1
N_ESTIMATORS = 200

In [5]:
# Split the newly combined data frame
df_post_eda = df_post_eda[df_post_eda['loan_status'].isin(['Charged Off', 'Default', 'Late (31-120 days)'])]
df_train, df_test = train_test_split(df_post_eda, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# x train and test to be defined separately
# Set the target variable against all other statuses
y_train = np.where(df_train['loan_status'] == 'Late (31-120 days)', 0, 1)
y_test = np.where(df_test['loan_status'] == 'Late (31-120 days)', 0, 1)

df_test['loan_status'].value_counts()

Charged Off           2935
Late (31-120 days)     247
Default                 28
Name: loan_status, dtype: int64

In [6]:
df_train['loan_status'].value_counts()

Charged Off           8778
Late (31-120 days)     769
Default                 80
Name: loan_status, dtype: int64

## Model 2.1 - Numerical Features

In [7]:
x_train_m11 = df_train[features_numerical]
y_train_m11 = y_train

x_test_m11 = df_test[features_numerical]
y_test_m11 = y_test

In [8]:
rfcm11 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [9]:
rfcm11.fit(x_train_m11, y_train_m11)
rfcm11.score(x_test_m11, y_test_m11)

0.991588785046729

In [10]:
y_pred_m11 = rfcm11.predict_proba(x_test_m11)

In [11]:
confusion_matrix(y_test_m11, rfcm11.predict(x_test_m11), labels=[1,0])

array([[2936,   27],
       [   0,  247]], dtype=int64)

In [12]:
print(classification_report(y_test_m11, rfcm11.predict(x_test_m11)))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       247
           1       1.00      0.99      1.00      2963

   micro avg       0.99      0.99      0.99      3210
   macro avg       0.95      1.00      0.97      3210
weighted avg       0.99      0.99      0.99      3210



## Model 2.2 - Non-Numerical Features

In [13]:
df_m12 = df_post_eda[features_nonnumerc]
df_m12 = pd.get_dummies(df_m12)

In [14]:
# Split the newly combined data frame
df_train_m12, df_test_m12 = train_test_split(df_m12, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [15]:
x_train_m12 = df_train_m12
y_train_m12 = y_train

x_test_m12 = df_test_m12
y_test_m12 = y_test

In [None]:
# Undersample the Majority
from imblearn.under_sampling import TomekLinks
t12 = TomekLinks(ratio='majority', random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [None]:
x_train12_tl, y_train12_tl = t12.fit_sample(x_train_m12, y_train_m12)

In [None]:
# Oversample the Minority
from imblearn.over_sampling import ADASYN
ads12 = ADASYN(ratio='minority', random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [None]:
x_train12_ads, y_train12_ads = ads12.fit_sample(x_train12_tl, y_train12_tl)

In [None]:
x_train_m12 = x_train12_ads
y_train_m12 = y_train12_ads

In [16]:
rfcm12 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [17]:
rfcm12.fit(x_train_m12, y_train_m12)
rfcm12.score(x_test_m12, y_test_m12)

0.9093457943925234

In [18]:
y_pred_m12 = rfcm12.predict_proba(x_test_m12)

In [19]:
confusion_matrix(y_test, rfcm12.predict(x_test_m12), labels=[1,0])

array([[2913,   50],
       [ 241,    6]], dtype=int64)

In [20]:
print(classification_report(y_test_m12, rfcm12.predict(x_test_m12)))

              precision    recall  f1-score   support

           0       0.11      0.02      0.04       247
           1       0.92      0.98      0.95      2963

   micro avg       0.91      0.91      0.91      3210
   macro avg       0.52      0.50      0.50      3210
weighted avg       0.86      0.91      0.88      3210



## Model 2.3 - NLP, Loan Title

In [21]:
tv = TfidfVectorizer(stop_words='english', ngram_range=(2,2), lowercase=True)
x_train_m13 = tv.fit_transform(df_train['title'])
y_train_m13 = y_train

x_test_m13 = tv.transform(df_test['title'])
y_test_m13 = y_test

In [22]:
rfcm13 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)
rfcm13.fit(x_train_m13, y_train_m13)
rfcm13.score(x_test_m13, y_test_m13)

0.9190031152647975

In [23]:
y_pred_m13 = rfcm13.predict_proba(x_test_m13)

In [24]:
confusion_matrix(y_test_m13, rfcm13.predict(x_test_m13), labels=[1,0])

array([[2950,   13],
       [ 247,    0]], dtype=int64)

In [25]:
print(classification_report(y_test_m13, rfcm13.predict(x_test_m13)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.92      1.00      0.96      2963

   micro avg       0.92      0.92      0.92      3210
   macro avg       0.46      0.50      0.48      3210
weighted avg       0.85      0.92      0.88      3210



## Model 2.4 - NLP, Loan Description

In [26]:
df_train['desc'] = df_train['desc'].map(lambda x: re.sub('\n', ' ', x))
df_train['desc'] = df_train['desc'].map(lambda x: re.sub('br', ' ', x))
df_train['desc'] = df_train['desc'].map(lambda x: re.sub('[^A-Za-z]+', ' ', x))

df_test['desc'] = df_test['desc'].map(lambda x: re.sub('\n', ' ', x))
df_test['desc'] = df_test['desc'].map(lambda x: re.sub('br', ' ', x))
df_test['desc'] = df_test['desc'].map(lambda x: re.sub('[^A-Za-z]+', ' ', x))

tv = TfidfVectorizer(stop_words='english', ngram_range=(1,2), lowercase=True)
x_train_m14 = tv.fit_transform(df_train['desc'])
x_test_m14 = tv.transform(df_test['desc'])

y_train_m14 = y_train
y_test_m14 = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [27]:
rfcm14 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)
rfcm14.fit(x_train_m14, y_train_m14)
rfcm14.score(x_test_m14, y_test_m14)

0.922429906542056

In [28]:
y_pred_m14 = rfcm14.predict_proba(x_test_m14)

In [29]:
confusion_matrix(y_test_m14, rfcm14.predict(x_test_m14), labels=[1,0])

array([[2961,    2],
       [ 247,    0]], dtype=int64)

In [30]:
print(classification_report(y_test_m14, rfcm14.predict(x_test_m14)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.92      1.00      0.96      2963

   micro avg       0.92      0.92      0.92      3210
   macro avg       0.46      0.50      0.48      3210
weighted avg       0.85      0.92      0.89      3210



## Model 2.5 - NLP, Employee Title

In [31]:
tv = TfidfVectorizer(stop_words='english', ngram_range=(2,2), lowercase=True)
x_train_m15 = tv.fit_transform(df_train['emp_title'])
y_train_m15 = y_train

x_test_m15 = tv.transform(df_test['emp_title'])
y_test_m15 = y_test

In [32]:
rfcm15 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)
rfcm15.fit(x_train_m15, y_train_m15)
rfcm15.score(x_test_m15, y_test_m15)

0.9161993769470405

In [33]:
y_pred_m15 = rfcm15.predict_proba(x_test_m15)

In [34]:
confusion_matrix(y_test_m15, rfcm15.predict(x_test_m15), labels=[1,0])

array([[2936,   27],
       [ 242,    5]], dtype=int64)

In [35]:
print(classification_report(y_test_m15, rfcm15.predict(x_test_m15)))

              precision    recall  f1-score   support

           0       0.16      0.02      0.04       247
           1       0.92      0.99      0.96      2963

   micro avg       0.92      0.92      0.92      3210
   macro avg       0.54      0.51      0.50      3210
weighted avg       0.86      0.92      0.89      3210



## Model Stacking - Combine all the models by stacking their probabilities

In [36]:
df_proba_m1 = pd.DataFrame()
df_proba_m1['y_proba1'] = y_pred_m11[:,1]
df_proba_m1['y_proba2'] = y_pred_m12[:,1]
df_proba_m1['y_proba3'] = y_pred_m13[:,1]
df_proba_m1['y_proba4'] = y_pred_m14[:,1]
df_proba_m1['y_proba5'] = y_pred_m15[:,1]

In [37]:
df_proba_m1.describe()

Unnamed: 0,y_proba1,y_proba2,y_proba3,y_proba4,y_proba5
count,3210.0,3210.0,3210.0,3210.0,3210.0
mean,0.92367,0.916721,0.917348,0.923976,0.915039
std,0.243228,0.13201,0.062701,0.047957,0.086206
min,0.015,0.03175,0.276086,0.285,0.070492
25%,0.995,0.891077,0.887665,0.904173,0.920113
50%,1.0,0.974472,0.92983,0.904173,0.920113
75%,1.0,0.997074,0.92983,0.964479,0.920113
max,1.0,1.0,1.0,1.0,1.0


In [38]:
df_proba_m1['y_proba5'].shape, y_test.shape

((3210,), (3210,))

In [39]:
# Save a Copy
df_proba_m1.to_csv('proba_m2.csv', index=False)

In [40]:
# Save a memory copy
df_proba_m1_copy = df_proba_m1.copy()

### Average Proba

In [41]:
df_proba_m1_copy['y_proba_avg'] = df_proba_m1_copy.apply('mean', axis=1)
df_proba_m1_copy['y_proba_avg_pred'] = np.where(df_proba_m1_copy['y_proba_avg'] >= 0.5, 1, 0)

In [44]:
confusion_matrix(y_test, df_proba_m1_copy['y_proba_avg_pred'], labels=[1,0])

array([[2963,    0],
       [ 247,    0]], dtype=int64)

In [45]:
accuracy_score(y_test, df_proba_m1_copy['y_proba_avg_pred'])

0.9230529595015576

In [46]:
print(classification_report(y_test, df_proba_m1_copy['y_proba_avg_pred']))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.92      1.00      0.96      2963

   micro avg       0.92      0.92      0.92      3210
   macro avg       0.46      0.50      0.48      3210
weighted avg       0.85      0.92      0.89      3210



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Bayes Theorem (Conflated Probabilities) - FINAL

In [47]:
df_proba_m1_copy['y_proba_conft'] = df_proba_m1_copy['y_proba1']*df_proba_m1_copy['y_proba2']*df_proba_m1_copy['y_proba3']*df_proba_m1_copy['y_proba4']*df_proba_m1_copy['y_proba5']
df_proba_m1_copy['y_proba_confb'] = (1-df_proba_m1_copy['y_proba1'])*(1-df_proba_m1_copy['y_proba2'])*(1-df_proba_m1_copy['y_proba3'])*(1-df_proba_m1_copy['y_proba4'])*(1-df_proba_m1_copy['y_proba5'])
df_proba_m1_copy['y_proba_confa'] = df_proba_m1_copy['y_proba_conft'] / (df_proba_m1_copy['y_proba_conft'] + df_proba_m1_copy['y_proba_confb'])
df_proba_m1_copy['y_proba_confa_pred'] = np.where(df_proba_m1_copy['y_proba_confa'] >= 0.5, 1, 0)

In [52]:
confusion_matrix(y_test, df_proba_m1_copy['y_proba_confa_pred'])

array([[   0,  247],
       [   0, 2963]], dtype=int64)

In [53]:
accuracy_score(y_test, df_proba_m1_copy['y_proba_confa_pred'])

0.9230529595015576

In [54]:
print(classification_report(y_test, df_proba_m1_copy['y_proba_confa_pred']))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.92      1.00      0.96      2963

   micro avg       0.92      0.92      0.92      3210
   macro avg       0.46      0.50      0.48      3210
weighted avg       0.85      0.92      0.89      3210



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
