In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN, SMOTE

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [2]:
# Load the saved dataframe for modeling purposes
df_post_eda = pd.read_csv('loan_post_eda.csv', low_memory=False)

In [3]:
# Split the Features into Numerical and Non Numerical for charting purposes
features_numerical = list(df_post_eda.dtypes[df_post_eda.dtypes != 'object'].index)

features_nonnumerc = list(df_post_eda.dtypes[df_post_eda.dtypes == 'object'].index)
features_nonnumerc.remove('title')
features_nonnumerc.remove('emp_title')
features_nonnumerc.remove('desc')
features_nonnumerc.remove('loan_status')

features_nlp = ['title', 'emp_title', 'desc']

## Model 3 - Charged Off vs Default

In [4]:
# Model 1 Parameters
TEST_SIZE = 0.25 
RANDOM_STATE = 42
N_JOBS = -1
N_ESTIMATORS = 200

In [5]:
# Split the newly combined data frame
df_post_eda = df_post_eda[df_post_eda['loan_status'].isin(['Charged Off', 'Default'])]
df_train, df_test = train_test_split(df_post_eda, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# x train and test to be defined separately
# Set the target variable against all other statuses
y_train = np.where(df_train['loan_status'] == 'Default', 1, 0)
y_test = np.where(df_test['loan_status'] == 'Default', 1, 0)

df_test['loan_status'].value_counts()

Charged Off    2930
Default          26
Name: loan_status, dtype: int64

In [6]:
df_train['loan_status'].value_counts()

Charged Off    8783
Default          82
Name: loan_status, dtype: int64

In [7]:
# Model 1.1 - Numerical Features
x_train_m11 = df_train[features_numerical]
y_train_m11 = y_train

x_test_m11 = df_test[features_numerical]
y_test_m11 = y_test

In [8]:
# Undersample - TOMEK
tl = TomekLinks(ratio='majority', random_state=RANDOM_STATE, n_jobs=N_JOBS)
x_train_tl, y_train_tl = tl.fit_sample(x_train_m11, y_train_m11)

In [9]:
# Oversample - ADASYN
ads = ADASYN(ratio='minority', random_state=RANDOM_STATE, n_jobs=N_JOBS)
x_train_ads, y_train_ads = ads.fit_sample(x_train_tl, y_train_tl)

In [10]:
rfcm11 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [11]:
# rfcm11.fit(x_train_m11, y_train_m11)
# rfcm11.score(x_test_m11, y_test_m11)

rfcm11.fit(x_train_ads, y_train_ads)
rfcm11.score(x_test_m11, y_test_m11)

1.0

In [12]:
y_pred_m11 = rfcm11.predict_proba(x_test_m11)

In [13]:
confusion_matrix(y_test_m11, rfcm11.predict(x_test_m11), labels=[1,0])

array([[  26,    0],
       [   0, 2930]], dtype=int64)

In [14]:
print(classification_report(y_test_m11, rfcm11.predict(x_test_m11)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2930
           1       1.00      1.00      1.00        26

   micro avg       1.00      1.00      1.00      2956
   macro avg       1.00      1.00      1.00      2956
weighted avg       1.00      1.00      1.00      2956



In [15]:
# Model 1.2 - Non-Numerical Features
df_m12 = df_post_eda[features_nonnumerc]
df_m12 = pd.get_dummies(df_m12)

In [16]:
# Split the newly combined data frame
df_train_m12, df_test_m12 = train_test_split(df_m12, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [17]:
x_train_m12 = df_train_m12
y_train_m12 = y_train

x_test_m12 = df_test_m12
y_test_m12 = y_test

In [18]:
# Undersample - TOMEK
tl2 = TomekLinks(ratio='majority', random_state=RANDOM_STATE, n_jobs=N_JOBS)
x_train_tl2, y_train_tl2 = tl.fit_sample(x_train_m12, y_train_m12)

In [19]:
# Oversample - ADASYN
ads2 = ADASYN(ratio='minority', random_state=RANDOM_STATE, n_jobs=N_JOBS)
x_train_ads2, y_train_ads2 = ads.fit_sample(x_train_tl2, y_train_tl2)

In [20]:
rfcm12 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)

In [21]:
# rfcm12.fit(x_train_m12, y_train_m12)
# rfcm12.score(x_test_m12, y_test_m12)

rfcm12.fit(x_train_ads2, y_train_ads2)
rfcm12.score(x_test_m12, y_test_m12)

0.9878213802435724

In [22]:
y_pred_m12 = rfcm12.predict_proba(x_test_m12)

In [23]:
confusion_matrix(y_test, rfcm12.predict(x_test_m12), labels=[1,0])

array([[   0,   26],
       [  10, 2920]], dtype=int64)

In [24]:
print(classification_report(y_test_m12, rfcm12.predict(x_test_m12)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2930
           1       0.00      0.00      0.00        26

   micro avg       0.99      0.99      0.99      2956
   macro avg       0.50      0.50      0.50      2956
weighted avg       0.98      0.99      0.99      2956



In [25]:
# Model 1.3
tv = TfidfVectorizer(stop_words='english', ngram_range=(2,2), lowercase=True)
x_train_m13 = tv.fit_transform(df_train['title'])
y_train_m13 = y_train

x_test_m13 = tv.transform(df_test['title'])
y_test_m13 = y_test

In [26]:
rfcm13 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)
rfcm13.fit(x_train_m13, y_train_m13)
rfcm13.score(x_test_m13, y_test_m13)

0.9912043301759134

In [27]:
y_pred_m13 = rfcm13.predict_proba(x_test_m13)

In [28]:
confusion_matrix(y_test_m13, rfcm13.predict(x_test_m13), labels=[1,0])

array([[   0,   26],
       [   0, 2930]], dtype=int64)

In [29]:
print(classification_report(y_test_m13, rfcm13.predict(x_test_m13)))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2930
           1       0.00      0.00      0.00        26

   micro avg       0.99      0.99      0.99      2956
   macro avg       0.50      0.50      0.50      2956
weighted avg       0.98      0.99      0.99      2956



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [30]:
# Model 1.4
df_train['desc'] = df_train['desc'].map(lambda x: re.sub('\n', ' ', x))
df_train['desc'] = df_train['desc'].map(lambda x: re.sub('br', ' ', x))
df_train['desc'] = df_train['desc'].map(lambda x: re.sub('[^A-Za-z]+', ' ', x))

df_test['desc'] = df_test['desc'].map(lambda x: re.sub('\n', ' ', x))
df_test['desc'] = df_test['desc'].map(lambda x: re.sub('br', ' ', x))
df_test['desc'] = df_test['desc'].map(lambda x: re.sub('[^A-Za-z]+', ' ', x))

tv = TfidfVectorizer(stop_words='english', ngram_range=(1,2), lowercase=True)
x_train_m14 = tv.fit_transform(df_train['desc'])
x_test_m14 = tv.transform(df_test['desc'])

y_train_m14 = y_train
y_test_m14 = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [31]:
rfcm14 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)
rfcm14.fit(x_train_m14, y_train_m14)
rfcm14.score(x_test_m14, y_test_m14)

0.9912043301759134

In [32]:
y_pred_m14 = rfcm14.predict_proba(x_test_m14)

In [33]:
confusion_matrix(y_test_m14, rfcm14.predict(x_test_m14), labels=[1,0])

array([[   0,   26],
       [   0, 2930]], dtype=int64)

In [34]:
print(classification_report(y_test_m14, rfcm14.predict(x_test_m14)))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2930
           1       0.00      0.00      0.00        26

   micro avg       0.99      0.99      0.99      2956
   macro avg       0.50      0.50      0.50      2956
weighted avg       0.98      0.99      0.99      2956



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [35]:
# Model 1.5
tv = TfidfVectorizer(stop_words='english', ngram_range=(2,2), lowercase=True)
x_train_m15 = tv.fit_transform(df_train['emp_title'])
y_train_m15 = y_train

x_test_m15 = tv.transform(df_test['emp_title'])
y_test_m15 = y_test

In [36]:
rfcm15 = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=N_JOBS)
rfcm15.fit(x_train_m15, y_train_m15)
rfcm15.score(x_test_m15, y_test_m15)

0.9912043301759134

In [37]:
y_pred_m15 = rfcm15.predict_proba(x_test_m15)

In [38]:
confusion_matrix(y_test_m15, rfcm15.predict(x_test_m15), labels=[1,0])

array([[   0,   26],
       [   0, 2930]], dtype=int64)

In [39]:
print(classification_report(y_test_m15, rfcm15.predict(x_test_m15)))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2930
           1       0.00      0.00      0.00        26

   micro avg       0.99      0.99      0.99      2956
   macro avg       0.50      0.50      0.50      2956
weighted avg       0.98      0.99      0.99      2956



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [40]:
df_proba_m1 = pd.DataFrame()
df_proba_m1['y_proba1'] = y_pred_m11[:,1]
df_proba_m1['y_proba2'] = y_pred_m12[:,1]
df_proba_m1['y_proba3'] = y_pred_m13[:,1]
df_proba_m1['y_proba4'] = y_pred_m14[:,1]
df_proba_m1['y_proba5'] = y_pred_m15[:,1]

In [41]:
df_proba_m1.describe()

Unnamed: 0,y_proba1,y_proba2,y_proba3,y_proba4,y_proba5
count,2956.0,2956.0,2956.0,2956.0,2956.0
mean,0.009197,0.011976,0.009506,0.008609,0.010264
std,0.08718,0.057641,0.014262,0.011939,0.01573
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.000628,0.0,0.007786
50%,0.0,0.0,0.009937,0.011389,0.011671
75%,0.0,0.000625,0.011019,0.011389,0.011671
max,1.0,0.98,0.267355,0.331323,0.310034


In [42]:
df_proba_m1['y_proba5'].shape, y_test.shape

((2956,), (2956,))

In [43]:
# Save a Copy
df_proba_m1.to_csv('proba_m3.csv', index=False)

In [44]:
# Save a memory copy
df_proba_m1_copy = df_proba_m1.copy()

In [45]:
# Average Proba

In [46]:
df_proba_m1_copy['y_proba_avg'] = df_proba_m1_copy.apply('mean', axis=1)

In [47]:
df_proba_m1_copy['y_proba_avg_pred'] = np.where(df_proba_m1_copy['y_proba_avg'] >= 0.5, 1, 0)

In [48]:
confusion_matrix(y_test, df_proba_m1_copy['y_proba_avg_pred'], labels=[1,0])

array([[   0,   26],
       [   0, 2930]], dtype=int64)

In [49]:
accuracy_score(y_test, df_proba_m1_copy['y_proba_avg_pred'])

0.9912043301759134

In [50]:
print(classification_report(y_test, df_proba_m1_copy['y_proba_avg_pred']))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2930
           1       0.00      0.00      0.00        26

   micro avg       0.99      0.99      0.99      2956
   macro avg       0.50      0.50      0.50      2956
weighted avg       0.98      0.99      0.99      2956



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [51]:
# Bayes Proba (Conflated)

In [52]:
df_proba_m1_copy['y_proba_conft'] = df_proba_m1_copy['y_proba1']*df_proba_m1_copy['y_proba2']*df_proba_m1_copy['y_proba3']*df_proba_m1_copy['y_proba4']*df_proba_m1_copy['y_proba5']

In [53]:
df_proba_m1_copy['y_proba_confb'] = (1-df_proba_m1_copy['y_proba1'])*(1-df_proba_m1_copy['y_proba2'])*(1-df_proba_m1_copy['y_proba3'])*(1-df_proba_m1_copy['y_proba4'])*(1-df_proba_m1_copy['y_proba5'])

In [54]:
df_proba_m1_copy['y_proba_confa'] = df_proba_m1_copy['y_proba_conft'] / (df_proba_m1_copy['y_proba_conft'] + df_proba_m1_copy['y_proba_confb'])

In [55]:
df_proba_m1_copy['y_proba_confa_pred'] = np.where(df_proba_m1_copy['y_proba_confa'] >= 0.5, 1, 0)

In [56]:
confusion_matrix(y_test, df_proba_m1_copy['y_proba_confa_pred'], labels=[1,0])

array([[   0,   26],
       [   0, 2930]], dtype=int64)

In [57]:
accuracy_score(y_test, df_proba_m1_copy['y_proba_confa_pred'])

0.9912043301759134

In [58]:
print(classification_report(y_test, df_proba_m1_copy['y_proba_confa_pred']))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2930
           1       0.00      0.00      0.00        26

   micro avg       0.99      0.99      0.99      2956
   macro avg       0.50      0.50      0.50      2956
weighted avg       0.98      0.99      0.99      2956



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
