In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import math
import seaborn as sns
import random

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve,precision_recall_fscore_support

import joblib

### Using Entire Dataset (memory issues possible)

In [9]:
#X_train
filename = "X_train.pkl"
X_train_all = pd.read_pickle(filename)#, skiprows=skip, sep='\t',index_col=0)

#X_test
filename = "X_test.pkl"
X_test_all = pd.read_pickle(filename)#, skiprows=skip, sep='\t',index_col=0)

#y_train
filename = "y_train.pkl"
y_train_all = pd.read_pickle(filename)#, skiprows=skip, sep='\t',index_col=0)

#y_test
filename = "y_test.pkl"
y_test_all = pd.read_pickle(filename)#, skiprows=skip, sep='\t',index_col=0)

In [10]:
print(X_train_all.shape)
print(X_test_all.shape)
print(y_train_all.shape)
print(y_test_all.shape)

(880950, 1133)
(377551, 1133)
(880950, 1)
(377551, 1)


#### Drop non-numeric columns to prepare for classification model fitting

In [11]:
#drop OHE source columns & unuseful categorical variables
X_train_all.drop(columns=['term','verification_status',
                          'grade','emp_title', 'addr_state',
                          #ALSO, drop redundant columns that new OHE columns provide the info for
                          'debt_settlement_flag',#ALSO, drop columns clearly not predictive of class
                          'issue_d','last_pymnt_d'],inplace=True) #ALSO, drop date columns

In [12]:
#REPEAT DROPS for X_test
X_test_all.drop(columns=['term','verification_status', 
                         'grade','emp_title', 'addr_state',
                         #ALSO, drop redundant columns that new OHE columns provide the info for
                         'debt_settlement_flag',#ALSO, drop columns clearly not predictive of class
                         'issue_d','last_pymnt_d'],inplace=True) #ALSO, drop date columns

In [14]:
X_train_all.set_index('index',inplace=True)
X_test_all.set_index('index',inplace=True)

In [15]:
print(X_train_all.shape)
print(X_test_all.shape)
print(y_train_all.shape)
print(y_test_all.shape)

(880950, 1124)
(377551, 1124)
(880950, 1)
(377551, 1)


In [16]:
X_train_all.head()

Unnamed: 0_level_0,loan_amnt,funded_amnt,total_pymnt,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,...,emp_title_2_Technician,emp_title_2_Truck Driver,emp_title_2_Vice President,emp_title_2_driver,emp_title_2_manager,emp_title_2_owner,emp_title_2_sales,emp_title_2_supervisor,emp_title_2_teacher,emp_title_2_truck driver
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
798015,20000,20000.0,24704.737119,0.1239,448.85,1.0,62000.0,4.82,0,12173.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
962964,15000,15000.0,10780.58,0.1299,505.34,1.0,55000.0,31.05,1,7882.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
534392,16000,16000.0,18049.571386,0.0818,502.72,8.0,72000.0,18.67,1,9162.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668134,24000,24000.0,27124.598857,0.0818,754.07,1.0,70000.0,21.02,0,10804.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1068586,24000,24000.0,26527.946309,0.0662,736.89,1.0,175000.0,14.71,0,10439.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Standard Scaling to Prep for Principal Component Analysis

In [17]:
# Standardize data
# scaler = StandardScaler()
# scaler.fit(X_train_10pcnt)
# X_train_all_scaled = scaler.transform(X_train_all)
# X_test_all_scaled = scaler.transform(X_test_all)

### Principal Component Analysis (PCA)

In [18]:
# pca = PCA(n_components=0.95, svd_solver='full')
# X_train_pca = pca.fit_transform(X_train_10pcnt_scaled)
# X_train_pca

In [19]:
# sum(pca.explained_variance_ratio_)

In [20]:
# print('num_components:', len(pca.explained_variance_ratio_))

- After the above preprocessing, we ended up with 1,124 features. I ran PCA on the dataset with the hope to further reduce feature size. Unfortunately, the 95% variance threshold corresponds to around 972 features, which is close to 95% of the total number of features. After testing a few thresholds, it seems I cannot significantly reduce the feature size without sacrificing variances. 
- I decided to keep all features.

## Classification Modeling (Fully Paid = 1, Charged-Off = 0)

#### Logistic Regression v1

In [21]:
y_train_all['loan_status'] = y_train_all['loan_status'].astype(int)
y_test_all['loan_status'] = y_test_all['loan_status'].astype(int)

In [22]:
log_reg = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1000)
log_reg.fit(X_train_all,y_train_all)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
y_test_all_preds = log_reg.predict(X_test_all)

In [24]:
print ("Precision Fully Paid: {}".format(precision_score(y_test_all,y_test_all_preds)))
print ("Recall Fully Paid: {}".format(recall_score(y_test_all,y_test_all_preds)))
print ("F-1 Score Fully Paid: {}".format(f1_score(y_test_all,y_test_all_preds)))
print ("ROC-AUC Score: {}".format(roc_auc_score(y_test_all,y_test_all_preds)))

Precision Fully Paid: 0.9996676669562242
Recall Fully Paid: 0.9994637699276911
F-1 Score Fully Paid: 0.9995657080439424
ROC-AUC Score: 0.9990455291733132


In [25]:
#precision, recall, f1-score
print ("Precision, Recall, F, & Support By Class [0,1] aka [Default,Fully Paid]: {}".format(
    precision_recall_fscore_support(y_test_all,y_test_all_preds)))

print ("Precision, Recall, F, & Support Weighted Average by Support: {}".format(
    precision_recall_fscore_support(y_test_all,y_test_all_preds,average='weighted')))

Precision, Recall, F, & Support By Class [0,1] aka [Default,Fully Paid]: (array([0.9977865 , 0.99966767]), array([0.99862729, 0.99946377]), array([0.99820672, 0.99956571]), array([ 73577, 303974]))
Precision, Recall, F, & Support Weighted Average by Support: (0.9993010656372601, 0.9993007567189598, 0.9993008683031771, None)


### Utilize JobLib to save

In [31]:
# save the model to disk
filename = 'log_reg_v1.joblib'
joblib.dump(log_reg, filename)

['log_reg_v1.joblib']

### Quick test of JobLib loading of fitted model

In [32]:
# test load the model from disk
loaded_log_reg_v1 = joblib.load(filename)
result = loaded_log_reg_v1.predict(X_test_all)

In [33]:
print ("Precision Fully Paid: {}".format(precision_score(y_test_all,result)))
print ("Recall Fully Paid: {}".format(recall_score(y_test_all,result)))
print ("F-1 Score Fully Paid: {}".format(f1_score(y_test_all,result)))
print ("ROC-AUC Score: {}".format(roc_auc_score(y_test_all,result)))

Precision Fully Paid: 0.9996676669562242
Recall Fully Paid: 0.9994637699276911
F-1 Score Fully Paid: 0.9995657080439424
ROC-AUC Score: 0.9990455291733132


## Use Model #1: Classification Model to Filter Rows for Model #2: Regression on IRR
- I will use my predicted classification from Model #1 on train set to feed to Model #2, rather than filtering using pre-labelled classes
- This is what I would need to do with new, real-world data so my training process should mimic this


## IRR Target Variable Calculation/Extrapolation

In [None]:
#calculating NAR
lc_df[['total_rec_int','total_rec_late_fee','installment','collection_recovery_fee','out_prncp','loan_status']].head(5)

In [None]:
lc_df['last_pymnt_d'].head()

In [None]:
lc_df['last_payment_date'] = lc_df['last_pymnt_d'].str[:3]+'/'+'1'+'/'+lc_df['last_pymnt_d'].str[4:]
lc_df['issue_date'] = lc_df['issue_d'].str[:3]+'/'+'1'+'/'+lc_df['issue_d'].str[4:]

In [None]:
lc_df['last_payment_date'] = lc_df['last_payment_date'].astype(str)
lc_df['issue_date'] = lc_df['issue_date'].astype(str)

In [None]:
lc_df = lc_df[lc_df['last_payment_date'] != 'nan'].reset_index()

In [None]:
lc_df['last_payment_date'][1319]

In [None]:
from datetime import datetime

lc_df['last_payment_date_dt'] = lc_df['last_payment_date'].map(lambda x: datetime.strptime(x,'%b/%d/%Y'))
lc_df['issue_date_dt'] = lc_df['issue_date'].map(lambda x: datetime.strptime(x,'%b/%d/%Y'))

In [None]:
lc_df['issue_date_dt'].tail()

In [None]:
lc_df['days_btwn_funding_lastpayment'] = (lc_df['last_payment_date_dt'].dt.date -
                                          lc_df['issue_date_dt'].dt.date).dt.days

In [None]:
#raw_lc_df['y_stanford'] = (raw_lc_df['total_pymnt']/raw_lc_df['funded_amnt'])-1

In [None]:
lc_df[['addr_state','annual_inc','collection_recovery_fee','emp_title',
           'fico_range_high','fico_range_low','funded_amnt','grade','home_ownership',
           'int_rate','loan_amnt','loan_status','purpose','sub_grade','title','total_rec_int',
           'total_rec_late_fee','total_rec_prncp','zip_code','debt_settlement_flag','out_prncp',
           'collection_recovery_fee','days_btwn_funding_lastpayment','y_stanford']][lc_df['loan_status'] == 1].tail(5)

In [None]:

# for i in range(50): 
#     if raw_lc_df.loc[i,'loan_status'] != 'Charged Off':
#         raw_lc_df.loc[i,'NAR_test'] = ((1+((((raw_lc_df['total_rec_int'][i] #(interest received
#                                     +raw_lc_df['total_rec_late_fee'][i] # + late fees received
#                                     -(0.01*raw_lc_df['installment'][i]) # - service fee paid
#                                     +((raw_lc_df['collection_recovery_fee'][i]/.4)*.6) # + collection fees received
#                                     - 0) # - 0 or out.principal
#                                  /(raw_lc_df['out_prncp'][i])) #ALL THE ABOVE divided by out.principal
#                                     *raw_lc_df['out_prncp'][i]) #FRACTION ABOVE times out.principal
#                                  / (raw_lc_df['out_prncp'][i])))**12)-1 #TERM ABOVE divided by out.principal, 
#                                                                     #& EVERYTHING to the power of 12 & ALL OF THAT minus 1
#     else: 
#         raw_lc_df.loc[i,'NAR_test'] = ((1+((((raw_lc_df['total_rec_int'][i] #(interest received
#                                     +raw_lc_df['total_rec_late_fee'][i] # + late fees received
#                                     -(0.01*raw_lc_df['installment'][i]) # - service fee paid
#                                     +((raw_lc_df['collection_recovery_fee'][i]/.4)*.6) # + collection fees received
#                                     - raw_lc_df['out_prncp'][i]) # - 0 or out.principal
#                                  /(raw_lc_df['out_prncp'][i])) #ALL THE ABOVE divided by out.principal
#                                     *raw_lc_df['out_prncp'][i]) #FRACTION ABOVE times out.principal
#                                  / (raw_lc_df['out_prncp'][i])))**12)-1 #TERM ABOVE divided by out.principal, 
#                                                                     #& EVERYTHING to the power of 12 & ALL OF THAT minus 1
#     print('row completed')

In [None]:
lc_df.head(10)

In [None]:
lc_df['loan_status'].value_counts()