In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import math
import seaborn as sns
import random

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

### Randomly Sample 10% of Train & Test sets due to large file size

In [49]:
filename = "X_train.tsv"
n = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
s = 114663 #desired sample size
skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list
X_train_10pcnt = pd.read_csv(filename, skiprows=skip, sep='\t',index_col=0)

#X_test
filename = "X_test.tsv"
s = 64334
X_test_10pcnt = pd.read_csv(filename, skiprows=skip, sep='\t',index_col=0)

#y_train
filename = "y_train.tsv"
s = 88036
y_train_10pcnt = pd.read_csv(filename, skiprows=skip, sep='\t',index_col=0)

#y_test
filename = "y_test.tsv"
s = 37882
y_test_10pcnt = pd.read_csv(filename, skiprows=skip, sep='\t',index_col=0)

In [50]:
print(X_train_10pcnt.shape)
print(X_test_10pcnt.shape)
print(y_train_10pcnt.shape)
print(y_test_10pcnt.shape)

(114663, 1133)
(63979, 1133)
(88003, 1)
(37693, 1)


In [51]:
X_train_10pcnt.info(verbose=1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114663 entries, 12 to 1260151
Data columns (total 1133 columns):
loan_amnt                               float64
funded_amnt                             float64
term                                    object
int_rate                                float64
installment                             float64
emp_length                              float64
home_ownership                          object
annual_inc                              float64
verification_status                     object
purpose                                 object
zip_code                                object
addr_state                              object
dti                                     float64
delinq_2yrs                             float64
earliest_cr_line                        float64
fico_range_low                          float64
fico_range_high                         float64
inq_last_6mths                          float64
mths_since_last_delinq     

In [52]:
#drop OHE source columns & unuseful categorical variables
X_train_10pcnt.drop(columns=['term','verification_status','home_ownership','purpose','zip_code',
                             'addr_state','sub_grade','emp_title_2','application_type', 
                             'grade','emp_title','title', 
                             #ALSO, drop redundant columns that new OHE columns provide the info for
                             'debt_settlement_flag',#ALSO, drop columns clearly not predictive of class
                             'issue_d','last_pymnt_d'],inplace=True) #ALSO, drop date columns

In [53]:
#REPEAT DROPS for X_test
X_test_10pcnt.drop(columns=['term','verification_status','home_ownership','purpose','zip_code',
                            'addr_state','sub_grade','emp_title_2','application_type', 
                            'grade','emp_title','title', 
                            #ALSO, drop redundant columns that new OHE columns provide the info for
                            'debt_settlement_flag',#ALSO, drop columns clearly not predictive of class
                            'issue_d','last_pymnt_d'],inplace=True) #ALSO, drop date columns

In [60]:
X_train_10pcnt.head()

Unnamed: 0,loan_amnt,funded_amnt,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,...,emp_title_2_Technician,emp_title_2_Truck Driver,emp_title_2_Vice President,emp_title_2_driver,emp_title_2_manager,emp_title_2_owner,emp_title_2_sales,emp_title_2_supervisor,emp_title_2_teacher,emp_title_2_truck driver
12,16000.0,16000.0,0.1262,360.95,7.0,55000.0,13.02,0.0,13939.0,710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,25000.0,25000.0,0.1042,536.36,1.0,154000.0,4.67,1.0,9862.0,680.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,1000.0,1000.0,0.1199,33.21,9.0,63000.0,10.04,1.0,14092.0,660.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49,4000.0,4000.0,0.1359,135.92,5.0,35000.0,4.36,0.0,14092.0,710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
X_test_10pcnt.head()

Unnamed: 0,loan_amnt,funded_amnt,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,...,emp_title_2_Technician,emp_title_2_Truck Driver,emp_title_2_Vice President,emp_title_2_driver,emp_title_2_manager,emp_title_2_owner,emp_title_2_sales,emp_title_2_supervisor,emp_title_2_teacher,emp_title_2_truck driver
12,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52,14000.0,14000.0,0.2291,541.28,1.0,60000.0,21.82,1.0,10166.0,665.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
X_train_10pcnt[X_train_10pcnt.isn()]

Unnamed: 0,loan_amnt,funded_amnt,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,...,emp_title_2_Technician,emp_title_2_Truck Driver,emp_title_2_Vice President,emp_title_2_driver,emp_title_2_manager,emp_title_2_owner,emp_title_2_sales,emp_title_2_supervisor,emp_title_2_teacher,emp_title_2_truck driver
2,,,,,,,,,,,...,,,,,,,,,,
19,,,,,,,,,,,...,,,,,,,,,,
27,,,,,,,,,,,...,,,,,,,,,,
46,,,,,,,,,,,...,,,,,,,,,,
70,,,,,,,,,,,...,,,,,,,,,,
75,,,,,,,,,,,...,,,,,,,,,,
77,,,,,,,,,,,...,,,,,,,,,,
84,,,,,,,,,,,...,,,,,,,,,,
86,,,,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,


In [40]:
X_train_10pcnt

Unnamed: 0,loan_amnt,funded_amnt,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,...,emp_title_2_Technician,emp_title_2_Truck Driver,emp_title_2_Vice President,emp_title_2_driver,emp_title_2_manager,emp_title_2_owner,emp_title_2_sales,emp_title_2_supervisor,emp_title_2_teacher,emp_title_2_truck driver
2,15000.0,15000.0,0.1042,486.98,3.0,55000.00,17.48,0.0,14092.0,670.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
27,2700.0,2700.0,0.0993,87.04,3.0,55000.00,37.12,0.0,13848.0,750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70,6800.0,6800.0,0.0944,217.64,9.0,134000.00,12.67,0.0,12143.0,695.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,1500.0,1500.0,0.2000,55.75,1.0,100000.00,30.17,3.0,10804.0,670.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,8000.0,8000.0,0.2388,313.36,1.0,60000.00,22.58,0.0,11474.0,715.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,24000.0,24000.0,0.2630,722.84,1.0,99460.00,23.65,0.0,12662.0,660.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
X_train_10pcnt.info(verbose=1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114663 entries, 4 to 1260154
Data columns (total 1118 columns):
loan_amnt                               float64
funded_amnt                             float64
int_rate                                float64
installment                             float64
emp_length                              float64
annual_inc                              float64
dti                                     float64
delinq_2yrs                             float64
earliest_cr_line                        float64
fico_range_low                          float64
fico_range_high                         float64
inq_last_6mths                          float64
mths_since_last_delinq                  float64
mths_since_last_record                  float64
open_acc                                float64
pub_rec                                 float64
revol_bal                               float64
revol_util                              float64
total_acc             

### Standard Scaling to Prep for Principal Component Analysis

In [26]:
# Standardize data
scaler = StandardScaler()
scaler.fit(X_train_10pcnt)
X_train_10pcnt_scaled = scaler.transform(X_train_10pcnt)
X_test_10pcnt_scaled = scaler.transform(X_test_10pcnt)

In [31]:
X_train_10pcnt[X_train_10pcnt.isna()]

Unnamed: 0,loan_amnt,funded_amnt,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,...,emp_title_2_Technician,emp_title_2_Truck Driver,emp_title_2_Vice President,emp_title_2_driver,emp_title_2_manager,emp_title_2_owner,emp_title_2_sales,emp_title_2_supervisor,emp_title_2_teacher,emp_title_2_truck driver
4,,,,,,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
32,,,,,,,,,,,...,,,,,,,,,,
38,,,,,,,,,,,...,,,,,,,,,,
56,,,,,,,,,,,...,,,,,,,,,,
57,,,,,,,,,,,...,,,,,,,,,,
59,,,,,,,,,,,...,,,,,,,,,,
77,,,,,,,,,,,...,,,,,,,,,,
78,,,,,,,,,,,...,,,,,,,,,,
89,,,,,,,,,,,...,,,,,,,,,,


### Principal Component Analysis (PCA)

In [27]:
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_10pcnt_scaled)
X_train_pca

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
pca.explained_variance_ratio_

In [None]:
pca.components_

## Classification Modeling (Fully Paid = 1, Charged-Off = 0)

#### Logistic Regression v1

## Use Model #1: Classification Model to Filter Rows for Model #2: Regression on IRR
- I will use my predicted classification from Model #1 on train set to feed to Model #2, rather than filtering using pre-labelled classes
- This is what I would need to do with new, real-world data so my training process should mimic this


## IRR Target Variable Calculation/Extrapolation

In [None]:
#calculating NAR
lc_df[['total_rec_int','total_rec_late_fee','installment','collection_recovery_fee','out_prncp','loan_status']].head(5)

In [None]:
lc_df['last_pymnt_d'].head()

In [None]:
lc_df['last_payment_date'] = lc_df['last_pymnt_d'].str[:3]+'/'+'1'+'/'+lc_df['last_pymnt_d'].str[4:]
lc_df['issue_date'] = lc_df['issue_d'].str[:3]+'/'+'1'+'/'+lc_df['issue_d'].str[4:]

In [None]:
lc_df['last_payment_date'] = lc_df['last_payment_date'].astype(str)
lc_df['issue_date'] = lc_df['issue_date'].astype(str)

In [None]:
lc_df = lc_df[lc_df['last_payment_date'] != 'nan'].reset_index()

In [None]:
lc_df['last_payment_date'][1319]

In [None]:
from datetime import datetime

lc_df['last_payment_date_dt'] = lc_df['last_payment_date'].map(lambda x: datetime.strptime(x,'%b/%d/%Y'))
lc_df['issue_date_dt'] = lc_df['issue_date'].map(lambda x: datetime.strptime(x,'%b/%d/%Y'))

In [None]:
lc_df['issue_date_dt'].tail()

In [None]:
lc_df['days_btwn_funding_lastpayment'] = (lc_df['last_payment_date_dt'].dt.date -
                                          lc_df['issue_date_dt'].dt.date).dt.days

In [None]:
#raw_lc_df['y_stanford'] = (raw_lc_df['total_pymnt']/raw_lc_df['funded_amnt'])-1

In [None]:
lc_df[['addr_state','annual_inc','collection_recovery_fee','emp_title',
           'fico_range_high','fico_range_low','funded_amnt','grade','home_ownership',
           'int_rate','loan_amnt','loan_status','purpose','sub_grade','title','total_rec_int',
           'total_rec_late_fee','total_rec_prncp','zip_code','debt_settlement_flag','out_prncp',
           'collection_recovery_fee','days_btwn_funding_lastpayment','y_stanford']][lc_df['loan_status'] == 1].tail(5)

In [None]:

# for i in range(50): 
#     if raw_lc_df.loc[i,'loan_status'] != 'Charged Off':
#         raw_lc_df.loc[i,'NAR_test'] = ((1+((((raw_lc_df['total_rec_int'][i] #(interest received
#                                     +raw_lc_df['total_rec_late_fee'][i] # + late fees received
#                                     -(0.01*raw_lc_df['installment'][i]) # - service fee paid
#                                     +((raw_lc_df['collection_recovery_fee'][i]/.4)*.6) # + collection fees received
#                                     - 0) # - 0 or out.principal
#                                  /(raw_lc_df['out_prncp'][i])) #ALL THE ABOVE divided by out.principal
#                                     *raw_lc_df['out_prncp'][i]) #FRACTION ABOVE times out.principal
#                                  / (raw_lc_df['out_prncp'][i])))**12)-1 #TERM ABOVE divided by out.principal, 
#                                                                     #& EVERYTHING to the power of 12 & ALL OF THAT minus 1
#     else: 
#         raw_lc_df.loc[i,'NAR_test'] = ((1+((((raw_lc_df['total_rec_int'][i] #(interest received
#                                     +raw_lc_df['total_rec_late_fee'][i] # + late fees received
#                                     -(0.01*raw_lc_df['installment'][i]) # - service fee paid
#                                     +((raw_lc_df['collection_recovery_fee'][i]/.4)*.6) # + collection fees received
#                                     - raw_lc_df['out_prncp'][i]) # - 0 or out.principal
#                                  /(raw_lc_df['out_prncp'][i])) #ALL THE ABOVE divided by out.principal
#                                     *raw_lc_df['out_prncp'][i]) #FRACTION ABOVE times out.principal
#                                  / (raw_lc_df['out_prncp'][i])))**12)-1 #TERM ABOVE divided by out.principal, 
#                                                                     #& EVERYTHING to the power of 12 & ALL OF THAT minus 1
#     print('row completed')

In [None]:
lc_df.head(10)

In [None]:
lc_df['loan_status'].value_counts()