In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
lm=LinearRegression()
%matplotlib inline
from sklearn.cluster import KMeans

# Load the loan extract. `rawdata` is the untouched reference frame that the
# encoding cells below read from; `workdata` is the frame that gets cleaned.
Location="loansmall.csv"
rawdata = pd.read_csv(Location)
# Take a real copy: plain assignment would make workdata another name for
# rawdata, so any in-place column edit would silently corrupt the raw data.
workdata = rawdata.copy()

In [5]:
# Reset the working frame when starting over.
# Use .copy() rather than plain assignment: assignment only aliases rawdata, and
# running an encoding cell (e.g. workdata['term'] = ...) right after this reset
# would then overwrite the raw column it reads from.
workdata = rawdata.copy()

Reset `workdata` from the raw data so the cleaning steps below can be re-run from the start.

In [None]:
# Quick sanity check: show the first five rows of the working frame.
workdata.head(n=5)

In [6]:
# Columns with completely missing data
workdata = workdata.dropna(axis=1, how='all')

# All remaining column removals, grouped by the reason for dropping them.
columns_to_drop = [
    # Columns with no variation
    'application_type',
    'policy_code',
    # Columns that are replaced by encoded versions built from rawdata in later cells
    'sub_grade',
    'home_ownership',
    'verification_status',
    'loan_status',
    # Columns with no apparent quantifiable purpose for the type of model intended
    'url',
    'next_pymnt_d',         # missing data
    # Date fields judged not useful for this model
    'earliest_cr_line',     # missing data
    'last_pymnt_d',         # missing data
    'last_credit_pull_d',   # missing data
    # Free-text fields: too qualitative for this type of model
    'desc',
    'title',                # missing data
    # Identifier judged inappropriate to use in the analysis
    'member_id',
    # Too many null values to be comfortable with
    'mths_since_last_delinq',
]
workdata = workdata.drop(columns_to_drop, axis=1)

# Remove rows with nulls in fields that cannot easily be filled in.
workdata = workdata.dropna(axis=0, subset=['tot_coll_amt', 'emp_length', 'revol_util'])

# Drop sparse columns. `thresh` is the minimum number of NON-null values a
# column needs in order to be kept, so this keeps columns that are at least
# 30% populated, i.e. it removes columns that are more than 70% null.
min_non_null = int(workdata.index.size * .3)
workdata = workdata.dropna(axis=1, thresh=min_non_null)

In [7]:
# Fill in missing data where an appropriate placeholder exists:
# a missing employer title becomes the literal 'unknown'.
emp_title_filled = workdata['emp_title'].fillna('unknown')
workdata['emp_title'] = emp_title_filled


In [14]:
# Print the dtype and non-null count of every column in the working frame.
workdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144876 entries, 42535 to 194110
Data columns (total 51 columns):
id                            144876 non-null int64
loan_amnt                     144876 non-null int64
funded_amnt                   144876 non-null int64
funded_amnt_inv               144876 non-null float64
term                          144876 non-null int64
int_rate                      144876 non-null float64
installment                   144876 non-null float64
grade                         144876 non-null int64
emp_title                     144876 non-null object
emp_length                    144876 non-null float64
annual_inc                    144876 non-null float64
issue_d                       144876 non-null object
pymnt_plan                    144876 non-null int64
purpose                       144876 non-null object
zip_code                      144876 non-null object
addr_state                    144876 non-null object
dti                           144876 n

In [9]:
# Term length becomes a numerical value; only two possibilities, so a binary flag is used.
def term_to_num(x):
    """Encode a loan term label as 0 (' 36 months') or 1 (' 60 months').

    The labels include their leading space. Any other value yields None.
    """
    if not isinstance(x, str):
        return None
    term_codes = {' 36 months': 0, ' 60 months': 1}
    return term_codes.get(x)
# Encode from the raw column, then store the result on the working frame.
term_flags = rawdata['term'].apply(term_to_num)
workdata['term'] = term_flags

def pmnt_plan_to_num(x):
    """Encode the payment-plan flag: 'n' -> 0, 'y' -> 1, anything else -> None."""
    if not isinstance(x, str):
        return None
    plan_codes = {'n': 0, 'y': 1}
    return plan_codes.get(x)
# Encode from the raw column, then store the result on the working frame.
pymnt_plan_flags = rawdata['pymnt_plan'].apply(pmnt_plan_to_num)
workdata['pymnt_plan'] = pymnt_plan_flags

def initial_list_status_to_num(x):
    """Encode the initial listing status: 'f' -> 0, 'w' -> 1, anything else -> None."""
    if not isinstance(x, str):
        return None
    status_codes = {'f': 0, 'w': 1}
    return status_codes.get(x)
# Encode from the raw column, then store the result on the working frame.
list_status_flags = rawdata['initial_list_status'].apply(initial_list_status_to_num)
workdata['initial_list_status'] = list_status_flags

In [10]:
# Sub grades are used (rather than plain grades) to get a finer spectrum of values.
def grade_to_num(x):
    """Map a sub-grade label ('A1' .. 'G5') onto an integer from 0 to 34.

    'G5' maps to 0 and 'A1' to 34: each letter step from G towards A adds 5,
    and each step from 5 towards 1 within a letter adds 1. Any value that is
    not one of the 35 labels yields None.
    """
    if not isinstance(x, str) or len(x) != 2:
        return None
    letter_rank = 'GFEDCBA'.find(x[0])   # G -> 0 ... A -> 6, -1 if unknown
    step_rank = '54321'.find(x[1])       # 5 -> 0 ... 1 -> 4, -1 if unknown
    if letter_rank < 0 or step_rank < 0:
        return None
    return letter_rank * 5 + step_rank

# The numeric grade is derived from the raw sub_grade column.
sub_grade_codes = rawdata['sub_grade'].apply(grade_to_num)
workdata['grade'] = sub_grade_codes

# Employment length becomes a numerical value.
def emp_length_to_num(x):
    """Convert an employment-length label to a number of years (0 to 10).

    '< 1 year' maps to 0, 'N year(s)' to N, and '10+ years' to 10.
    Any other value yields None.
    """
    # The position of each label in this tuple is its numeric code.
    ordered_labels = (
        '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
        '6 years', '7 years', '8 years', '9 years', '10+ years',
    )
    if isinstance(x, str) and x in ordered_labels:
        return ordered_labels.index(x)
    return None

# Encode from the raw column, then store the result on the working frame.
emp_length_years = rawdata['emp_length'].apply(emp_length_to_num)
workdata['emp_length'] = emp_length_years

In [11]:
# Home ownership has no natural ordering, so it is encoded as dummy (0/1) columns.
def rent_to_num(x):
    """Return 1 when the home-ownership value is 'RENT', otherwise 0."""
    return 1 if x == 'RENT' else 0

def mortgage_to_num(x):
    """Return 1 when the home-ownership value is 'MORTGAGE', otherwise 0."""
    return 1 if x == 'MORTGAGE' else 0
    
def none_to_num(x):
    """Return 1 when the home-ownership value is the string 'NONE', otherwise 0."""
    return 1 if x == 'NONE' else 0
    
def other_to_num(x):
    """Return 1 when the home-ownership value is 'OTHER', otherwise 0."""
    return 1 if x == 'OTHER' else 0
    
def own_to_num(x):
    """Return 1 when the home-ownership value is 'OWN', otherwise 0."""
    return 1 if x == 'OWN' else 0
# Build one 0/1 dummy column per home-ownership category from the raw column.
home_ownership_encoders = (
    ('home_rent', rent_to_num),
    ('home_mortgage', mortgage_to_num),
    ('home_none', none_to_num),
    ('home_other', other_to_num),
    ('home_own', own_to_num),
)
for dummy_col, encoder in home_ownership_encoders:
    workdata[dummy_col] = rawdata['home_ownership'].apply(encoder)

In [12]:
# Verification status has no natural ordering, so it is encoded as dummy (0/1) columns.
def verified_to_num(x):
    """Return 1 when the verification status is 'Verified', otherwise 0."""
    return 1 if x == 'Verified' else 0

def not_verified_to_num(x):
    """Return 1 when the verification status is 'Not Verified', otherwise 0."""
    return 1 if x == 'Not Verified' else 0
    
def source_verified_to_num(x):
    """Return 1 when the verification status is 'Source Verified', otherwise 0."""
    return 1 if x == 'Source Verified' else 0
# Build one 0/1 dummy column per verification status from the raw column.
verification_encoders = (
    ('verified', verified_to_num),
    ('not_verified', not_verified_to_num),
    ('source_verified', source_verified_to_num),
)
for dummy_col, encoder in verification_encoders:
    workdata[dummy_col] = rawdata['verification_status'].apply(encoder)

In [13]:
# Loan status has no natural ordering, so it is encoded as dummy (0/1) columns.
def charged_off_to_num(x):
    """Return 1 for either charged-off status label, otherwise 0.

    Both the plain label and its 'Does not meet the credit policy' variant count.
    """
    charged_off_labels = (
        'Charged Off',
        'Does not meet the credit policy. Status:Charged Off',
    )
    for label in charged_off_labels:
        if x == label:
            return 1
    return 0

def default_to_num(x):
    """Return 1 when the loan status is 'Default', otherwise 0."""
    return 1 if x == 'Default' else 0
    
def fully_paid_to_num(x):
    """Return 1 for either fully-paid status label, otherwise 0.

    Both the plain label and its 'Does not meet the credit policy' variant count.
    """
    fully_paid_labels = (
        'Fully Paid',
        'Does not meet the credit policy. Status:Fully Paid',
    )
    for label in fully_paid_labels:
        if x == label:
            return 1
    return 0

# Build one 0/1 dummy column per loan outcome from the raw column.
loan_status_encoders = (
    ('status_charged_off', charged_off_to_num),
    ('status_default', default_to_num),
    ('status_fully_paid', fully_paid_to_num),
)
for dummy_col, encoder in loan_status_encoders:
    workdata[dummy_col] = rawdata['loan_status'].apply(encoder)

In [None]:
# Scratch check of the encoded initial_list_status values; not part of the analysis.
# The rows are counted through the 'id' column: 'member_id' is removed from
# workdata by the cleaning cell, so counting it here raised a KeyError.
pd.pivot_table(workdata, index='initial_list_status', values='id', aggfunc='count',
               fill_value=0, margins=True, margins_name='Total')

In [15]:
# Pairwise correlations of the non-text columns. workdata still holds object
# (string) columns, and recent pandas versions raise on those instead of
# silently skipping them, so they are excluded explicitly, the same way the
# modelling cells below select their features.
workdata.select_dtypes(exclude=['object']).corr()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,annual_inc,...,home_mortgage,home_none,home_other,home_own,verified,not_verified,source_verified,status_charged_off,status_default,status_fully_paid
id,1.0,-0.008016,-0.007999,-0.00766,0.06189,0.019009,-0.021089,-0.032228,0.015342,-0.00152,...,0.007077,-0.025533,-0.027654,0.026735,-0.059648,-0.076477,0.157786,-0.050296,0.005522,-0.21535
loan_amnt,-0.008016,1.0,0.999994,0.999988,0.433666,0.136696,0.95305,-0.155441,0.107117,0.350009,...,0.199203,-4.3e-05,-0.001577,-0.034947,0.32519,-0.398596,0.052753,0.020319,0.003367,-0.071449
funded_amnt,-0.007999,0.999994,1.0,0.999993,0.433653,0.136686,0.953059,-0.155428,0.107131,0.349996,...,0.199197,-4.3e-05,-0.001577,-0.034949,0.325188,-0.398594,0.052753,0.020298,0.003367,-0.071441
funded_amnt_inv,-0.00766,0.999988,0.999993,1.0,0.433636,0.137086,0.953121,-0.155825,0.107076,0.350009,...,0.199129,-3.5e-05,-0.001592,-0.034924,0.325093,-0.398578,0.05285,0.020349,0.003388,-0.071529
term,0.06189,0.433666,0.433653,0.433636,1.0,0.457692,0.184546,-0.474696,0.083272,0.066104,...,0.120179,-0.003143,-0.005209,-0.016123,0.307232,-0.385748,0.060098,0.070347,0.008241,-0.170375
int_rate,0.019009,0.136696,0.136686,0.137086,0.457692,1.0,0.114511,-0.988745,-0.002171,-0.054543,...,-0.138338,0.003483,0.003736,0.017277,0.164727,-0.194636,0.018579,0.170126,0.016151,-0.105982
installment,-0.021089,0.95305,0.953059,0.953121,0.184546,0.114511,1.0,-0.129726,0.087932,0.351127,...,0.15628,0.001761,0.000822,-0.029266,0.276008,-0.336725,0.042998,0.020181,0.002855,-0.030358
grade,-0.032228,-0.155441,-0.155428,-0.155825,-0.474696,-0.988745,-0.129726,1.0,0.000389,0.040424,...,0.129872,-0.003477,-0.00242,-0.01556,-0.16943,0.200408,-0.01935,-0.169966,-0.015504,0.111231
emp_length,0.015342,0.107117,0.107131,0.107076,0.083272,-0.002171,0.087932,0.000389,1.0,0.059398,...,0.18742,0.003801,0.002141,0.012823,0.084426,-0.0891,-0.002407,-0.013477,0.004917,-0.030896
annual_inc,-0.00152,0.350009,0.349996,0.350009,0.066104,-0.054543,0.351127,0.040424,0.059398,1.0,...,0.17891,-0.003543,-0.003854,-0.041736,0.069103,-0.158363,0.093672,-0.051482,-0.004156,0.030653


In [36]:
# Regression of the fully-paid indicator on repayment-progress columns.
# The trailing "-1" in the formula fits the model without an intercept term.
ols_predictors = [
    'out_prncp',
    'out_prncp_inv',
    'total_rec_prncp',
    'last_pymnt_amnt',
    'status_charged_off',
]
ols_formula = 'status_fully_paid ~ ' + ' + '.join(ols_predictors) + ' -1'
ols_model = sm.ols(formula=ols_formula, data=workdata)
result = ols_model.fit()
result.summary()

0,1,2,3
Dep. Variable:,status_fully_paid,R-squared:,0.657
Model:,OLS,Adj. R-squared:,0.657
Method:,Least Squares,F-statistic:,55530.0
Date:,"Fri, 26 Oct 2018",Prob (F-statistic):,0.0
Time:,00:19:44,Log-Likelihood:,-72135.0
No. Observations:,144876,AIC:,144300.0
Df Residuals:,144871,BIC:,144300.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
out_prncp,-0.0010,0.000,-6.396,0.000,-0.001,-0.001
out_prncp_inv,0.0009,0.000,6.214,0.000,0.001,0.001
total_rec_prncp,2.914e-05,1.45e-07,201.514,0.000,2.89e-05,2.94e-05
last_pymnt_amnt,3.157e-05,2.51e-07,125.855,0.000,3.11e-05,3.21e-05
status_charged_off,-0.1328,0.003,-41.975,0.000,-0.139,-0.127

0,1,2,3
Omnibus:,1055.086,Durbin-Watson:,1.74
Prob(Omnibus):,0.0,Jarque-Bera (JB):,899.103
Skew:,0.136,Prob(JB):,5.7899999999999995e-196
Kurtosis:,2.726,Cond. No.,44200.0


In [16]:
# Narrow the data down to what would be known at the signing of a loan contract
# by removing the outstanding-principal, payment-total and last-payment columns.
post_signing_columns = [
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'last_pymnt_amnt',
]
startdata = workdata.drop(post_signing_columns, axis=1)

In [31]:

# Fit a linear regression for the fully-paid indicator using every non-text
# column except the other two loan-outcome dummies.
fullypaidgdata = (
    startdata
    .select_dtypes(exclude=['object'])
    .drop(['status_default', 'status_charged_off'], axis=1)
)
X = fullypaidgdata.drop('status_fully_paid', axis=1)
fully_paid_target = fullypaidgdata.status_fully_paid
lm.fit(X, fully_paid_target)
for label, value in (
    ('Estimated intercept coefficent:', lm.intercept_),
    ('Number of coefficients:', len(lm.coef_)),
    ('R^2:', lm.score(X, fully_paid_target)),
):
    print(label, value)

Estimated intercept coefficent: 0.6785579680848782
Number of coefficients: 35
R^2: 0.11979421006532665


In [34]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.model_selection` has been loaded, so the attribute access
# below only worked as a side effect of other imports.
import sklearn.model_selection

# Hold out 33% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X, fullypaidgdata.status_fully_paid, test_size=0.33, random_state=5)

In [35]:
# Refit a fresh model on the training split only, then predict both splits.
lm = LinearRegression()
lm.fit(X_train, Y_train)
pred_train, pred_test = (lm.predict(split) for split in (X_train, X_test))

In [19]:
# Modelling frame for the chance of default: every non-text column except the
# other two loan-outcome dummies.
defaultingdata = (
    startdata
    .select_dtypes(exclude=['object'])
    .drop(['status_charged_off', 'status_fully_paid'], axis=1)
)


In [30]:
# Pairwise Pearson correlations between the default-model columns.
defaultingdata.corr(method='pearson')

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,annual_inc,...,total_rev_hi_lim,home_rent,home_mortgage,home_none,home_other,home_own,verified,not_verified,source_verified,status_default
id,1.0,-0.008016,-0.007999,-0.00766,0.06189,0.019009,-0.021089,-0.032228,0.015342,-0.00152,...,-0.009119,-0.020226,0.007077,-0.025533,-0.027654,0.026735,-0.059648,-0.076477,0.157786,0.005522
loan_amnt,-0.008016,1.0,0.999994,0.999988,0.433666,0.136696,0.95305,-0.155441,0.107117,0.350009,...,0.251179,-0.18466,0.199203,-4.3e-05,-0.001577,-0.034947,0.32519,-0.398596,0.052753,0.003367
funded_amnt,-0.007999,0.999994,1.0,0.999993,0.433653,0.136686,0.953059,-0.155428,0.107131,0.349996,...,0.251174,-0.184652,0.199197,-4.3e-05,-0.001577,-0.034949,0.325188,-0.398594,0.052753,0.003367
funded_amnt_inv,-0.00766,0.999988,0.999993,1.0,0.433636,0.137086,0.953121,-0.155825,0.107076,0.350009,...,0.2511,-0.184597,0.199129,-3.5e-05,-0.001592,-0.034924,0.325093,-0.398578,0.05285,0.003388
term,0.06189,0.433666,0.433653,0.433636,1.0,0.457692,0.184546,-0.474696,0.083272,0.066104,...,0.0558,-0.113899,0.120179,-0.003143,-0.005209,-0.016123,0.307232,-0.385748,0.060098,0.008241
int_rate,0.019009,0.136696,0.136686,0.137086,0.457692,1.0,0.114511,-0.988745,-0.002171,-0.054543,...,-0.155663,0.131903,-0.138338,0.003483,0.003736,0.017277,0.164727,-0.194636,0.018579,0.016151
installment,-0.021089,0.95305,0.953059,0.953121,0.184546,0.114511,1.0,-0.129726,0.087932,0.351127,...,0.231297,-0.143979,0.15628,0.001761,0.000822,-0.029266,0.276008,-0.336725,0.042998,0.002855
grade,-0.032228,-0.155441,-0.155428,-0.155825,-0.474696,-0.988745,-0.129726,1.0,0.000389,0.040424,...,0.141843,-0.124227,0.129872,-0.003477,-0.00242,-0.01556,-0.16943,0.200408,-0.01935,-0.015504
emp_length,0.015342,0.107117,0.107131,0.107076,0.083272,-0.002171,0.087932,0.000389,1.0,0.059398,...,0.061913,-0.199338,0.18742,0.003801,0.002141,0.012823,0.084426,-0.0891,-0.002407,0.004917
annual_inc,-0.00152,0.350009,0.349996,0.350009,0.066104,-0.054543,0.351127,0.040424,0.059398,1.0,...,0.274067,-0.159903,0.17891,-0.003543,-0.003854,-0.041736,0.069103,-0.158363,0.093672,-0.004156


#The stuff below this comment is not finished. I probably won't be able to finish until Thursday.

In [23]:
# Features are every column except the default indicator, which is the target.
X = defaultingdata.drop(columns=['status_default'])
default_target = defaultingdata['status_default']
lm.fit(X, default_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
# Intercept of the model fitted on the default indicator.
default_intercept = lm.intercept_
print('Estimated intercept coefficent:', default_intercept)

Estimated intercept coefficent: -0.0072346542811625655


In [28]:
# Number of fitted coefficients (one per feature column).
coefficient_count = len(lm.coef_)
print('Number of coefficients:', coefficient_count)

Number of coefficients: 35


In [29]:
# R^2 of the default model, scored on the same rows it was fitted on.
default_r_squared = lm.score(X, defaultingdata.status_default)
print('R^2:', default_r_squared)

R^2: 0.0006274808652619868
