In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pprint

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('loan.csv') #save data to a pandas dataframe

In [3]:
df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

# This chunk gets the data used for the proposal

In [4]:
usefulColumns = ['total']
bankIncomeColumns = ['recoveries', 'collection_recovery_fee', ]

#we have loan amount and total payment, maybe use these
# benefit of lending a loan for the bank = avg interest paid (total_rec_int) + avg late payments recived (tatal_rec_late_fee)
benies = df['total_rec_int'].mean() + df['total_rec_late_fee'].mean()
benies

1755.2010184318683

In [5]:
# cost of defaulting on a loan = avg loan amount (loan_amnt)
    #- avg recoveries (recoveries) + avg collection recovery fee (collection_recovery_fee)
d = (df['loan_amnt'] - df['total_pymnt'])
defaultCost = d.mean() - df['recoveries'].mean() + df['collection_recovery_fee'].mean()
defaultCost

7155.399435454788

# Cost Matrix

In [6]:

#                  predicted default           predicted paid
# real default     0                           14714
#
# real paid        1755                        -1755
# wrong = 1755 + 14,714 = 16,469
# correct = - 1755

# This chunk will clean the data into clean_loan.csv

In [7]:
df['id']

0          1077501
1          1077430
2          1077175
3          1076863
4          1075358
            ...   
887374    36371250
887375    36441262
887376    36271333
887377    36490806
887378    36271262
Name: id, Length: 887379, dtype: int64

In [8]:
#don't understand: loan_amnt vs funded_amnt vs funded_amnt_inv
useful_columns = ['id', 'loan_amnt']
print(df.id.count()) #total number of rows

#how many rows have loan amnt != funded_amnt?
df[df.loan_amnt != df.funded_amnt].id

887379


61        1068934
66        1069093
70        1069043
77        1068416
80        1068994
           ...   
230644    1042487
230646    1063649
230647    1062754
230650    1062334
230656    1058291
Name: id, Length: 2062, dtype: int64

In [9]:
df[df.loan_amnt < df.funded_amnt].id # => all loans are >= funded_amnt

Series([], Name: id, dtype: int64)

I have no idea what the difference/significance is of loan_amnt vs funded_amnt_ vd funded_amnt_inv. I will just use loan_amnt. I believe they are not useful

In [10]:
# term may be useful for a decision tree
# int_rate is included. Lower interest rate => better credit score => likey to pay off?
# installment is included bc a lower payment is easier to pay => no default?
# grade and sub_grade are not included bc it just determines the interest rate see:
#                  https://www.lendingclub.com/foliofn/rateDetail.action
# emp_title is the employment title of borrower, not included bc a loan can be defaulted by anyone
# emp_length included bc long term employment => no default
# home_ownership included and will be transfomed into a boolean (own home vs not) bc home owners have collateral
# annual_inc (annual income) included bs more money => no default
# verification_status included, will be changed to boolean (people lie about cash flow)
# issue_d - the month the loan was funded was left out 
####### loan_status (Fully Paid vs Charged Off) is defaulted vs not########
# 

useful_columns.append(['term', 'int_rate', 'installment', 'emp_length', 'home_ownership', 'annula_inc', \
                     'verification_status', 'loan_status'])

Should pymnt_plan be included? payment plan => no default??

In [11]:
# how many, out of the total entries, have a payment plan?
print('Out of ' + str(df.id.count()) + ' rows, '  + str(df[df.pymnt_plan != 'n'].id.count()) + ' have payment plan')

Out of 887379 rows, 10 have payment plan


pymnt_plan data is super skewed, will not use

In [12]:
# url takes you to the lending club website
# desc is a description of why the loan was gotten, left out bc no (reasonable) way to turn into numbers
# purpose similar to desc
# title similar to desc
# zip_code (may need to be changed to float) used bc first 3 numbers tell what region borrower is in
# addr_state left out bc that infor is in zip_code
# dti is debt to income ratio
# deling_2yrs is the number of times borrower has been deliquent on any reported line of credit in 2yrs
# earliest_cr_line used due to data type and hopefully doesnt hurt anything
# inq_last_6mths used bc more inquiries => default
# 
useful_columns.append(['zip_code', 'dti', 'delinq_2yrs', 'inq_last_6mths'])

In [14]:
df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

In [19]:
# mths_since_last_delinq included if full enough bc longer time => no default
df['mths_since_last_delinq']
# is the column at least 75% full?
if((df['mths_since_last_delinq'].dropna().count()) > ((3/4) * (df['mths_since_last_delinq'].count()))):
    print('mths_since_last_delinq is at least 75% full')
else:
    print('mths_since_last_delinq is not at least 75% full')
df['mths_since_last_delinq'].dropna()

mths_since_last_delinq is at least 75% full


3         35.0
4         38.0
16        61.0
18         8.0
27        20.0
          ... 
887370    69.0
887371    65.0
887372     9.0
887375    26.0
887377    22.0
Name: mths_since_last_delinq, Length: 433067, dtype: float64

In [29]:
# mths_since_last_record is months since last public record. Not sure record of what so not included
# open_acc is the number of open credit lines in borrowers name, inluded bc more => no default 
# pub_rec is the number of derogetory marks on public record included bc less marks => on time payments
# revol_bal is the total credit revolving balance not included bc revol_util is included
# revol_util is the amount of credit used relative to total available
# total_acc is total credit lines on file (so including paid off ones) may be used to create new col with open_acc
# initial_list_status is w (whole) or f(fractional) since it can be converted to binary its kept
useful_columns.append(['open_acc', 'pub_rec', 'revol_util', 'total_acc'] )

# how many initial_list_status are f?
print(df[df['initial_list_status'] == 'f'].count())
print(df.initial_list_status.count())
# so about half => not super skewed

id                  456848
member_id           456848
loan_amnt           456848
funded_amnt         456848
funded_amnt_inv     456848
                     ...  
all_util              2645
total_rev_hi_lim    386572
inq_fi                2645
total_cu_tl           2645
inq_last_12m          2645
Length: 74, dtype: int64
887379


In [31]:
#out_prncp amount left
useful_columns.append(['initial_list_status', 'out_prncp'])

# how many rows have 0 in out_prncp?
(df[df['out_prncp'] == 0].count()) / (df['out_prncp'].count())
# so about 29% of rows that have out_prncp are 0 => enough data to keep

id                  0.288262
member_id           0.288262
loan_amnt           0.288262
funded_amnt         0.288262
funded_amnt_inv     0.288262
                      ...   
all_util            0.000165
total_rev_hi_lim    0.213371
inq_fi              0.000165
total_cu_tl         0.000165
inq_last_12m        0.000165
Length: 74, dtype: float64

In [None]:
# out_prncp_inv is amount remaining on loan by investors. I don't understand the use so not included
# total_pymnt is payments recived to date for total amnt funded
# useful_columns,append([''])