## Loan Data Prep

In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import date
from ipykernel import kernelapp as app

## Read in Loan Data

In [2]:
# Load the four 2017 quarterly loan exports, one DataFrame per quarter.
# Every file is read with the same options:
#   - encoding="ISO-8859-1": the files are decoded as Latin-1
#   - skiprows=1 / skipfooter=2: the first line and the last two lines of each
#     file are dropped (presumably non-data banner/summary lines -- confirm
#     against the raw files)
#   - engine="python": skipfooter is not supported by the default C parser
df = pd.read_csv(
    "static/data/LoanStats_2017Q1.csv",
    encoding="ISO-8859-1",
    engine="python",
    skiprows=1,
    skipfooter=2,
)
df2 = pd.read_csv(
    "static/data/LoanStats_2017Q2.csv",
    encoding="ISO-8859-1",
    engine="python",
    skiprows=1,
    skipfooter=2,
)
df3 = pd.read_csv(
    "static/data/LoanStats_2017Q3.csv",
    encoding="ISO-8859-1",
    engine="python",
    skiprows=1,
    skipfooter=2,
)
df4 = pd.read_csv(
    "static/data/LoanStats_2017Q4.csv",
    encoding="ISO-8859-1",
    engine="python",
    skiprows=1,
    skipfooter=2,
)

In [3]:
# Inspect the column labels of the 2017 Q1 frame (shown as the cell output).
df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'disbursement_method', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=145)

In [4]:
# List the distinct values of the 'purpose' column in the 2017 Q1 frame,
# in order of first appearance.
pd.unique(df["purpose"])

array(['debt_consolidation', 'home_improvement', 'car', 'major_purchase',
       'credit_card', 'vacation', 'medical', 'other', 'house', 'moving',
       'small_business', 'renewable_energy'], dtype=object)

In [5]:
# Load the three 2018 quarterly loan exports (Q1-Q3), one DataFrame per quarter.
# Every file is read with the same options:
#   - encoding="ISO-8859-1": the files are decoded as Latin-1
#   - skiprows=1 / skipfooter=2: the first line and the last two lines of each
#     file are dropped (presumably non-data banner/summary lines -- confirm
#     against the raw files)
#   - engine="python": skipfooter is not supported by the default C parser
df5 = pd.read_csv(
    "static/data/LoanStats_2018Q1.csv",
    encoding="ISO-8859-1",
    engine="python",
    skiprows=1,
    skipfooter=2,
)
df6 = pd.read_csv(
    "static/data/LoanStats_2018Q2.csv",
    encoding="ISO-8859-1",
    engine="python",
    skiprows=1,
    skipfooter=2,
)
df7 = pd.read_csv(
    "static/data/LoanStats_2018Q3.csv",
    encoding="ISO-8859-1",
    engine="python",
    skiprows=1,
    skipfooter=2,
)

In [6]:
# Inspect the column labels of the 2018 Q1 frame (shown as the cell output).
df5.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'disbursement_method', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=145)

In [7]:
# List the distinct values of the 'purpose' column in the 2018 Q1 frame,
# in order of first appearance.
pd.unique(df5["purpose"])

array(['credit_card', 'debt_consolidation', 'house', 'home_improvement',
       'major_purchase', 'medical', 'other', 'moving', 'car',
       'small_business', 'vacation', 'renewable_energy', 'wedding'],
      dtype=object)

## Merge datasets

In [8]:
# Stack the four 2017 quarterly frames row-wise into a single frame.
#   - sort=False leaves the column labels unsorted if the frames' columns
#     are not already aligned.
#   - ignore_index=True renumbers the rows 0..n-1. Without it every quarter
#     keeps its own 0-based row labels, so loan_df would carry duplicate index
#     values and label-based lookups (e.g. loan_df.loc[0]) would match one row
#     per quarter.
merge = [df, df2, df3, df4]
loan_df = pd.concat(merge, sort=False, ignore_index=True)
loan_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,16000,16000,16000.0,60 months,12.74%,361.93,C,C1,...,,,Cash,N,,,,,,
1,,,11875,11875,11875.0,36 months,11.44%,391.26,B,B4,...,,,Cash,N,,,,,,
2,,,12000,12000,12000.0,36 months,7.99%,375.99,A,A5,...,,,Cash,N,,,,,,
3,,,25000,25000,25000.0,36 months,15.99%,878.81,C,C5,...,,,Cash,N,,,,,,
4,,,1500,1500,1500.0,36 months,5.32%,45.18,A,A1,...,,,Cash,N,,,,,,


In [9]:
# Stack the three 2018 quarterly frames (Q1-Q3) row-wise into a single frame.
#   - sort=False leaves the column labels unsorted if the frames' columns
#     are not already aligned.
#   - ignore_index=True renumbers the rows 0..n-1. Without it every quarter
#     keeps its own 0-based row labels, so loan2_df would carry duplicate index
#     values and label-based lookups (e.g. loan2_df.loc[0]) would match one row
#     per quarter.
merge2 = [df5, df6, df7]
loan2_df = pd.concat(merge2, sort=False, ignore_index=True)
loan2_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,1000,1000,1000.0,36 months,17.47%,35.89,D,D1,...,,,Cash,N,,,,,,
1,,,3200,3200,3200.0,36 months,7.96%,100.22,A,A5,...,,,Cash,N,,,,,,
2,,,30000,30000,30000.0,36 months,7.34%,930.99,A,A4,...,,,Cash,N,,,,,,
3,,,10000,10000,10000.0,36 months,7.96%,313.18,A,A5,...,,,Cash,N,,,,,,
4,,,35000,35000,35000.0,36 months,16.01%,1230.67,C,C5,...,,,Cash,N,,,,,,


## Retain Secured Loan Data Only

In [10]:
# Keep only the 2017 loans whose 'purpose' is one of the categories this
# notebook treats as secured lending; all columns are retained.
type_list = ("car", "house", "home_improvement")
secured_df = loan_df[loan_df["purpose"].isin(type_list)]
secured_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
1,,,11875,11875,11875.0,36 months,11.44%,391.26,B,B4,...,,,Cash,N,,,,,,
4,,,1500,1500,1500.0,36 months,5.32%,45.18,A,A1,...,,,Cash,N,,,,,,
7,,,35000,35000,35000.0,60 months,25.49%,1037.38,E,E4,...,,,Cash,N,,,,,,
25,,,12000,12000,12000.0,36 months,6.99%,370.48,A,A2,...,,,Cash,N,,,,,,
30,,,14000,14000,14000.0,36 months,8.24%,440.27,B,B1,...,,,Cash,N,,,,,,


In [11]:
# Keep only the 2018 loans whose 'purpose' is one of the categories this
# notebook treats as secured lending; all columns are retained.
# The filter uses the list defined in this cell (type_list2), so the cell does
# not depend on a variable left over from the 2017 cell and edits to
# type_list2 actually take effect.
type_list2 = ('car','house','home_improvement')
secured_df2 = loan2_df.loc[loan2_df['purpose'].isin(type_list2)]
secured_df2.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
2,,,30000,30000,30000.0,36 months,7.34%,930.99,A,A4,...,,,Cash,N,,,,,,
5,,,9000,9000,9000.0,36 months,5.31%,271.0,A,A1,...,,,Cash,N,,,,,,
16,,,35000,35000,35000.0,60 months,24.84%,1024.02,E,E3,...,,,Cash,N,,,,,,
17,,,15000,15000,15000.0,36 months,6.07%,456.81,A,A2,...,,,Cash,N,,,,,,
38,,,20000,20000,20000.0,60 months,26.77%,607.97,E,E5,...,,,Cash,N,,,,,,


## Write out clean csv files

In [12]:
# Write the filtered 2017 loans to the clean CSV that is saved in the GitHub
# repository. The header row is written; the DataFrame's row index is not.
secured_df.to_csv(
    "DataSets/clean_loan_data_2017.csv",
    header=True,
    index=False,
)

In [13]:
# Write the filtered 2018 loans to the clean CSV that is saved in the GitHub
# repository. The header row is written; the DataFrame's row index is not.
secured_df2.to_csv(
    "DataSets/clean_loan_data_2018.csv",
    header=True,
    index=False,
)