In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
# Load the raw loan table; the file is tab-separated, which is read_table's default.
data_loan = pd.read_table('../../data/raw/Loan.txt')

In [4]:
# Load the raw borrower table; the file is tab-separated, which is read_table's default.
data_borrower = pd.read_table('../../data/raw/Borrower.txt')

---

# <center>**Loan**</center>

---

In [5]:
# Preview the first five loan records.
data_loan.head(n=5)

Unnamed: 0,loanId,memberId,date,purpose,isJointApplication,loanAmount,term,interestRate,monthlyPayment,grade,loanStatus
0,1888978,2305095,12/10/2014,debtconsolidation,0.0,25190.0,60 months,6.25,490,E3,Current
1,1299695,2610493,9/15/2014,debtconsolidation,0.0,21189.0,60 months,10.49,455,B3,Current
2,1875016,2491679,9/11/2014,debtconsolidation,0.0,29908.0,60 months,9.11,622,B2,Current
3,1440478,2092798,4/22/2016,homeimprovement,0.0,13053.0,48 months,11.89,343,B3,Current
4,1124634,2633077,2/3/2016,debtconsolidation,0.0,24613.0,60 months,15.13,587,A3,Current


In [6]:
# Print each loan column's dtype and non-null count, plus the frame's memory usage.
data_loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   loanId              100000 non-null  int64  
 1   memberId            100000 non-null  int64  
 2   date                100000 non-null  object 
 3   purpose             100000 non-null  object 
 4   isJointApplication  99029 non-null   float64
 5   loanAmount          98994 non-null   float64
 6   term                98929 non-null   object 
 7   interestRate        100000 non-null  float64
 8   monthlyPayment      100000 non-null  int64  
 9   grade               100000 non-null  object 
 10  loanStatus          100000 non-null  object 
dtypes: float64(3), int64(3), object(5)
memory usage: 8.4+ MB


### **Columns Screen**

In [7]:
# `date` is still stored as strings, so describe() reports count/unique/top/freq.
data_loan['date'].describe()

count         100000
unique           780
top       12/20/2015
freq             162
Name: date, dtype: object

In [8]:
# Frequency of each loan purpose level.
data_loan['purpose'].value_counts()

debtconsolidation    81344
homeimprovement       8116
other                 4118
business              3210
healthcare            1229
education             1003
auto                   980
Name: purpose, dtype: int64

In [9]:
# Frequency of the joint-application flag (value_counts skips missing values by default).
data_loan['isJointApplication'].value_counts()

0.0    94061
1.0     4968
Name: isJointApplication, dtype: int64

In [10]:
# Frequency of each loan term label (value_counts skips missing values by default).
data_loan['term'].value_counts()

60 months    33971
48 months    32486
36 months    32472
Name: term, dtype: int64

In [11]:
# Frequency of each loan grade.
data_loan['grade'].value_counts()

A1    11251
A2    10574
A3    10416
B1     8553
B3     8279
B2     8014
C1     7030
C2     6826
C3     5573
D2     4513
D1     4488
D3     3917
E3     3614
E1     3530
E2     3422
Name: grade, dtype: int64

In [12]:
# Frequency of each loan status.
data_loan['loanStatus'].value_counts()

Current    89996
Default    10004
Name: loanStatus, dtype: int64

In [13]:
# Summary statistics for the three numeric loan measures.
data_loan.loc[
    :, ['loanAmount', 'interestRate', 'monthlyPayment']
].describe()

Unnamed: 0,loanAmount,interestRate,monthlyPayment
count,98994.0,100000.0,100000.0
mean,20658.524587,10.977798,552.44862
std,4731.560429,3.904456,165.072058
min,1618.0,4.02,42.0
25%,17480.0,7.98,434.0
50%,20600.0,10.46,534.0
75%,23786.0,13.44,655.0
max,42403.0,31.7,1575.0


### **Missing Values**

In [14]:
# Count the missing values in each loan column.
data_loan.isna().sum()

loanId                   0
memberId                 0
date                     0
purpose                  0
isJointApplication     971
loanAmount            1006
term                  1071
interestRate             0
monthlyPayment           0
grade                    0
loanStatus               0
dtype: int64

In [15]:
# Rows where loanAmount and term are both missing.
data_loan[data_loan[['loanAmount', 'term']].isna().all(axis=1)]

Unnamed: 0,loanId,memberId,date,purpose,isJointApplication,loanAmount,term,interestRate,monthlyPayment,grade,loanStatus
7574,1476191,2151629,5/4/2016,business,0.0,,,9.44,488,E3,Default
9043,1516436,2370473,11/18/2015,debtconsolidation,0.0,,,10.82,592,C3,Current
9251,1004279,2600631,1/12/2016,business,0.0,,,9.08,503,A2,Current
14073,1459019,2650588,8/1/2015,business,0.0,,,5.8,262,A2,Current
14089,1168275,2041818,3/13/2015,homeimprovement,0.0,,,8.72,378,B2,Current
22547,1085492,2609872,11/2/2014,debtconsolidation,0.0,,,13.0,531,C2,Current
29807,1364083,2005229,11/9/2015,debtconsolidation,0.0,,,7.47,336,A1,Current
30616,1689667,2204765,1/8/2016,debtconsolidation,0.0,,,9.26,380,C2,Current
34149,1894398,2533161,1/30/2016,debtconsolidation,0.0,,,10.28,563,A3,Current
41500,1750524,2790974,4/7/2015,debtconsolidation,0.0,,,8.77,611,E1,Current


### **Duplicates**

In [16]:
# Number of loan rows that exactly repeat an earlier row.
data_loan.duplicated(keep='first').sum()

0

---

### **Actions Required**

- Fix column names
- Convert date to pandas datetime format
- Separate the words within each purpose level with a space (e.g. `debtconsolidation` → `debt consolidation`)
- Extract the number of months from the term strings (e.g. `60 months` → `60`)
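- Impute a missing loan amount or term by back-calculating it from the monthly payment, the interest rate, and whichever of the two (term or loan amount) is present; rows missing both (see the table above) cannot be recovered this way and need separate handling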

---

# <center>**Borrower**</center>

---

In [17]:
# Preview the first five borrower records.
data_borrower.head(n=5)

Unnamed: 0,memberId,residentialState,yearsEmployment,homeOwnership,annualIncome,incomeVerified,dtiRatio,lengthCreditHistory,numTotalCreditLines,numOpenCreditLines,numOpenCreditLines1Year,revolvingBalance,revolvingUtilizationRate,numDerogatoryRec,numDelinquency2Years,numChargeoff1year,numInquiries6Mon
0,2305095,NM,10+ years,rent,56471,1,16.8,6,11,9.0,6,14301,49.02,0,19,10,0
1,2610493,WA,2-5 years,rent,55038,0,19.99,22,8,7.0,4,18262,72.4,1,0,0,0
2,2491679,MS,< 1 year,rent,56610,1,14.33,5,8,5.0,5,10799,66.27,0,1,1,0
3,2092798,TX,6-9 years,own,54887,1,14.8,12,14,7.0,3,15272,61.05,1,0,0,3
4,2633077,MA,2-5 years,rent,53522,1,10.14,4,21,19.0,10,19316,56.39,2,14,7,1


In [18]:
# Print each borrower column's dtype and non-null count.
data_borrower.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   memberId                  100000 non-null  int64  
 1   residentialState          100000 non-null  object 
 2   yearsEmployment           100000 non-null  object 
 3   homeOwnership             100000 non-null  object 
 4   annualIncome              100000 non-null  int64  
 5   incomeVerified            100000 non-null  int64  
 6   dtiRatio                  100000 non-null  float64
 7   lengthCreditHistory       100000 non-null  int64  
 8   numTotalCreditLines       100000 non-null  int64  
 9   numOpenCreditLines        99033 non-null   float64
 10  numOpenCreditLines1Year   100000 non-null  int64  
 11  revolvingBalance          100000 non-null  int64  
 12  revolvingUtilizationRate  100000 non-null  float64
 13  numDerogatoryRec          100000 non-null  in

### **Columns Screen**

In [19]:
# Number of borrowers per residential state.
data_borrower['residentialState'].value_counts()

CA    7057
FL    6603
DC    5396
NY    5161
WA    3477
TX    3432
IA    2975
IL    2913
ID    2845
IN    2800
RI    1790
NV    1771
GA    1767
KS    1755
MN    1751
SD    1745
MD    1744
MI    1741
SC    1738
LA    1732
ME    1728
NH    1725
MO    1718
MS    1709
TN    1707
NJ    1694
MA    1694
NE    1688
PA    1674
KY    1664
MT    1661
OR    1659
NM    1644
DE    1609
CT    1605
CO    1575
AR    1486
AL    1484
AZ    1467
OH     946
WV     907
NC     901
VA     874
UT     845
OK     843
VT     843
WI     833
WY     818
ND     806
Name: residentialState, dtype: int64

In [20]:
# Number of borrowers per employment-length bucket.
data_borrower['yearsEmployment'].value_counts()

10+ years    22389
6-9 years    19945
< 1 year     19401
1 year       19211
2-5 years    19054
Name: yearsEmployment, dtype: int64

In [21]:
# Number of borrowers per home-ownership category.
data_borrower['homeOwnership'].value_counts()

mortgage    36187
rent        32797
own         31016
Name: homeOwnership, dtype: int64

In [22]:
# Frequency of the income-verified flag.
data_borrower['incomeVerified'].value_counts()

1    68607
0    31393
Name: incomeVerified, dtype: int64

In [23]:
# Summary statistics for the numeric borrower columns, excluding the memberId key.
data_borrower.drop(columns='memberId').describe(include=np.number)

Unnamed: 0,annualIncome,incomeVerified,dtiRatio,lengthCreditHistory,numTotalCreditLines,numOpenCreditLines,numOpenCreditLines1Year,revolvingBalance,revolvingUtilizationRate,numDerogatoryRec,numDelinquency2Years,numChargeoff1year,numInquiries6Mon
count,100000.0,100000.0,100000.0,100000.0,100000.0,99033.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,54718.01314,0.68607,17.281603,10.92228,15.01582,11.274434,7.60496,14848.97021,65.090856,0.84446,1.61339,1.23613,1.45747
std,3178.627286,0.464091,5.100633,10.673098,4.007494,3.770307,3.407668,2507.838336,15.286859,2.519157,4.062456,3.120854,3.303967
min,38328.0,0.0,0.0,1.0,1.0,1.0,1.0,4291.0,0.0,0.0,0.0,0.0,0.0
25%,52648.0,0.0,13.82,4.0,12.0,9.0,5.0,13150.0,54.52,0.0,0.0,0.0,0.0
50%,54781.0,1.0,17.24,6.0,15.0,11.0,7.0,14852.0,65.18,0.0,0.0,0.0,0.0
75%,56866.0,1.0,20.72,15.0,18.0,14.0,10.0,16537.25,75.57,0.0,1.0,1.0,1.0
max,69553.0,1.0,40.51,40.0,32.0,29.0,26.0,25802.0,100.0,15.0,20.0,20.0,19.0


### **Missing Values**

In [24]:
# Count the missing values in each borrower column.
data_borrower.isna().sum()

memberId                      0
residentialState              0
yearsEmployment               0
homeOwnership                 0
annualIncome                  0
incomeVerified                0
dtiRatio                      0
lengthCreditHistory           0
numTotalCreditLines           0
numOpenCreditLines          967
numOpenCreditLines1Year       0
revolvingBalance              0
revolvingUtilizationRate      0
numDerogatoryRec              0
numDelinquency2Years          0
numChargeoff1year             0
numInquiries6Mon              0
dtype: int64

### **Duplicates**

In [25]:
# Number of borrower rows that exactly repeat an earlier row.
data_borrower.duplicated(keep='first').sum()

0

---

# <center>**Foreign Key Relationships**</center>

---

- **memberId in the loan table is a foreign key to the borrower table**

In [26]:
# Total number of loan rows, for comparison with the distinct memberId count below.
len(data_loan)

100000

In [27]:
# Number of distinct memberId values in the loan table.
data_loan['memberId'].nunique()

100000

- **Check whether there are any borrowers in the loan table that are not in the borrower table**

In [28]:
# memberId values present in the loan table but absent from the borrower table.
set(data_loan['memberId'].unique()) - set(data_borrower['memberId'].unique())

set()

Good! The difference is empty, so every `memberId` in the loan table also appears in the borrower table.