In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [32]:
import warnings
warnings.filterwarnings(action='ignore')

In [33]:
pd.set_option('display.max_columns', 500)

---

In [34]:
data_loan_borrowers = pd.read_csv('../../data/interim/loan_borrower.csv')

---

# <center>**Correlation Analysis**</center>

---

In [76]:
loan_borrowers = data_loan_borrowers.copy()

In [86]:
loan_borrowers = loan_borrowers.drop(columns=['loan_id', 'member_id', 'date', 'term'])

In [78]:
loan_borrowers = loan_borrowers.loc[~loan_borrowers['loan_amount'].isna()]

In [79]:
loan_borrowers.head(1)

Unnamed: 0,purpose,is_joint_application,loan_amount,term,term_in_months,interest_rate,monthly_payment,grade,loan_status,residential_state,years_employment,home_ownership,annual_income,income_verified,dti_ratio,length_credit_history,n_total_credit_lines,n_open_credit_lines,n_open_credit_lines_1_year,revolving_balance,revolving_utilization_rate,n_derogatory_record,n_delinquency_2_years,n_charge_off_1_year,n_inquiries_6_months
0,Debt Consolidation,0.0,25190.0,60 months,60.0,6.25,490,E3,Current,NM,10+ years,rent,56471,1,16.8,6,11,9.0,6,14301,49.02,0,19,10,0


### **Variables for Encoding**

- Impute with average loan amount for each group
   - purpose
   - grade
   - loan_status
- Impute with average annual income for each group
   - residential_state
   - home_ownership
- Convert to ordinal scale
   - years_employment


In [80]:
def encode_label(col: str, impute: str, df: pd.DataFrame = loan_borrowers) -> None:
   """Encode all levels in the categorical variable <col>
   with the average value of <impute>.

   - impute = 'loan_amount'/'annual_income'
   """
   cat_dict = {}
   cats = df[col].unique().tolist()
   for cat in cats:
      cat_dict[cat] = df.loc[df[col] == cat, impute].mean()
   df[col] = df[col].map(cat_dict)

In [81]:
loan_amount_cols = ['purpose', 'grade', 'loan_status']
annual_income_cols = ['residential_state', 'home_ownership']

In [82]:
for col in loan_amount_cols:
   encode_label(col=col, impute='loan_amount')
   loan_borrowers[col] = loan_borrowers[col].astype('float')

In [83]:
for col in annual_income_cols:
   encode_label(col=col, impute='annual_income')
   loan_borrowers[col] = loan_borrowers[col].astype('float')

In [90]:
loan_borrowers['years_employment'] = pd.factorize(loan_borrowers['years_employment'])[0]

In [91]:
loan_borrowers.head()

Unnamed: 0,purpose,is_joint_application,loan_amount,term_in_months,interest_rate,monthly_payment,grade,loan_status,residential_state,years_employment,home_ownership,annual_income,income_verified,dti_ratio,length_credit_history,n_total_credit_lines,n_open_credit_lines,n_open_credit_lines_1_year,revolving_balance,revolving_utilization_rate,n_derogatory_record,n_delinquency_2_years,n_charge_off_1_year,n_inquiries_6_months
0,20644.574441,0.0,25190.0,60.0,6.25,490,21148.285082,20451.398677,54748.165043,0,54724.243451,56471,1,16.8,6,11,9.0,6,14301,49.02,0,19,10,0
1,20644.574441,0.0,21189.0,60.0,10.49,455,20625.598091,20451.398677,54734.720656,1,54724.243451,55038,0,19.99,22,8,7.0,4,18262,72.4,1,0,0,0
2,20644.574441,0.0,29908.0,60.0,9.11,622,20514.242761,20451.398677,54690.82797,2,54724.243451,56610,1,14.33,5,8,5.0,5,10799,66.27,0,1,1,0
3,20668.662682,0.0,13053.0,48.0,11.89,343,20625.598091,20451.398677,54749.640233,3,54706.056369,54887,1,14.8,12,14,7.0,3,15272,61.05,1,0,0,3
4,20644.574441,0.0,24613.0,60.0,15.13,587,20475.886318,20451.398677,54602.958678,1,54724.243451,53522,1,10.14,4,21,19.0,10,19316,56.39,2,14,7,1


---

# <center>**Correlation Matrix**</center>

---

In [106]:
px.imshow(loan_borrowers.corr())

- term_in_months and monthly_payment are moderately negatively correlated
- grade and loan status are weakly positively correlated
- n_delinquency_2_years and n_charge_off_1_year are strongly positively correlated

---

**Most features and hardly correlated. Should use some feature selection techniques to help filter out unsignificant variables**

---