In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
from pingouin import ttest, anova

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Let pandas render up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500

---

# <center>**Loan Analysis**</center>

---

In [None]:
# Read the interim loan/borrower CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
loan_borrowers = pd.read_csv(filepath_or_buffer='../../data/interim/loan_borrower.csv')

---

In [None]:
from typing import List, Optional, Union


def _humanize(column: str) -> str:
   """Turn a snake_case column name into space-separated words for plot titles."""
   return column.replace('_', ' ')


def _default_probability_table(df: pd.DataFrame, group: List[str]) -> pd.DataFrame:
   """Count rows per (<group>, loan_status) and add each count's share within its <group> level."""
   counts = df.groupby(group + ['loan_status']).size().reset_index(name='count')
   # Denominator: total number of rows in the same <group> level, broadcast
   # back onto every (<group>, loan_status) row.
   level_totals = counts.groupby(group)['count'].transform('sum')
   counts['probability'] = counts['count'] / level_totals
   return counts


def visualize_pd_by(
   group: Union[str, List[str]], df: Optional[pd.DataFrame] = None
) -> 'plotly.graph_objects.Figure':
   """Visualize the probability of default within each level of <group>.

   The "probability" plotted is the share of rows carrying each `loan_status`
   value among all rows of the same <group> level.

   Parameters
   ----------
   group
      A column name, or a list of one or two column names. With two names,
      the first becomes the facet column and the second the x-axis.
   df
      Frame containing the <group> column(s) and a `loan_status` column.
      When omitted, the notebook-level `loan_borrowers` frame is used; it is
      looked up at call time so a reloaded frame is picked up automatically.

   Returns
   -------
   The bar-chart figure produced by `px.bar`.

   Raises
   ------
   ValueError
      If <group> does not contain exactly one or two column names.
   """
   group = [group] if isinstance(group, str) else list(group)
   if not 1 <= len(group) <= 2:
      raise ValueError(
         f'group must name one or two columns, got {len(group)}: {group!r}'
      )

   if df is None:
      # Resolved here rather than in the signature: a default argument would
      # freeze whichever frame existed when this cell was executed.
      df = loan_borrowers

   group_status_count = _default_probability_table(df, group)

   if len(group) == 1:
      return px.bar(
         group_status_count, x=group[0], y='probability', color='loan_status', text_auto=True,
         title=f'How does probability of default vary across {_humanize(group[0])}?'
      )

   facet, x_axis = group
   return px.bar(
      group_status_count, x=x_axis, y='probability', color='loan_status', facet_col=facet,
      text_auto=True,
      title=f'How does probability of default vary with {_humanize(x_axis)} across {_humanize(facet)}?'
   )

### **Visualize Loan Status Against Loan Terms**
- loan terms &uarr; &rarr; PD &uarr;

In [None]:
# Bare last expression so the returned figure is rendered as the cell output.
visualize_pd_by('term')

The probability of default is pretty much the same across term lengths, except that 2-month loans have a slightly lower probability of default; the difference is not a big deal.

### **Visualize Loan Status Against Loan Grade**
- loan grade &darr; &rarr; PD &uarr;

In [None]:
# Bare last expression so the returned figure is rendered as the cell output.
visualize_pd_by('grade')

As expected, the probability of default increases as the loan grade gets lower.

### **Visualize Loan Status Against Purpose for Borrowing**
- purpose for debt consolidation &rarr; PD &uarr; 

In [None]:
# Bare last expression so the returned figure is rendered as the cell output.
visualize_pd_by('purpose')

Hold on... There is a significantly greater probability of default for loans taken out for healthcare reasons.

### **Visualize Loan Status Against Joint Application**
- joint application &rarr; PD &uarr;   

In [None]:
# Bare last expression so the returned figure is rendered as the cell output.
visualize_pd_by('is_joint_application')

The probability of default is basically the same whether or not the application is joint. Not a big deal...