### Credit Risk Assessment

A business use case for Maybank

Author: bedezub  
Date: 2023-05-15

Import libraries

In [61]:
## Basic Libraries:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings("ignore")

## For making sample data:
from sklearn.datasets import make_classification

## For Preprocessing: 
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, RepeatedKFold,RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## Using imblearn library:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

Load Data

In [62]:
credit_df = pd.read_csv('./data/credit_risk_dataset.csv')

Data Exploratory and Preprocessing

1. Check for duplicates

In [63]:
duplicate_credit_df = credit_df.duplicated()
duplicate_credit_df.value_counts()

False    32416
True       165
dtype: int64

In [64]:
credit_df.drop_duplicates(inplace=True)

2. Drop irrelevant columns

In [65]:
credit_df.drop(['loan_int_rate'],axis=1,inplace=True)

3. Check missing values

In [66]:
credit_df.isnull().any()

person_age                    False
person_income                 False
person_home_ownership         False
person_emp_length              True
loan_intent                   False
loan_grade                    False
loan_amnt                     False
loan_status                   False
loan_percent_income           False
cb_person_default_on_file     False
cb_person_cred_hist_length    False
dtype: bool

In [67]:
# Replace missing values
credit_df.isna().sum()

person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length             887
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_status                     0
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
dtype: int64

4. Data preparation for training

In [68]:
X, X_test, y, y_test = train_test_split(
    credit_df.drop('loan_status', axis=1), 
    credit_df['loan_status'],
    random_state=0,  
    test_size=0.2, 
    stratify=credit_df['loan_status'],
    shuffle=True
)

In [69]:
# Checking missing values in train data
(X.shape[0]-X.dropna().shape[0])/X.shape[0]*100

2.799629801018047

In [45]:
X = X.loc[X['person_age']<80, :]

In [46]:
X = X.loc[X['person_emp_length']<60, :]

5. Generate Pipeline for Training

In [70]:
num_pipe = Pipeline([
    ('impute', IterativeImputer()),     #MICE (Multivariate Imputation by Chained Equations)
    ('scale', StandardScaler()),
])

In [71]:
ct = ColumnTransformer([
    ('num_pipe', num_pipe, make_column_selector(dtype_include=np.number)),
    ('cat_cols', OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include=object))
], remainder='passthrough')

In [72]:
grid = {
    LGBMClassifier(class_weight='balanced', random_state=0): 
    {
        'model__n_estimators':[400],
        'model__learning_rate':[0.01],
        'model__boosting_type': ['gbdt'],
    }   
}

In [73]:
full_df = pd.DataFrame()
best_algos = {}

## Changing the ordering of the columns for ease of understanding:
X=X[X.select_dtypes(include=[np.number]).columns.append(X.select_dtypes("O").columns)]

for model, param in grid.items():
    ## The smote object:
    smt = SMOTE(random_state=42)
    
    pipe = Pipeline([
    ('coltf', ct),     #ct for the column transformer for preprocessing
    ('smote', smt),
    ('model', model)
    ])
    
    print(f"Training {model}!!\n")
    ## Conducting a Randomized Search to find the best optimal hyperparamaters:
    gs = RandomizedSearchCV(estimator=pipe, param_distributions=param, scoring='accuracy',verbose=3, n_iter=4, random_state=0)
    
    print("Fitting!!\n")
    gs.fit(X, y)
    
    print("Gathering Results!!\n")
    all_res = pd.DataFrame(gs.cv_results_)

    temp = all_res.loc[:, ['params', 'mean_test_score']]
    algo_name = str(model).split('(')[0]
    temp['algo'] = algo_name
    
    full_df = pd.concat([full_df, temp], ignore_index=True)
    best_algos[algo_name] = gs.best_estimator_

Training LGBMClassifier(class_weight='balanced', random_state=0)!!

Fitting!!

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END model__boosting_type=gbdt, model__learning_rate=0.01, model__n_estimators=400;, score=0.935 total time=   4.7s
[CV 2/5] END model__boosting_type=gbdt, model__learning_rate=0.01, model__n_estimators=400;, score=0.929 total time=   5.1s
[CV 3/5] END model__boosting_type=gbdt, model__learning_rate=0.01, model__n_estimators=400;, score=0.932 total time=   4.7s
[CV 4/5] END model__boosting_type=gbdt, model__learning_rate=0.01, model__n_estimators=400;, score=0.933 total time=   2.9s
[CV 5/5] END model__boosting_type=gbdt, model__learning_rate=0.01, model__n_estimators=400;, score=0.933 total time=   2.3s
Gathering Results!!



In [74]:
be_lgb = best_algos['LGBMClassifier']

In [75]:
%%time
## A dry run of the best pipeline:
pipe_lgb = be_lgb
# evaluate pipeline using k-fold cross validation:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scores = cross_val_score(pipe_lgb, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(f"The MEAN of score obtained after CROSS VALIDATION of the LGBM Based Pipeline is: {scores.mean()} or {scores.mean()*100:.2f}%")

The MEAN of score obtained after CROSS VALIDATION of the LGBM Based Pipeline is: 0.932104562532638 or 93.21%
CPU times: total: 344 ms
Wall time: 22.8 s


In [76]:
%%time
## Fitting into best pipeline for evaluation:
pipe_lgb.fit(X, y)
## Getting predictions:
preds_lgb = pipe_lgb.predict(X_test)
## Getting probabilities:
probs_lgb = pipe_lgb.predict_proba(X_test)
## Accuracy Score:
print(f"The ACCURACY SCORE produced on the TEST SET by the LGBM Based Pipeline is: {accuracy_score(y_test,preds_lgb)} or {accuracy_score(y_test,preds_lgb)*100}%.")

The ACCURACY SCORE produced on the TEST SET by the LGBM Based Pipeline is: 0.9285934608266502 or 92.85934608266501%.
CPU times: total: 13 s
Wall time: 4.93 s


6. Deploy application to Streamlit?

In [77]:
# Save the ML Pipeline:
import joblib
joblib.dump(pipe_lgb, 'best_pipeline.pkl')

['best_pipeline.pkl']

In [None]:
import streamlit as st
import numpy as np
import pandas as pd
from PIL import Image
import sklearn
import joblib
import json

## Side Tab:
l=["Introduction","Predict your Credit Score"]
st.sidebar.subheader("Here's what you can do:")
option=st.sidebar.selectbox("Choose what you want to do:",l)

def page_1():
    ## Intro Tab::
    image = Image.open('Credit_Risk.jpg')

    ## Displaying the image:
    st.image(image,use_column_width="always")

    ## Headers:
    st.title("Welcome to this Mock Credit Risk Simulator")
    st.header("Here's the drill. You get me whatever I need and I predict whether you are eligible or not. DEAL!")
    st.subheader("Let's get started...")

def page_2():
    data={}
    ## Details Tab:
    st.header("Gimme your details and I will deliver magic!")

    #Full Name:
    first,last=st.columns(2)
    first=first.text_input("Enter your First Name:")
    last=last.text_input("Enter your Last Name:")
    data["First Name"]=first
    data["Last Name"]=last

    name=first+" "+last
    data["Full Name"]=name

    ##Age:
    age=st.slider("Enter your Age:",10,70)
    data["Age"]=age

    ##Annual Income:
    ai=st.number_input("Enter your Annual Income:",1000,100000)
    data["Annual Income"]=ai

    ##Home Ownership:
    ho=st.selectbox("What is the type of House Ownership:", ["RENT", "OWN", "MORTGAGE","OTHER"])
    data["Home Ownership"]=ho

    ##Employment Length:
    el=st.number_input("Enter your Work Experience in years:",2,50)
    data["Employment Length"]=el

    ##Loan Intent:
    li=st.selectbox("Why do you want a loan?", ['EDUCATION', 'MEDICAL', 'VENTURE', 'PERSONAL', 'DEBTCONSOLIDATION',
                                                'HOMEIMPROVEMENT'])
    data["Loan Intent"]=li
    ##Loan Grade:
    lg=st.selectbox("Grade of Loan expected?", ['A', 'B', 'C', 'D', 'E', 'F', 'G'])
    data["Loan Grade"]=lg

    ## Loan Amount:
    la=st.number_input("Enter your Work Experience in years:",100,50000)
    data["Loan Amount"]=la

    ## loan_percent_income:
    lpi=st.number_input("Enter your % Income to be used for repaying:",0,100)
    data["Loan Percent Income"]=lpi

    ## cb_person_default_on_file:
    def_his=st.selectbox("Have your ever defaulted?",["Y","N"]) 
    data["Previous Defaults"]=def_his

    ## cb_person_cred_hist_length:
    n_def=st.slider("Total Number of Defaults:",0,50)
    data["Number of Defaults"]=n_def

    ## Make a submit button:
    data_display=json.dumps(data)
    temp=pd.DataFrame(data,index=[0])  ## making a record

    ## Display the input data as a json:
    if st.button("Display Data",key = 8)==1:
        st.write("The data in JSON Format:")
        st.write(data_display)        
        st.write("\nThe data in Tabular Format:")
        st.write(temp)   
 
    ## Display the prediction:
    if st.button("Predict Credit Score",key = 9)==1:
        ## Order of passing the data into the pipeline:
        cols=['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_percent_income', 'cb_person_cred_hist_length',
       'person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file']  ## List of columns of the original dataframe
                
        input_data=[[data["Age"],data["Annual Income"],data["Employment Length"],data["Loan Amount"],
                     round(data["Loan Percent Income"]/100,2),data["Number of Defaults"],
                     data["Home Ownership"],data["Loan Intent"],data["Loan Grade"],data["Previous Defaults"]]]
        
        pipe=joblib.load('best_pipeline.pkl')  ## Loading the pipeline
        
        input_data=pd.DataFrame(input_data,columns=cols)  ## Converting input into a dataframe with respective columns

        res=pipe.predict(input_data)[0]  ## Predicting the class
        out={1:"The Customer is capable of DEFAULTING. Hence it is RISKY to provide loan!", 0:"The Customer is capable of NOT DEFAULTING. Hence it is POSSIBLE to provide loan!"}
        st.write(f"The Final Verdict obtained from the given model is that : {out[res]}")
if option==l[0]:
    page_1()

if option==l[1]:
    page_2()
