In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, RobustScaler
import joblib
# https://www.analyticsvidhya.com/blog/2023/02/how-to-save-and-load-machine-learning-models-in-python-using-joblib-library/
import tkinter as tk
from tkinter import filedialog, messagebox
import shap
# https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27
from PIL import Image, ImageTk
import io
from sklearn.tree import plot_tree

import warnings
warnings.filterwarnings('ignore')

## Model Evaluation

In [3]:
# File names of the persisted artifacts (relative to the working directory).
MODEL_FILE = "stacking_model.pkl"
EXPECTED_FEATURES_FILE = "expected_feature.pkl"

# NOTE: joblib.load unpickles the file contents, so only load artifacts
# that come from a trusted source.
model, expected_features = (
    joblib.load(artifact_file)
    for artifact_file in (MODEL_FILE, EXPECTED_FEATURES_FILE)
)

In [4]:
def cleaning(data):
    """Turn a raw uploaded loan DataFrame into the model's feature matrix.

    Steps: map ``addr_state`` codes to a region, convert ``term`` to a number
    of months, keep the fixed list of model input columns, add ratio/product
    and binned features, one-hot encode the non-numeric columns, align the
    result to the module-level ``expected_features`` and scale it with a
    ``RobustScaler``.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Uploaded records. Must contain ``addr_state`` and every column named
        in the selection list below (including ``mock_id`` and ``target``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The scaled feature frame whose columns are ``expected_features``
        (remaining NaNs replaced by 0) and the ``mock_id`` values. Both use a
        fresh 0..n-1 index, so row ``i`` of the features belongs to
        ``mock_id[i]``.

    Raises
    ------
    KeyError
        If ``addr_state`` or one of the selected input columns is missing.
    """
    state_mapping = {
        'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AZ': 'Arizona', 'CA': 'California',
        'CO': 'Colorado', 'CT': 'Connecticut', 'DC': 'District of Columbia', 'DE': 'Delaware',
        'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'IA': 'Iowa', 'ID': 'Idaho',
        'IL': 'Illinois', 'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana',
        'MA': 'Massachusetts', 'MD': 'Maryland', 'ME': 'Maine', 'MI': 'Michigan', 'MN': 'Minnesota',
        'MO': 'Missouri', 'MS': 'Mississippi', 'MT': 'Montana', 'NC': 'North Carolina', 'ND': 'North Dakota',
        'NE': 'Nebraska', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NV': 'Nevada',
        'NY': 'New York', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania',
        'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee',
        'TX': 'Texas', 'UT': 'Utah', 'VA': 'Virginia', 'VT': 'Vermont', 'WA': 'Washington',
        'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming'
    }

    region_mapping = {
        'Alaska': 'West', 'Alabama': 'SouthEast', 'Arkansas': 'SouthEast', 'Arizona': 'SouthWest',
        'California': 'West', 'Colorado': 'West', 'Connecticut': 'NorthEast',
        'District of Columbia': 'SouthEast', 'Delaware': 'SouthEast', 'Florida': 'SouthEast',
        'Georgia': 'SouthEast', 'Hawaii': 'West', 'Iowa': 'MidWest', 'Idaho': 'West',
        'Illinois': 'MidWest', 'Indiana': 'MidWest', 'Kansas': 'MidWest', 'Kentucky': 'SouthEast',
        'Louisiana': 'SouthEast', 'Massachusetts': 'NorthEast', 'Maryland': 'NorthEast',
        'Maine': 'NorthEast', 'Michigan': 'MidWest', 'Minnesota': 'MidWest', 'Missouri': 'MidWest',
        'Mississippi': 'SouthEast', 'Montana': 'West', 'North Carolina': 'SouthEast',
        'North Dakota': 'MidWest', 'Nebraska': 'MidWest', 'New Hampshire': 'NorthEast',
        'New Jersey': 'NorthEast', 'New Mexico': 'SouthWest', 'Nevada': 'West', 'New York': 'NorthEast',
        'Ohio': 'MidWest', 'Oklahoma': 'SouthWest', 'Oregon': 'West', 'Pennsylvania': 'NorthEast',
        'Rhode Island': 'NorthEast', 'South Carolina': 'SouthEast', 'South Dakota': 'MidWest',
        'Tennessee': 'SouthEast', 'Texas': 'SouthWest', 'Utah': 'West', 'Virginia': 'SouthEast',
        'Vermont': 'NorthEast', 'Washington': 'West', 'Wisconsin': 'MidWest', 'West Virginia': 'SouthEast',
        'Wyoming': 'West'
    }

    # Work on a copy so the caller's DataFrame is left untouched.
    data = data.copy()

    # mapping state code to the state name then to region; codes that are not
    # in the mapping end up as NaN
    data['region'] = data['addr_state'].map(state_mapping).map(region_mapping)
    data = data.drop(columns='addr_state')

    # ' 36 months' / ' 60 months' -> 36 / 60; any other value becomes NaN
    data['term'] = data['term'].map({' 36 months': 36, ' 60 months': 60})

    # sample data - with features similar to the features used in model development stage
    # (.copy() so the new columns below are written to an independent frame,
    # not to a slice of the upload)
    data = data[['mock_id','loan_amnt', 'term', 'int_rate', 'installment', 'home_ownership',
    'annual_inc', 'verification_status', 'purpose', 'dti', 'delinq_2yrs',
    'fico_range_low', 'inq_last_6mths', 'mths_since_last_delinq', 'open_acc',
    'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status',
    'application_type', 'mths_since_rcnt_il', 'total_bal_il', 'il_util',
    'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim',
    'inq_fi', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths',
    'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',
    'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
    'mort_acc', 'mths_since_recent_bc', 'num_actv_bc_tl', 'num_actv_rev_tl',
    'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts',
    'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tax_liens',
    'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit', 'region', 'target']].copy()
    data = data.reset_index(drop=True)

    # Keep the IDs aside before encoding: a non-numeric mock_id would otherwise
    # be one-hot encoded and no longer be available as a column.
    mock_id = data['mock_id']

    def divide(var_1, var_2):
        """Element-wise var_1 / var_2, with 0 wherever var_2 equals 0."""
        return np.where(var_2 == 0, 0, var_1 / var_2)

    data['annual_inc_installment'] = divide(data['annual_inc'], data['installment'])
    data['revol_bal_annual_inc'] = divide(data['revol_bal'], data['annual_inc'])
    data['annual_inc_dti'] = data['annual_inc'] * data['dti']
    data['loan_amnt_dti'] = data['loan_amnt'] * data['dti']

    bins = [0, 20000, 50000, 100000, 200000, np.inf]
    labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
    for source_column in ('annual_inc', 'revol_bal'):
        binned = pd.cut(data[source_column], bins=bins, labels=labels)
        # Values outside the bins (including exactly 0) and missing values get
        # the explicit 'none' category. Assigning the result, instead of a
        # chained fillna(inplace=True), also works under copy-on-write.
        data[f'{source_column}_binned'] = binned.cat.add_categories('none').fillna('none')

    data = data.drop(columns=['installment'])

    # one-hot encoding
    features_to_encode = [
        column
        for column in data.select_dtypes(exclude='number').columns
        if column != 'mock_id'
    ]

    # Encode every category and let the alignment to expected_features below
    # decide which indicator columns are kept. Dropping the first category of
    # the *uploaded* data would drop a different baseline than at training time
    # whenever the upload contains a different set of categories.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only knows the former parameter name
        encoder = OneHotEncoder(sparse=False)
    encoded_features = encoder.fit_transform(data[features_to_encode])
    encoded_data = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(features_to_encode),
    )

    data = data.drop(columns=features_to_encode).reset_index(drop=True)
    data = pd.concat([data, encoded_data], axis=1)

    # Align to the training columns: indicator columns for categories that do
    # not occur in this upload are all zeros, extra columns are discarded.
    expected_columns = list(expected_features)
    data = data.reindex(columns=expected_columns, fill_value=0)

    # NOTE(review): the scaler is fitted on the uploaded data, not on the
    # training data, so scaled values depend on the composition of the upload
    # (a single-row upload is centred to all zeros). Persisting and loading the
    # training scaler would remove this train/serve mismatch.
    scaler = RobustScaler()
    data = pd.DataFrame(scaler.fit_transform(data), columns=expected_columns)

    return data.fillna(0), mock_id

In [5]:
def upload_file():
    """Ask for a CSV file, score it and show predictions and explanations.

    Reads the chosen CSV, prepares it with ``cleaning``, predicts with the
    loaded ``model`` and writes the first 25 predictions into ``result_text``.
    It then explains a random sample of up to 20 rows with SHAP (using the
    stacked model's 'XGBoost' estimator), shows the SHAP summary plot, the
    stacked model's 'Decision Tree' estimator and the per-row SHAP values in
    separate windows.

    Nothing happens if the file dialog is cancelled. Any exception raised while
    processing is shown in an error dialog instead of being propagated.
    """
    file_path = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
    if not file_path:
        return

    try:
        result_text.delete(1.0, tk.END)
        uploaded_data = pd.read_csv(file_path)

        # data preparation
        data, mock_id = cleaning(uploaded_data)
        predictions = model.predict(data)
        results = pd.DataFrame({'mock_id': mock_id, 'prediction': predictions})

        # add the status column
        results['status'] = results['prediction'].map({0: 'Non-Default', 1: 'Default'})

        result_text.insert(tk.END, "Predictions:\n")
        result_text.insert(tk.END, results.head(25).to_string(index=False))

        # DataFrame.sample raises when asked for more rows than exist, so cap
        # the sample size for uploads with fewer than 20 records.
        sample_size = min(20, len(data))
        data_sample = data.sample(sample_size, random_state=42)

        # SHAP analysis
        xgboost_model = model.named_estimators_['XGBoost']
        explainer_xgb = shap.TreeExplainer(xgboost_model)
        shap_values_xgb = explainer_xgb.shap_values(data_sample)

        # summary plot
        summary_plot_path = "shap_summary_plot.png"
        shap.summary_plot(shap_values_xgb, data_sample, show=False)
        try:
            plt.savefig(summary_plot_path)
        finally:
            # close the figure even if saving fails, so it does not linger
            plt.close()
        shap_plot(summary_plot_path, "SHAP Summary Plot")

        # show a simple decision tree (the one in the stacked model)
        decision_tree(model.named_estimators_['Decision Tree'], list(data.columns))

        # individual SHAP explanations
        individual_shap_explanations(data_sample, shap_values_xgb, mock_id)

    except Exception as e:
        messagebox.showerror("Error", str(e))

def individual_shap_explanations(data_sample, shap_values, mock_id):
    """Show the per-row SHAP contributions of ``data_sample`` in a new window.

    Parameters
    ----------
    data_sample : pandas.DataFrame
        The rows that were explained; its columns are used as feature names.
    shap_values : sequence
        ``shap_values[i]`` holds one contribution per column for the i-th row
        of ``data_sample``.
    mock_id : pandas.Series or sequence
        Record IDs. For a pandas object the ID is looked up by the index label
        of each sampled row; for a plain sequence it is looked up by position.
    """
    explainer_window = tk.Toplevel(root)
    explainer_window.title("Individual SHAP Explanations")

    # Tkinter to display individual explanations
    explainer_text = tk.Text(explainer_window, height=20, width=80)
    explainer_text.pack(pady=20)

    row_count = len(data_sample)

    # data_sample is a random sample, so its rows keep their original index
    # labels. Looking IDs up by position (0, 1, 2, ...) would show the IDs of
    # the first rows of the upload instead of the IDs of the sampled rows.
    if hasattr(mock_id, 'loc'):
        row_ids = [mock_id.loc[label] for label in data_sample.index]
    else:
        row_ids = [mock_id[position] for position in range(row_count)]

    # Predict once for the whole sample instead of twice per row.
    predictions = model.predict(data_sample) if row_count else []

    for position in range(row_count):
        prediction = predictions[position]
        status = 'Default' if prediction == 1 else 'Non-Default'
        lines = [
            f"ID: {row_ids[position]}",
            f"Prediction: {prediction}",
            f"Status: {status}",
            "Feature Contributions:",
        ]
        lines.extend(
            f"{feature}: {value:.4f}"
            for feature, value in zip(data_sample.columns, shap_values[position])
        )
        explanation = "\n".join(lines) + "\n" + "\n" + "-"*50 + "\n"
        explainer_text.insert(tk.END, explanation)

def shap_plot(plot_path, title):
    """Display the image stored at ``plot_path`` in a new window titled ``title``."""
    window = tk.Toplevel(root)
    window.title(title)

    # load the saved plot and wrap it for use in a Tk widget
    photo = ImageTk.PhotoImage(Image.open(plot_path))

    picture = tk.Label(window, image=photo)
    # keep a reference on the widget so the photo stays alive while it is shown
    picture.image = photo
    picture.pack()

def decision_tree(tree_model, feature_names):
    """Draw ``tree_model`` with ``plot_tree`` and show it in a new window.

    Parameters
    ----------
    tree_model
        Fitted tree estimator passed to ``sklearn.tree.plot_tree``.
    feature_names : list of str
        Names used to label the split features in the plot.

    The tree is rendered into an in-memory PNG first. The window is only
    created once rendering succeeded, and the matplotlib figure is closed even
    when rendering raises, so a failure leaves neither an empty window nor an
    open figure behind.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        plot_tree(tree_model, feature_names=feature_names, class_names=['Non-Default', 'Default'],
            filled=True, rounded=True, proportion=True, ax=ax
        )
        fig.tight_layout()

        # render the figure to PNG bytes in memory
        with io.BytesIO() as buf:
            fig.savefig(buf, format='png')
            buf.seek(0)
            tree_image = Image.open(buf)
            # read the pixel data now, before the buffer is closed
            tree_image.load()
    finally:
        plt.close(fig)

    plot_window = tk.Toplevel()
    plot_window.title("Decision Tree")

    # display plots as images
    img = ImageTk.PhotoImage(tree_image)
    img_label = tk.Label(plot_window, image=img)
    # keep a reference on the widget so the photo stays alive while it is shown
    img_label.image = img
    img_label.pack()

In [6]:
# Build the main Tkinter window and run the GUI application
root = tk.Tk()
root.title("Loan Prediction and Analysis")

# introductory text shown at the top of the window
welcome_message = (
    "Welcome to the Loan Prediction UI!\n\n"
    "Kindly have your data ready in a CSV file and upload it using the 'Upload CSV' button.\n\n"
    "The results you will receive include:\n"
    "- User ID\n"
    "- Predictions on whether the borrower would default or not\n"
    "- Status Non-Default or Default\n"
    "- SHAP values to understand why a default might occur\n"
    "- A decision tree to visualise the decision-making process\n\n"
    "Press 'Upload CSV' to get started."
)
info_label = tk.Label(root, text=welcome_message, wraplength=400, justify=tk.LEFT)
info_label.pack(pady=20)

# button for uploading the data; clicking it calls upload_file
upload_button = tk.Button(root, text="Upload CSV", command=upload_file)
upload_button.pack(pady=20)

# text area in which upload_file displays the result
result_text = tk.Text(root, height=10, width=80)
result_text.pack(pady=20)

# hand control to Tk's event loop; returns once the window is closed
root.mainloop()