In [1]:
# Importing the libraries
import lightningchart as lc
import random
lc.set_license('my-license-key')

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier
from scipy.stats import probplot
from feature_engine.outliers import Winsorizer
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import BorderlineSMOTE
from collections import Counter
from yellowbrick.classifier import ClassPredictionError
from feature_engine.selection import DropCorrelatedFeatures
import warnings, gc

warnings.filterwarnings("ignore")
gc.enable()  # Enabling garbage collection to manage memory during large data operations.

# Ensuring to install feature_engine correctly.
!pip install feature_engine




In [2]:
# Loading the dataset
data = pd.read_csv("./credit_risk_dataset.csv")
df=data.copy()
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
# Removimg duplicate and handling missing data
df = df.drop_duplicates()
df = df.dropna()
df.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [4]:
# Initial classification based on data type
def grab_col_names(dataframe, cat_th=10, car_th=20):

    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]

    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                dataframe[col].dtypes != "O"]

    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                dataframe[col].dtypes == "O"]

    # Updating categorical columns list
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # Defining numerical columns excluding numeric but categorical
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, cat_but_car, num_cols

cat_cols, cat_but_car, num_cols = grab_col_names(df)

Observations: 28501
Variables: 12
cat_cols: 5
num_cols: 7
cat_but_car: 0
num_but_cat: 1


In [5]:
def high_correlated_cols(dataframe, display_table=False, corr_th=0.70):
    # Selecting only the numeric columns from the DataFrame
    numeric_dataframe = dataframe.select_dtypes(include=['number'])
    
    # Calculating the absolute correlation matrix
    corr = numeric_dataframe.corr().abs()
    
    # Create an upper triangle matrix to identify high correlations
    upper_triangle_matrix = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    drop_list = [col for col in upper_triangle_matrix.columns if any(upper_triangle_matrix[col] > corr_th)]
    
    if display_table:
        # Displaying the correlation matrix
        print("Correlation Matrix:")
        display(corr)  # This uses IPython.display.display to show the DataFrame in Jupyter
        
    return drop_list

drop_list = high_correlated_cols(df, display_table=True)

Correlation Matrix:


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
person_age,1.0,0.178987,0.165624,0.054246,0.01017,0.024091,0.040782,0.859621
person_income,0.178987,1.0,0.136427,0.264942,0.001346,0.140456,0.251487,0.116622
person_emp_length,0.165624,0.136427,1.0,0.110934,0.056607,0.082852,0.055033,0.146486
loan_amnt,0.054246,0.264942,0.110934,1.0,0.146026,0.114153,0.577708,0.045334
loan_int_rate,0.01017,0.001346,0.056607,0.146026,1.0,0.339995,0.123441,0.014562
loan_status,0.024091,0.140456,0.082852,0.114153,0.339995,1.0,0.38,0.016559
loan_percent_income,0.040782,0.251487,0.055033,0.577708,0.123441,0.38,1.0,0.02969
cb_person_cred_hist_length,0.859621,0.116622,0.146486,0.045334,0.014562,0.016559,0.02969,1.0


In [6]:
# Preparing data for the PieChart
target = df.loan_status.value_counts(normalize=True) * 100
target.rename(index={1: 'Default', 0: 'Non-default'}, inplace=True)

# Initializing and configuring the PieChart
chart = lc.PieChart(
    labels_inside_slices=False,
    title='Target Distribution',
    theme=lc.Themes.White
)

# Adding a legend to the chart
legend = chart.add_legend()

# Adding slices to the chart
for label, value in target.items():
    pie_slice = chart.add_slice(label, value)

# Adding the entire chart
legend.add(chart)

# Setting an inner radius to create a donut-like appearance
chart.set_inner_radius(50)

# Opening the chart window
chart.open()

In [7]:
# Calculating the quantiles at specified levels
def outlier_thresholds(dataframe, variable, low_quantile=0.10, up_quantile=0.90):
    quantile_one = dataframe[variable].quantile(low_quantile)
    quantile_three = dataframe[variable].quantile(up_quantile)
    interquantile_range = quantile_three - quantile_one
    up_limit = quantile_three + 1.5 * interquantile_range
    low_limit = quantile_one - 1.5 * interquantile_range
    return low_limit, up_limit

# Obtaining the outlier thresholds
def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False
for col in num_cols:
    if col != "loan_status":
        print(col, check_outlier(df, col))

person_age True
person_income True
person_emp_length True
loan_amnt False
loan_int_rate False
loan_percent_income True
cb_person_cred_hist_length True


In [8]:
# Calculating the lower and upper limits for outliers
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit


for col in num_cols:
    if col != "loan_status":
        replace_with_thresholds(df,col)

In [9]:
# Preparing data for the Bar Chart
# Adding age range
bins = range(df['person_age'].min(), 80, 5) 

# Filtering data for good loan status
df_good = df[df["loan_status"] == 0]['person_age']

# Defining bins
bins = range(df['person_age'].min(), df['person_age'].max() + 2, 5)

# Calculating histogram
hist_good, bins_good = np.histogram(df_good, bins=bins)

# Converting numpy array to Python list
hist_good = hist_good.astype(int).tolist()

# Creating a BarChart
chart_good = lc.BarChart(vertical=True)
chart_good.set_title("Good Loan Status Age Distribution")

# Filtering out categories and values with zero counts
filtered_categories = []
filtered_values = []

for i in range(len(hist_good)):
    if hist_good[i] > 0:
        category_label = f"{bins_good[i]}-{bins_good[i+1]}"
        filtered_categories.append(category_label)
        filtered_values.append(hist_good[i])

# Preparing data for the chart
data_good = [{'category': filtered_categories[i], 'value': filtered_values[i]} for i in range(len(filtered_categories))]
chart_good.set_data(data_good)

# Opening the chart
chart_good.open()

In [10]:
# Preparing data for the Bar Chart
# Adding age range
df_filtered = df[df['person_age'] <= 80]

# Filtering data for bad loan status
df_bad = df[df["loan_status"] == 1]['person_age']

# Calculating histogram
hist_bad, bins_bad = np.histogram(df_bad, bins=bins)

# Converting numpy array to Python list
hist_bad = hist_bad.astype(int).tolist()

# Creating a BarChart
chart_bad = lc.BarChart(vertical=True)
chart_bad.set_title("Bad Loan Status Age Distribution")

# Adding bars to chart
categories = [f"{bins_bad[i]}-{bins_bad[i+1]}" for i in range(len(bins_bad)-1)]
values = [hist_bad[i] for i in range(len(hist_bad))]
data_bad = [{'category': categories[i], 'value': values[i]} for i in range(len(categories))]
chart_bad.set_data(data_bad)

# Opening the chart
chart_bad.open()

In [11]:
# Preparing data for the Bar Chart
# Adding age range
df_filtered = df[df['person_age'] <= 80]

# Recalculating the histogram:
hist_filtered, bins_filtered = np.histogram(df_filtered['person_age'], bins=range(df_filtered['person_age'].min(), df_filtered['person_age'].max() + 5, 5))

# excluding bins when displaying the histogram
bins = range(df['person_age'].min(), 80, 5) 
hist_age, bins_age = np.histogram(df['person_age'], bins=bins)

# Converting numpy array to Python list
hist_age = hist_age.astype(int).tolist()

# Creating a BarChart
chart_age = lc.BarChart(vertical=True)
chart_age.set_title("Overall Age Distribution")

# Adding bars to chart
categories = [f"{bins_age[i]}-{bins_age[i+1]}" for i in range(len(bins_age)-1) if hist_age[i] > 0]
values = [hist_age[i] for i in range(len(hist_age)) if hist_age[i] > 0]
data_age = [{'category': categories[i], 'value': values[i]} for i in range(len(categories))]
chart_age.set_data(data_age)

# Opening the chart
chart_age.open()


In [12]:
# Preparing data for Stacked Bar Chart
# Defining bins for age groups
bins = range(df['person_age'].min(), df['person_age'].max() + 1, 5)

# Histogram data preparation for three categories
histograms = []
categories = [f"{bins[i]}-{bins[i+1]}" for i in range(len(bins)-1)]

# 'Good' loan status (Non-Default)
df_good = df[df["loan_status"] == 0]['person_age']
hist_good, _ = np.histogram(df_good, bins=bins)
hist_good = hist_good.astype(int).tolist()

# 'Bad' loan status (Default)
df_bad = df[df["loan_status"] == 1]['person_age']
hist_bad, _ = np.histogram(df_bad, bins=bins)
hist_bad = hist_bad.astype(int).tolist()

# 'Overall' loan status
hist_overall, _ = np.histogram(df['person_age'], bins=bins)
hist_overall = hist_overall.astype(int).tolist()

# Creating a BarChart with stacked data using LightningChart
chart = lc.BarChart(
    vertical=True, 
    theme=lc.Themes.White, 
    title='Age Distribution by Loan Status'
)
chart.set_data_stacked(
    categories,
    [
        {'subCategory': 'Good Loan Status', 'values': hist_good},
        {'subCategory': 'Bad Loan Status', 'values': hist_bad},
        {'subCategory': 'Overall Loan Status', 'values': hist_overall}
    ]
)
chart.set_value_label_display_mode('hidden')
chart.add_legend().add(chart)
chart.open()


In [13]:
# Preparing data for the Bar Chart
# Calculating the counts for each home ownership category for each loan status
home_ownership_counts = df.groupby(['person_home_ownership', 'loan_status']).size().unstack(fill_value=0)

# Ensuring all categories are present for both loan statuses and fill missing ones with zero
home_ownership_counts = home_ownership_counts.reindex(columns=[0, 1], fill_value=0)

# Extracting categories and values
categories = home_ownership_counts.index.tolist()
loan_status_0_values = home_ownership_counts[0].tolist()  # Loan status = 0 counts
loan_status_1_values = home_ownership_counts[1].tolist()  # Loan status = 1 counts

# Printing the calculated values for verification
print("Categories:", categories)
print("Loan Status = 0 Values:", loan_status_0_values)
print("Loan Status = 1 Values:", loan_status_1_values)

# Setting up the LightningChart environment
chart = lc.BarChart(vertical=True)
chart.set_title("Housing Distribution by Loan Status")

# Preparing grouped data
grouped_data = [
    {'name': 'Loan Status = Default', 'values': loan_status_1_values},
    {'name': 'Loan Status = Non-default', 'values': loan_status_0_values}
]

# Setting grouped data on the chart
chart.set_data_grouped(
    categories,
    [
        {'subCategory': grouped_data[0]['name'], 'values': grouped_data[0]['values']},
        {'subCategory': grouped_data[1]['name'], 'values': grouped_data[1]['values']}
    ]
)

# Adding a legend to the chart
legend = chart.add_legend()
legend.add(chart)  

# Opening the chart
chart.open()


Categories: ['MORTGAGE', 'OTHER', 'OWN', 'RENT']
Loan Status = 0 Values: [10253, 66, 2029, 9965]
Loan Status = 1 Values: [1483, 27, 145, 4533]


In [14]:
# Preparing data for the Bar Chart
# Calculating the counts for each loan grade category for each loan status
loan_grade_counts = df.groupby(['loan_grade', 'loan_status']).size().unstack(fill_value=0)

# Ensuring all categories are present for both loan statuses and fill missing ones with zero
loan_grade_counts = loan_grade_counts.reindex(columns=[0, 1], fill_value=0)

# Sorting the index to maintain a consistent order (A, B, C, ..., G)
loan_grade_counts.sort_index(inplace=True)

# Printing the calculated values for verification
categories = loan_grade_counts.index.tolist()
status_1_values = loan_grade_counts[1].tolist()  # Loan status = 1 counts
status_0_values = loan_grade_counts[0].tolist()  # Loan status = 0 counts

print("Loan Grade Categories:", categories)
print("Loan Status = 1 Values:", status_1_values)
print("Loan Status = 0 Values:", status_0_values)

# Setting up the LightningChart environment
chart = lc.BarChart(vertical=True)
chart.set_title("Loan Grade Distribution")

# Preparing grouped data
grouped_data = [
    {'name': 'Loan Status = Default', 'values': status_1_values},
    {'name': 'Loan Status = Non-default', 'values': status_0_values}
]

# Setting grouped data on the chart
chart.set_data_grouped(
    categories,
    [
        {'subCategory': grouped_data[0]['name'], 'values': grouped_data[0]['values']},
        {'subCategory': grouped_data[1]['name'], 'values': grouped_data[1]['values']},
    ],
)

# Adding a legend to the chart
legend = chart.add_legend()
legend.add(chart)  

# Opening the chart
chart.open()


Loan Grade Categories: ['A', 'B', 'C', 'D', 'E', 'F', 'G']
Loan Status = 1 Values: [898, 1448, 1155, 1921, 562, 146, 58]
Loan Status = 0 Values: [8447, 7646, 4527, 1322, 307, 63, 1]


In [15]:
# Preparing data for the Bar Chart
# Calculating the counts for each loan intent category for each loan status
intent_counts = df.groupby(['loan_intent', 'loan_status']).size().unstack(fill_value=0)

# Printing the calculated values for verification
categories = intent_counts.index.tolist()
status_1_values = intent_counts[1].tolist()
status_0_values = intent_counts[0].tolist()

print("Loan Intent Categories:", categories)
print("Loan Status = 1 Values:", status_1_values)
print("Loan Status = 0 Values:", status_0_values)

# Setting up the LightningChart environment
chart = lc.BarChart(vertical=True)
chart.set_title("Loan Intent by Status")

# Preparing grouped data
grouped_data = [
    {'name': 'Loan Status = Default', 'values': status_1_values},
    {'name': 'Loan Status = Non-default', 'values': status_0_values}
]

# Setting grouped data on the chart
chart.set_data_grouped(
    categories,  
    [
        {'subCategory': grouped_data[0]['name'], 'values': grouped_data[0]['values']},
        {'subCategory': grouped_data[1]['name'], 'values': grouped_data[1]['values']}
    ]
)

# Adding a legend to the chart
legend = chart.add_legend()
legend.add(chart)  

# Opening the chart
chart.open()


Loan Intent Categories: ['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL', 'VENTURE']
Loan Status = 1 Values: [1294, 967, 820, 1418, 961, 728]
Loan Status = 0 Values: [3253, 4703, 2367, 3851, 3898, 4241]


In [16]:
# Segmenting 'person_income' into categorized groups for easier analysis
df['income_group'] = pd.cut(df['person_income'],
                            bins=[0, 25000, 50000, 75000, 100000, float('inf')],
                            labels=['low', 'low-middle', 'middle', 'high-middle', 'high'])

In [17]:
# Copying the original DataFrame to preserve the original data.
dfx=df.copy()

# Identifying categorical columns, categorical columns with many unique values, and numerical columns.
cat_cols, cat_but_car, num_cols = grab_col_names(dfx)

# Removing 'loan_status' from the list of categorical columns to prevent it from being encoded.
cat_cols.remove("loan_status")


def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first,dtype=int)
    return dataframe

# Converting categorical variables into dummy/indicator variables
dfx = one_hot_encoder(dfx, cat_cols, drop_first=True)

Observations: 28501
Variables: 13
cat_cols: 6
num_cols: 7
cat_but_car: 0
num_but_cat: 2


In [18]:
# Preparing 'X' by dropping 'loan_rate' and setting 'y' by isolating the 'loan_status' column
X = dfx.drop(['loan_status',"person_age","person_income"], axis=1)
y = dfx['loan_status']

In [19]:
# Configuring a preprocessing pipeline
pipeline = Pipeline(steps=[
    ('constant',DropConstantFeatures()),
    ('correlated',DropCorrelatedFeatures()),
    ('duplicate',DropDuplicateFeatures())
])

X = pipeline.fit_transform(X)
X.shape

(28501, 24)

In [20]:
# Initializing the BorderlineSMOTE method to handle class imbalance
smote = BorderlineSMOTE()
X, y = smote.fit_resample(X, y)
print("Final dimensions of target label classes:", Counter(y))

Final dimensions of target label classes: Counter({1: 22313, 0: 22313})


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42,stratify=y)
#scaling variables
scaler = StandardScaler()
#scaler= RobustScaler()
scaled_train_X = scaler.fit_transform(X_train)
scaled_test_X = scaler.transform(X_test)

In [22]:
# Training and evaluating a model
def train_and_evaluate_model(model, model_name, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    # Collecting metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    roc_auc = roc_auc_score(y_test, y_pred, average='macro')
    
    # Printing classification report for debugging
    print(f"Classification Report for {model_name}:")
    print(classification_report(y_test, y_pred))
    
    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC-AUC': roc_auc
    }

# Assuming 'X' and 'y' are already defined elsewhere in the code
results = []
results.append(train_and_evaluate_model(CatBoostClassifier(silent=True), "CatBoost", X, y))
results.append(train_and_evaluate_model(LGBMClassifier(verbose=-1), "LightGBM", X, y))
results.append(train_and_evaluate_model(RandomForestClassifier(), "Random Forest", X, y))
results.append(train_and_evaluate_model(XGBClassifier(), "XGBoost", X, y))
results.append(train_and_evaluate_model(StackingClassifier(estimators=[
    ('ET', ExtraTreesClassifier()),
    ('XGB', XGBClassifier()),
    ('CAT', CatBoostClassifier(silent=True))
], final_estimator=RandomForestClassifier(), verbose=2), "Stacking Classifier", X, y))

# Converting results to DataFrame for display and analysis
df_results = pd.DataFrame(results)
print(df_results)


Classification Report for CatBoost:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      4496
           1       0.98      0.92      0.95      4430

    accuracy                           0.95      8926
   macro avg       0.95      0.95      0.95      8926
weighted avg       0.95      0.95      0.95      8926

Classification Report for LightGBM:
              precision    recall  f1-score   support

           0       0.91      0.98      0.95      4496
           1       0.98      0.91      0.94      4430

    accuracy                           0.94      8926
   macro avg       0.95      0.94      0.94      8926
weighted avg       0.95      0.94      0.94      8926

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      4496
           1       0.97      0.91      0.94      4430

    accuracy                           0.94      8926
   macro avg   

127.0.0.1 - - [17/Jun/2024 12:11:33] "GET / HTTP/1.1" 200 -


127.0.0.1 - - [17/Jun/2024 12:11:34] "GET / HTTP/1.1" 200 -


127.0.0.1 - - [17/Jun/2024 12:11:34] "GET / HTTP/1.1" 200 -


127.0.0.1 - - [17/Jun/2024 12:11:34] "GET / HTTP/1.1" 200 -


127.0.0.1 - - [17/Jun/2024 12:11:34] "GET / HTTP/1.1" 200 -


127.0.0.1 - - [17/Jun/2024 12:11:34] "GET / HTTP/1.1" 200 -


127.0.0.1 - - [17/Jun/2024 12:11:34] "GET / HTTP/1.1" 200 -


127.0.0.1 - - [17/Jun/2024 12:11:35] "GET / HTTP/1.1" 200 -


In [23]:
# Preparing data for the Grouped Bar Chart
# Initializing the BarChart from LightningChart
chart = lc.BarChart(vertical=True, theme=lc.Themes.White, title='Grouped Model Performance Metrics')
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC']
categories = [result['Model'] for result in results]

data_grouped = [
    {'subCategory': metric, 'values': [result[metric] for result in results]}
    for metric in metrics
]

# Setting the grouped data for the bar chart
chart.set_data_grouped(categories, data_grouped)

# Adding a legend to the chart
legend = chart.add_legend()
legend.add(chart)

# Displaying the chart
chart.open()

127.0.0.1 - - [17/Jun/2024 12:25:53] "GET / HTTP/1.1" 200 -


In [24]:
# Preparing data for the Bar Chart
# Assuming 'X' and 'y' are already defined and preprocessed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting the model
model = CatBoostClassifier(silent=True)
model.fit(X_train_scaled, y_train)

# Getting feature importances
feature_importances = model.feature_importances_
features = X.columns

# Creating a DataFrame for easier handling
feature_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

# Setting up the LightningChart environment
chart = lc.BarChart(vertical=True, theme=lc.Themes.White, title="Feature Importances")
chart.set_data([{'category': row['Feature'], 'value': row['Importance']} for index, row in feature_df.iterrows()])

# Adding sorting 
chart.set_sorting('descending')

# Adding a legend to the chart
legend = chart.add_legend()
legend.add(chart)

# Open the chart
chart.open()


127.0.0.1 - - [17/Jun/2024 12:28:00] "GET / HTTP/1.1" 200 -


In [25]:
# Original Dataset Prediction
# Adjusting display settings for DataFrame output
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Loading the data
data = pd.read_csv("./credit_risk_dataset.csv")
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

# Categorical and numerical columns are correctly defined
cat_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
num_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income']

# Encoding categorical data & creating 'income_group'
data['income_group'] = pd.cut(data['person_income'],
                            bins=[0, 25000, 50000, 75000, 100000, float('inf')],
                            labels=['low', 'low-middle', 'middle', 'high-middle', 'high'])
cat_cols.append('income_group')  

data_encoded = pd.get_dummies(data, columns=cat_cols, drop_first=True)

# Scaling numerical features
scaler = StandardScaler()
data_encoded[num_cols] = scaler.fit_transform(data_encoded[num_cols])

# Splitting data into features and target
X = data_encoded.drop('loan_status', axis=1)
y = data_encoded['loan_status']

# Splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Defining and training the CatBoost model
best_model = CatBoostClassifier(silent=True)
best_model.fit(X_train, y_train)  

# Loading new data to make predictions
new_data = pd.read_csv("./credit_risk_dataset.csv")
new_data.drop_duplicates(inplace=True)
new_data.dropna(inplace=True)

# Processing new_data as done with the original data
new_data['income_group'] = pd.cut(new_data['person_income'],
                                bins=[0, 25000, 50000, 75000, 100000, float('inf')],
                                labels=['low', 'low-middle', 'middle', 'high-middle', 'high'])
new_data_encoded = pd.get_dummies(new_data, columns=cat_cols, drop_first=True)
new_data_encoded[num_cols] = scaler.transform(new_data_encoded[num_cols]) 

# Aligning new data columns with the training features
missing_cols = set(X.columns) - set(new_data_encoded.columns)
for c in missing_cols:
    new_data_encoded[c] = 0
new_data_encoded = new_data_encoded[X.columns]  

# Predicting using the trained model
predictions = best_model.predict(new_data_encoded)
new_data['Predictions'] = predictions

# Mapping predictions for clarity
new_data['Predictions'] = new_data['Predictions'].map({0: 'Non-Default', 1: 'Default'})

# Displaying or saving the updated dataset with predictions
print("\n----------Original Dataset Prediction:----------")
print("\n------------------------------------------------")
print(new_data[['person_age', 'person_income', 'loan_intent', 'loan_amnt', 'loan_int_rate', 'Predictions']].head())
print("\n------------------------------------------------")


----------Original Dataset Prediction:----------

------------------------------------------------
   person_age  person_income loan_intent  loan_amnt  loan_int_rate  Predictions
0          22          59000    PERSONAL      35000          16.02      Default
1          21           9600   EDUCATION       1000          11.14  Non-Default
2          25           9600     MEDICAL       5500          12.87      Default
3          23          65500     MEDICAL      35000          15.23      Default
4          24          54400     MEDICAL      35000          14.27      Default

------------------------------------------------


In [26]:
# New Dataset Prediction
# Adjusting display settings for DataFrame output
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Loading the data
data = pd.read_csv("./dataset_for_prediction.csv")
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

# Categorical and numerical columns are correctly defined
cat_cols = ['person_home_ownership', 'loan_intent', 'cb_person_default_on_file']
num_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income']

# Encoding categorical data & creating 'income_group'
data['income_group'] = pd.cut(data['person_income'],
                            bins=[0, 25000, 50000, 75000, 100000, float('inf')],
                            labels=['low', 'low-middle', 'middle', 'high-middle', 'high'])
cat_cols.append('income_group')  

data_encoded = pd.get_dummies(data, columns=cat_cols, drop_first=True)

# Scaling numerical features
scaler = StandardScaler()
data_encoded[num_cols] = scaler.fit_transform(data_encoded[num_cols])

# Splitting data into features and target
X = data_encoded.drop('loan_status', axis=1)
y = data_encoded['loan_status']

# Splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Defining and training the CatBoost model
best_model = CatBoostClassifier(silent=True)
best_model.fit(X_train, y_train)  

# Loading new data to make predictions
new_data = pd.read_csv("./dataset_for_prediction.csv")
new_data.drop_duplicates(inplace=True)
new_data.dropna(inplace=True)

# Processing new_data as done with the original data
new_data['income_group'] = pd.cut(new_data['person_income'],
                                bins=[0, 25000, 50000, 75000, 100000, float('inf')],
                                labels=['low', 'low-middle', 'middle', 'high-middle', 'high'])
new_data_encoded = pd.get_dummies(new_data, columns=cat_cols, drop_first=True)
new_data_encoded[num_cols] = scaler.transform(new_data_encoded[num_cols]) 

# Aligning new data columns with the training features
missing_cols = set(X.columns) - set(new_data_encoded.columns)
for c in missing_cols:
    new_data_encoded[c] = 0
new_data_encoded = new_data_encoded[X.columns]  

# Predicting using the trained model
predictions = best_model.predict(new_data_encoded)
new_data['Predictions'] = predictions

# Mapping predictions for clarity
new_data['Predictions'] = new_data['Predictions'].map({0: 'Non-Default', 1: 'Default'})

# Displaying or saving the updated dataset with predictions
print("\n----------New Dataset Prediction:----------")
print("\n------------------------------------------------")
print(new_data[['person_age', 'person_income', 'loan_intent', 'loan_amnt', 'loan_int_rate', 'Predictions']].head())
print("\n------------------------------------------------")



----------New Dataset Prediction:----------

------------------------------------------------
   person_age  person_income        loan_intent  loan_amnt  loan_int_rate  Predictions
1          26          26004    HOMEIMPROVEMENT      10000          11.14  Non-Default
2          23          33600           PERSONAL       1000          10.36  Non-Default
3          25          33600  DEBTCONSOLIDATION       1000          11.86  Non-Default
4          23          34000            MEDICAL       1000           7.37  Non-Default
5          23          34800            VENTURE       1000          11.99  Non-Default

------------------------------------------------


In [27]:
# Predictions and statistics
# Loading the data
data = pd.read_csv("./credit_risk_dataset.csv")
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

# Mapping loan status directly in the original data for clarity
data['loan_status'] = data['loan_status'].map({0: 'Non-Default', 1: 'Default'})

# Original loan status count
original_status_counts = data['loan_status'].value_counts()
print("Original Dataset Counts:")
print(original_status_counts.to_string())
print("Sum --------->", original_status_counts.sum())

# Defining categorical and numerical columns
base_cat_cols = ['person_home_ownership', 'loan_intent', 'cb_person_default_on_file']
num_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income']

# Adding 'loan_grade' if it exists in the dataset
if 'loan_grade' in data.columns:
    base_cat_cols.append('loan_grade')

# Creating 'income_group'
data['income_group'] = pd.cut(data['person_income'],
                            bins=[0, 25000, 50000, 75000, 100000, float('inf')],
                            labels=['low', 'low-middle', 'middle', 'high-middle', 'high'])
base_cat_cols.append('income_group')

data_encoded = pd.get_dummies(data, columns=base_cat_cols, drop_first=True)

# Scaling numerical features
scaler = StandardScaler()
data_encoded[num_cols] = scaler.fit_transform(data_encoded[num_cols])

# Splitting data into features and target
X = data_encoded.drop('loan_status', axis=1)
y = data_encoded['loan_status']

# Splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Defining and training the CatBoost model
best_model = CatBoostClassifier(silent=True)
best_model.fit(X_train, y_train) 

# Predicting using the trained model on both datasets
original_predictions = best_model.predict(X_test)
new_data = pd.read_csv("./dataset_for_prediction.csv")
new_data.drop_duplicates(inplace=True)
new_data.dropna(inplace=True)

# Processing new_data as done with the original data
new_data['income_group'] = pd.cut(new_data['person_income'],
                                bins=[0, 25000, 50000, 75000, 100000, float('inf')],
                                labels=['low', 'low-middle', 'middle', 'high-middle', 'high'])
new_cat_cols = [col for col in base_cat_cols if col in new_data.columns]

new_data_encoded = pd.get_dummies(new_data, columns=new_cat_cols, drop_first=True)
new_data_encoded[num_cols] = scaler.transform(new_data_encoded[num_cols])  

# Aligning new data columns with the training features
missing_cols = set(X.columns) - set(new_data_encoded.columns)
for c in missing_cols:
    new_data_encoded[c] = 0
new_data_encoded = new_data_encoded[X.columns]  

new_predictions = best_model.predict(new_data_encoded)

# Counting predictions
original_prediction_counts = pd.Series(original_predictions).value_counts()
new_prediction_counts = pd.Series(new_predictions).value_counts()

print("\n------------------------------------------------")
print("\nOriginal Dataset Prediction:")
print(original_prediction_counts.to_string())
print("Sum --------->", original_prediction_counts.sum())
print("\n------------------------------------------------")
print("\nNew Dataset Prediction:")
print(new_prediction_counts.to_string())
print("Sum --------->", new_prediction_counts.sum())

Original Dataset Counts:
loan_status
Non-Default    22313
Default         6188
Sum ---------> 28501

------------------------------------------------

Original Dataset Prediction:
Non-Default    4756
Default         945
Sum ---------> 5701

------------------------------------------------

New Dataset Prediction:
Non-Default    17139
Default         2923
Sum ---------> 20062


In [28]:
# Preparing data for the Bar Chart
# Converting prediction counts to int for JSON serialization
original_non_default_status_count = int(original_status_counts.get('Non-Default', 0))
original_default_status_count = int(original_status_counts.get('Default', 0))
original_non_default_count = int(original_prediction_counts.get('Non-Default', 0))
original_default_count = int(original_prediction_counts.get('Default', 0))
new_non_default_count = int(new_prediction_counts.get('Non-Default', 0))
new_default_count = int(new_prediction_counts.get('Default', 0))

# Initializing the chart
chart = lc.BarChart(vertical=True, theme=lc.Themes.White, title='Credit Risk Predictions Comparison')

# Configuring the data for the chart, ensuring values are native Python integers
chart.set_data_grouped(
    ['1. Original Dataset Count', '2. Original Dataset Prediction', '3. New Dataset Prediction'],
    [
        {'subCategory': 'Non-Default', 'values': [ original_non_default_status_count, original_non_default_count, new_non_default_count]},
        {'subCategory': 'Default', 'values': [original_default_status_count, original_default_count, new_default_count]}
    ]
)

# Sorting the chart
chart.set_sorting('alphabetical')

# Adding a legend to the chart
legend = chart.add_legend().add(chart)

# Opening the chart
chart.open() 

127.0.0.1 - - [17/Jun/2024 12:13:00] "GET / HTTP/1.1" 200 -


In [29]:
# Preparing data for the Box Plot
# Converting prediction counts to int for JSON serialization
original_non_default_status_count = int(original_status_counts.get('Non-Default', 0))
original_default_status_count = int(original_status_counts.get('Default', 0))
original_non_default_count = int(original_prediction_counts.get('Non-Default', 0))
original_default_count = int(original_prediction_counts.get('Default', 0))
new_non_default_count = int(new_prediction_counts.get('Non-Default', 0))
new_default_count = int(new_prediction_counts.get('Default', 0))

# Simulating multiple data points around the main counts
def generate_data_around_count(count, num_points=10, spread=1000):
    return [count + random.randint(-spread, spread) for _ in range(num_points)]

data = [
    generate_data_around_count(original_non_default_status_count),
    generate_data_around_count(original_default_status_count),
    generate_data_around_count(original_non_default_count),
    generate_data_around_count(new_non_default_count),
    generate_data_around_count(original_default_count),
    generate_data_around_count(new_default_count)
]

# Initializing the chart
chart = lc.BoxPlot(
    data=data,
    theme=lc.Themes.White,
    title='Credit Risk Predictions Comparison',
    xlabel='Categories',
    ylabel='Counts'
)

# Opening the chart
chart.open()


127.0.0.1 - - [17/Jun/2024 12:13:00] "GET / HTTP/1.1" 200 -
