In [None]:
!pip install lightgbm imbalanced-learn shap --quiet


In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [None]:
# Load the accepted-loans file; every later cell works on this frame via `df`.
df = pd.read_csv("accepted_2007_to_2018Q4.csv")

# "rejected_2007_to_2018Q4.csv" used to be read into the same `df` right after the
# line above, which discarded the accepted-loans data before it was ever used.
# Nothing in this notebook reads the rejected file, so it is no longer loaded here.
# NOTE(review): if it is needed, load it under its own name (e.g. `df_rejected`)
# so it cannot clobber `df`.

In [None]:
# Preprocessing: select columns, drop incomplete rows, build the binary target,
# and integer-encode the remaining text columns.
#
# This single cell replaces three copies of the same step. The first two indexed
# the frame with the string "columns_to_keep" instead of the list and raised
# KeyError, so only the last copy ever ran.

# Feature columns plus the target column (`loan_status`).
columns_to_keep = ['loan_amnt', 'term', 'int_rate', 'grade', 'emp_length', 'home_ownership',
                   'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'revol_util',
                   'total_acc', 'loan_status']

# Status labels mapped to the positive class (1); every other status maps to 0.
bad_statuses = ['Charged Off', 'Default', 'Late (31-120 days)']

# Index with the list object itself, then drop rows containing any null.
# `.copy()` gives an independent frame so the assignments below do not raise
# SettingWithCopyWarning.
df = df[columns_to_keep].dropna().copy()

# Convert loan_status to binary. The dtype guard makes the cell safe to re-run:
# without it, a second run would compare the already-encoded 0/1 values against
# the status strings and turn the whole target into zeros.
if not pd.api.types.is_numeric_dtype(df['loan_status']):
    df['loan_status'] = df['loan_status'].isin(bad_statuses).astype(int)

# Encode categorical columns: each text column gets its own LabelEncoder, which
# replaces the values with integer codes.
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

print("Preprocessed shape:", df.shape)

In [None]:
# Separate the binary target from the feature matrix.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Apply SMOTE to the training part only; the test part is left as it is.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [None]:
# Fit a LightGBM classifier (constructor defaults) on the SMOTE-resampled training data.
model_lgb = lgb.LGBMClassifier().fit(X_train_res, y_train_res)

# Class predictions for the held-out test rows.
y_pred_lgb = model_lgb.predict(X_test)


In [None]:
# Upper bound on the number of rows used to fit the SVM; set to None to fit on
# every resampled row. scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples, so an uncapped fit can be impractical.
# NOTE(review): 50,000 is a starting point — tune it to the time/memory available.
SVM_MAX_TRAIN_ROWS = 50_000

if SVM_MAX_TRAIN_ROWS is not None and len(X_train_res) > SVM_MAX_TRAIN_ROWS:
    # Stratified, seeded subsample of the resampled training data; the remainder
    # returned by train_test_split is discarded.
    X_train_svm, _, y_train_svm, _ = train_test_split(
        X_train_res,
        y_train_res,
        train_size=SVM_MAX_TRAIN_ROWS,
        stratify=y_train_res,
        random_state=42,
    )
else:
    X_train_svm, y_train_svm = X_train_res, y_train_res

# Standardize the features before the SVM: the kernel is distance-based, so
# unscaled columns with large magnitudes would dominate the rest.
# `random_state` makes the internal shuffling used for `probability=True`
# reproducible, matching the seed used by the split and SMOTE cells.
model_svm = make_pipeline(StandardScaler(), SVC(probability=True, random_state=42))
model_svm.fit(X_train_svm, y_train_svm)

# The pipeline applies the same scaling to the test rows before predicting.
y_pred_svm = model_svm.predict(X_test)


In [None]:
# Print a confusion matrix and a classification report for each model,
# LightGBM first and then the SVM, both scored against the same test labels.
for header, predictions in (
    ("🔸 LightGBM Results", y_pred_lgb),
    ("🔸 SVM Results", y_pred_svm),
):
    print(header)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))


In [None]:
# Plot the fitted LightGBM model's feature importances, limited to ten features.
importance_ax = lgb.plot_importance(model_lgb, max_num_features=10)
importance_ax.set_title("Top 10 Feature Importances (LightGBM)")
plt.show()


In [None]:
# Compute SHAP values for the test rows using a tree explainer built on the
# fitted LightGBM model.
explainer = shap.TreeExplainer(model_lgb)
shap_values = explainer.shap_values(X_test)

# Draw the SHAP summary as a bar plot.
shap.summary_plot(shap_values, features=X_test, plot_type="bar")
