In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn modules for preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb  

In [17]:
# Evaluation Metrics
from sklearn.metrics import (
    accuracy_score, 
    roc_auc_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    matthews_corrcoef
)

# random seed for reproducibility
RANDOM_STATE = 42

In [18]:
# Load the dataset
df = pd.read_csv('loan_data.csv')

# Separate features (X) and target (y)
target = 'loan_status'
X = df.drop(columns=[target])
y = df[target]

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

print(f"Categorical Columns: {categorical_cols}")
print(f"Numerical Columns: {numerical_cols}")

# Create a preprocessing pipeline
# - OneHotEncoder for categorical variables (handles 'male', 'female', etc.)
# - StandardScaler for numerical variables (crucial for KNN and Logistic Regression)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ]
)

# Split the data into training and testing sets (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Apply preprocessing
# Fit on training data, transform both training and testing data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Data preprocessing complete.")
print(f"Training set shape: {X_train_processed.shape}")
print(f"Test set shape: {X_test_processed.shape}")

Categorical Columns: ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']
Numerical Columns: ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score']
Data preprocessing complete.
Training set shape: (36000, 27)
Test set shape: (9000, 27)


In [19]:
def evaluate_model(model, X_test, y_test, model_name):
    """
    Evaluates a classification model and prints key metrics.
    """
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Predict probabilities (needed for AUC Score)
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = None
    
    # Calculate metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else "N/A"
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    # Print results
    print(f"--- {model_name} Evaluation ---")
    print(f"Accuracy:  {acc:.4f}")
    print(f"AUC Score: {auc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"MCC Score: {mcc:.4f}")
    print("-" * 30)
    
    return {
        "Model": model_name,
        "Accuracy": acc,
        "AUC": auc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "MCC": mcc
    }

results = []

In [20]:
# train Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr_model.fit(X_train_processed, y_train)

# Evaluate
lr_results = evaluate_model(lr_model, X_test_processed, y_test, "Logistic Regression")
print(lr_results)
results.append(lr_results)

--- Logistic Regression Evaluation ---
Accuracy:  0.8946
AUC Score: 0.9527
Precision: 0.7762
Recall:    0.7418
F1 Score:  0.7586
MCC Score: 0.6915
------------------------------
{'Model': 'Logistic Regression', 'Accuracy': 0.8945555555555555, 'AUC': np.float64(0.9526841472181297), 'Precision': 0.7761582509109839, 'Recall': 0.7417910447761195, 'F1': 0.7585856016280844, 'MCC': np.float64(0.6914666346995662)}


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [21]:
# train Decision Tree
dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_model.fit(X_train_processed, y_train)

# Evaluate
dt_results = evaluate_model(dt_model, X_test_processed, y_test, "Decision Tree")
print(dt_results)
results.append(dt_results)

--- Decision Tree Evaluation ---
Accuracy:  0.9017
AUC Score: 0.8635
Precision: 0.7719
Recall:    0.7945
F1 Score:  0.7830
MCC Score: 0.7196
------------------------------
{'Model': 'Decision Tree', 'Accuracy': 0.9016666666666666, 'AUC': np.float64(0.8635011637093503), 'Precision': 0.7718704688255196, 'Recall': 0.7945273631840796, 'F1': 0.7830350576121599, 'MCC': np.float64(0.7196050841085211)}


In [22]:
# train KNN
# Choosing k=5 as a standard default
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_processed, y_train)

# Evaluate
knn_results = evaluate_model(knn_model, X_test_processed, y_test, "K-Nearest Neighbor")
print(knn_results)
results.append(knn_results)

--- K-Nearest Neighbor Evaluation ---
Accuracy:  0.8938
AUC Score: 0.9258
Precision: 0.7961
Recall:    0.7050
F1 Score:  0.7478
MCC Score: 0.6828
------------------------------
{'Model': 'K-Nearest Neighbor', 'Accuracy': 0.8937777777777778, 'AUC': np.float64(0.9257952725642176), 'Precision': 0.7960674157303371, 'Recall': 0.7049751243781095, 'F1': 0.7477572559366754, 'MCC': np.float64(0.6828107080043633)}


In [23]:
# train Gaussian Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train_processed, y_train)

# Evaluate
nb_results = evaluate_model(nb_model, X_test_processed, y_test, "Naive Bayes (Gaussian)")
print(nb_results)
results.append(nb_results)

--- Naive Bayes (Gaussian) Evaluation ---
Accuracy:  0.7364
AUC Score: 0.9362
Precision: 0.4586
Recall:    0.9980
F1 Score:  0.6284
MCC Score: 0.5493
------------------------------
{'Model': 'Naive Bayes (Gaussian)', 'Accuracy': 0.7364444444444445, 'AUC': np.float64(0.9361577306600047), 'Precision': 0.4586191129401006, 'Recall': 0.9980099502487563, 'F1': 0.6284461152882206, 'MCC': np.float64(0.5493361098610332)}


In [24]:
# train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf_model.fit(X_train_processed, y_train)

# Evaluate
rf_results = evaluate_model(rf_model, X_test_processed, y_test, "Random Forest")
print(rf_results)
results.append(rf_results)

--- Random Forest Evaluation ---
Accuracy:  0.9291
AUC Score: 0.9738
Precision: 0.8933
Recall:    0.7751
F1 Score:  0.8300
MCC Score: 0.7887
------------------------------
{'Model': 'Random Forest', 'Accuracy': 0.9291111111111111, 'AUC': np.float64(0.9737561121431468), 'Precision': 0.893348623853211, 'Recall': 0.7751243781094528, 'F1': 0.8300479488545551, 'MCC': np.float64(0.7887061186849529)}


In [25]:
# train XGBoost
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False, 
    eval_metric='logloss', 
    random_state=RANDOM_STATE
)
xgb_model.fit(X_train_processed, y_train)

# Evaluate
xgb_results = evaluate_model(xgb_model, X_test_processed, y_test, "XGBoost")
print(xgb_results)
results.append(xgb_results)

--- XGBoost Evaluation ---
Accuracy:  0.9358
AUC Score: 0.9787
Precision: 0.8887
Recall:    0.8144
F1 Score:  0.8499
MCC Score: 0.8104
------------------------------
{'Model': 'XGBoost', 'Accuracy': 0.9357777777777778, 'AUC': np.float64(0.9787303824226506), 'Precision': 0.8887079261672095, 'Recall': 0.8144278606965174, 'F1': 0.8499480789200415, 'MCC': np.float64(0.8104403812492864)}


Parameters: { "use_label_encoder" } are not used.

