In [1]:
import pandas as pd
import numpy as np

# Read the full dataset; the path is relative to the notebook's directory
df = pd.read_csv(filepath_or_buffer="../data/full_data.csv")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [2]:
# (rows, columns) of the loaded frame
print("Dataset shape:", df.shape)

# Class balance of the target, expressed as fractions rather than raw counts
target_share = df["default.payment.next.month"].value_counts(normalize=True)
print(target_share)

Dataset shape: (30000, 25)
default.payment.next.month
0    0.7788
1    0.2212
Name: proportion, dtype: float64


In [3]:
# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

In [4]:
def evaluate_model(model, X_test, y_test, X_test_scaled=None, needs_scaling=False):
    """
    Score a fitted classifier on the test split and return its metrics.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and ``predict_proba``.
    X_test : array-like
        Unscaled test features; used when ``needs_scaling`` is False.
    y_test : array-like
        True labels for the test rows.
    X_test_scaled : array-like, optional
        Scaled test features; required when ``needs_scaling`` is True.
    needs_scaling : bool, default False
        Selects which feature matrix is passed to the model.

    Returns
    -------
    tuple
        ``(metrics, y_pred)`` where ``metrics`` is a dict with the keys
        "Accuracy", "AUC", "Precision", "Recall", "F1" and "MCC", and
        ``y_pred`` is the output of ``model.predict``.

    Raises
    ------
    ValueError
        If ``needs_scaling`` is True but ``X_test_scaled`` was not supplied.
    """
    # Fail early with a clear message instead of handing None to the model.
    if needs_scaling and X_test_scaled is None:
        raise ValueError(
            "X_test_scaled must be provided when needs_scaling=True"
        )

    # Pick the feature matrix once so both prediction calls see the same data.
    features = X_test_scaled if needs_scaling else X_test

    y_pred = model.predict(features)
    # Column 1 of predict_proba is used as the score for AUC.
    y_prob = model.predict_proba(features)[:, 1]

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

    return metrics, y_pred

In [5]:
# The default flag is the target; every other column becomes a feature.
# NOTE(review): the remaining columns still include the row identifier `ID`
# (visible in df.head()) — confirm whether it should be used as a predictor.
y = df["default.payment.next.month"]
X = df.drop(columns="default.payment.next.month")

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Sanity check: scaling must not change the shape of the training matrix
print(X_train.shape, X_train_scaled.shape, sep="\n")

(24000, 24)
(24000, 24)


In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression is trained and evaluated on the scaled features
log_reg = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_scaled, y_train
)

log_reg_metrics, log_reg_preds = evaluate_model(
    log_reg, X_test, y_test, X_test_scaled, True
)

log_reg_metrics

{'Accuracy': 0.8083333333333333,
 'AUC': 0.7077224240780342,
 'Precision': 0.6911447084233261,
 'Recall': 0.24114544084400905,
 'F1': 0.3575418994413408,
 'MCC': 0.3274527933593308}

In [10]:
# Decision tree is trained and evaluated on the unscaled features
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

dt_metrics, dt_preds = evaluate_model(model=dt, X_test=X_test, y_test=y_test)

dt_metrics

{'Accuracy': 0.7258333333333333,
 'AUC': 0.6138208222418353,
 'Precision': 0.38755304101838756,
 'Recall': 0.4129615674453655,
 'F1': 0.3998540678584458,
 'MCC': 0.22260931656159239}

In [11]:
# k-nearest neighbours (k=5) is trained and evaluated on the scaled features
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

knn_metrics, knn_preds = evaluate_model(
    knn, X_test, y_test, X_test_scaled, True
)

knn_metrics

{'Accuracy': 0.7935,
 'AUC': 0.6942130641626261,
 'Precision': 0.553012048192771,
 'Recall': 0.3458929917106255,
 'F1': 0.42559109874826145,
 'MCC': 0.32036676127672575}

In [12]:
# Gaussian naive Bayes is trained and evaluated on the scaled features
nb = GaussianNB().fit(X_train_scaled, y_train)

nb_metrics, nb_preds = evaluate_model(
    nb, X_test, y_test, X_test_scaled, True
)

nb_metrics

{'Accuracy': 0.7523333333333333,
 'AUC': 0.725098857923091,
 'Precision': 0.451316595223515,
 'Recall': 0.5553880934438583,
 'F1': 0.497972972972973,
 'MCC': 0.3391019125795756}

In [13]:
# Random forest of 100 trees on the unscaled features; n_jobs=-1 uses all cores
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

rf_metrics, rf_preds = evaluate_model(model=rf, X_test=X_test, y_test=y_test)

rf_metrics

{'Accuracy': 0.8146666666666667,
 'AUC': 0.7564438465548936,
 'Precision': 0.6420079260237781,
 'Recall': 0.36623963828183875,
 'F1': 0.4664107485604607,
 'MCC': 0.38529575205784866}

In [14]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [30]:
from xgboost import XGBClassifier

# XGBoost is trained and evaluated on the unscaled features, like the other
# tree-based models in this notebook.
# `use_label_encoder` is intentionally omitted: the installed XGBoost ignores
# it and only emits a "Parameters ... are not used" warning.
xgb = XGBClassifier(
    eval_metric="logloss",
    random_state=42
)

xgb.fit(X_train, y_train)

xgb_metrics, xgb_preds = evaluate_model(xgb, X_test, y_test)

xgb_metrics

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Accuracy': 0.809,
 'AUC': 0.7590199338146588,
 'Precision': 0.6161745827984596,
 'Recall': 0.3617181612660136,
 'F1': 0.45584045584045585,
 'MCC': 0.36763392405450007}

In [32]:
# Collect each model's metric dict under its display name ...
metrics_by_model = {
    "Logistic Regression": log_reg_metrics,
    "Decision Tree": dt_metrics,
    "kNN": knn_metrics,
    "Naive Bayes": nb_metrics,
    "Random Forest": rf_metrics,
    "XGBoost": xgb_metrics,
}

# ... and lay them out as one row per model, one column per metric
results_df = pd.DataFrame.from_dict(metrics_by_model, orient="index")

results_df

Unnamed: 0,Accuracy,AUC,Precision,Recall,F1,MCC
Logistic Regression,0.808333,0.707722,0.691145,0.241145,0.357542,0.327453
Decision Tree,0.725833,0.613821,0.387553,0.412962,0.399854,0.222609
kNN,0.7935,0.694213,0.553012,0.345893,0.425591,0.320367
Naive Bayes,0.752333,0.725099,0.451317,0.555388,0.497973,0.339102
Random Forest,0.814667,0.756444,0.642008,0.36624,0.466411,0.385296
XGBoost,0.809,0.75902,0.616175,0.361718,0.45584,0.367634


In [34]:
# Build test_data.csv for Streamlit upload: the held-out features with the
# true labels appended as the last column

test_data = X_test.assign(**{"default.payment.next.month": y_test.values})

# index=False keeps the pandas row index out of the file
test_data.to_csv("../data/test_data.csv", index=False)

print("test_data.csv created successfully")

test_data.csv created successfully


In [36]:
# Read the exported file back and preview its first five rows
pd.read_csv(filepath_or_buffer="../data/test_data.csv").head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,6908,50000.0,1,2,2,46,-1,-1,-1,-1,...,2650.0,3320.0,2764.0,26060.0,0.0,3472.0,2320.0,1764.0,2841.0,0
1,24576,150000.0,1,1,1,31,-1,-1,-2,-2,...,0.0,0.0,11694.0,0.0,0.0,0.0,0.0,11694.0,30000.0,0
2,26767,50000.0,1,2,2,25,0,0,0,0,...,49949.0,50479.0,50702.0,1800.0,1844.0,2200.0,2000.0,1800.0,2038.0,0
3,2157,290000.0,2,1,2,25,0,0,0,0,...,248801.0,241983.0,230925.0,15000.0,10500.0,10000.0,15000.0,7844.0,23333.0,1
4,3180,500000.0,2,2,1,27,-2,-2,-2,-2,...,10000.0,10000.0,10000.0,9983.0,13587.0,10000.0,10000.0,10000.0,25304.0,0
