In [1]:
# imports 

import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Scores
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

from sklearn.model_selection import cross_val_score


In [5]:
# Load data

data = pd.read_csv('../Original_data/Credit_Card.csv')
data = data.rename(columns={'default.payment.next.month': 'default'})
data.head(2)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1


### Preprocessing 

#### Balanced

In [6]:
# Balanced

data['default'].value_counts()

default
0    23364
1     6636
Name: count, dtype: int64

In [7]:

# Separate classes
majority = data[data['default'] == 0]
minority = data[data['default'] == 1]

# Downsample majority
majority_down = resample(
    majority,
    replace=False,            
    n_samples=len(minority),   
    random_state=42
)

# Combine balanced dataset
data_balanced = pd.concat([majority_down, minority]).sample(frac=1, random_state=42)

# Check new distribution
print(data_balanced['default'].value_counts())

default
0    6636
1    6636
Name: count, dtype: int64


#### Tranform and normalized dataset 

In [8]:
# Tranform and normalized dataset for prediction

X = data_balanced.drop("default", axis=1)
y = data_balanced["default"]

# Split original X
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Standardize using all rows of X, then match indices
stdX = (X - X.mean()) / X.std()

# Align standardized train/test with the split
stdX_train = stdX.loc[X_train.index]
stdX_test = stdX.loc[X_test.index]

### Logistic Regression

In [9]:
# Logistic Regression
C = 3.17
log_reg = LogisticRegression(
    C=C,
    solver='liblinear',
    max_iter=5000
)

# Fit 
log_reg.fit(stdX_train, y_train)

# Predict 
y_pred = log_reg.predict(stdX_test)
y_prob = log_reg.predict_proba(stdX_test)[:, 1]

# Performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6715218483174284

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.70      0.68      1991
           1       0.68      0.64      0.66      1991

    accuracy                           0.67      3982
   macro avg       0.67      0.67      0.67      3982
weighted avg       0.67      0.67      0.67      3982



### Desicion tree

In [10]:
tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,        # you can tune this
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

# Fit 
tree.fit(stdX_train, y_train)

# Predict 
y_pred = tree.predict(stdX_test)

# Performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6270718232044199

Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.61      0.62      1991
           1       0.62      0.64      0.63      1991

    accuracy                           0.63      3982
   macro avg       0.63      0.63      0.63      3982
weighted avg       0.63      0.63      0.63      3982



### Random forest

In [11]:

rf = RandomForestClassifier(
    n_estimators=300,      
    max_depth=None,        
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1              
)

# Fit 
rf.fit(stdX_train, y_train)

# Predict
y_pred = rf.predict(stdX_test)

# Performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7034153691612255

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.76      0.72      1991
           1       0.73      0.64      0.68      1991

    accuracy                           0.70      3982
   macro avg       0.71      0.70      0.70      3982
weighted avg       0.71      0.70      0.70      3982



### XGBoost

In [12]:
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

# Fit
xgb.fit(stdX_train, y_train)

# Predict
y_pred = xgb.predict(stdX_test)
y_prob = xgb.predict_proba(stdX_test)[:, 1]

# Performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7046710195881467

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.75      0.72      1991
           1       0.73      0.66      0.69      1991

    accuracy                           0.70      3982
   macro avg       0.71      0.70      0.70      3982
weighted avg       0.71      0.70      0.70      3982



###  Quick check before turning

In [13]:
# Cross-validation for RF

rf_scores = cross_val_score(
    rf,
    stdX, y,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1
)

print("Random Forest CV ROC-AUC:", rf_scores)
print("Mean ROC-AUC:", rf_scores.mean())

Random Forest CV ROC-AUC: [0.74990892 0.77202773 0.76735093 0.77961491 0.77576807]
Mean ROC-AUC: 0.7689341126258062


In [14]:
# Cross-validation for XGB

xgb_scores = cross_val_score(
    xgb,
    stdX, y,
    cv=5,
    scoring="roc_auc",
    n_jobs=1      
)

print("XGBoost CV ROC-AUC:", xgb_scores)
print("Mean ROC-AUC:", xgb_scores.mean())

XGBoost CV ROC-AUC: [0.75518086 0.78116715 0.77317058 0.77514539 0.77919837]
Mean ROC-AUC: 0.7727724702347181


In [16]:
data_balanced.to_csv('../Helper_csv-s/data_balanced.csv')

Exception ignored in: <function ResourceTracker.__del__ at 0x1026a1da0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x107251da0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x103425da0>
Traceback (most recent call last