In [1]:
# imports 

import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Scores
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

from sklearn.model_selection import cross_val_score


In [4]:
# Read the raw credit-card CSV and give the target column the short name `default`.
# NOTE(review): the saved run of this cell ended in FileNotFoundError for this
# path — confirm where the CSV actually lives before re-running.
raw_csv_path = '//Original_data/Credit_Card.csv'
column_renames = {'default.payment.next.month': 'default'}

data = pd.read_csv(raw_csv_path).rename(columns=column_renames)
data.head(2)

FileNotFoundError: [Errno 2] No such file or directory: '//Original_data/Credit_Card.csv'

### Preprocessing 

#### Balanced

In [None]:
# Class balance check: number of rows for each value of the `default` target
target_counts = data['default'].value_counts()
target_counts

In [None]:

# Split the rows by target value; the code treats class 0 as the larger class.
target = data['default']
majority = data[target == 0]
minority = data[target == 1]

# Draw class-0 rows without replacement until both classes have the same
# number of rows (seeded so the draw is repeatable).
n_minority = len(minority)
majority_down = resample(majority, replace=False, n_samples=n_minority, random_state=42)

# Stack the two classes, then shuffle the rows with a fixed seed.
balanced_unshuffled = pd.concat([majority_down, minority])
data_balanced = balanced_unshuffled.sample(frac=1, random_state=42)

# Sanity check: per-class row counts after balancing
balanced_counts = data_balanced['default'].value_counts()
print(balanced_counts)

#### Transform and normalize dataset

In [None]:
# Transform and normalize the dataset for prediction

# Features are every column except the target; the target is `default`.
X = data_balanced.drop("default", axis=1)
y = data_balanced["default"]

# Stratified 70/30 hold-out split of the raw (unscaled) features
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit the standardization statistics on the TRAINING rows only. Computing the
# mean/std over all of X would leak test-set information into the transform
# the models are trained on.
train_mean = X_train.mean()
# A constant column has std 0; dividing by it would turn the whole column into
# NaN, so such columns are only centered.
train_std = X_train.std().replace(0, 1)

# Apply the same train-derived transform to both splits
stdX_train = (X_train - train_mean) / train_std
stdX_test = (X_test - train_mean) / train_std

# Full standardized matrix (same train-derived statistics); kept because later
# cells pass `stdX` to cross_val_score.
stdX = (X - train_mean) / train_std

### Logistic Regression

In [None]:
# Logistic Regression
# NOTE(review): the origin of C = 3.17 is not shown in this notebook — confirm
# it came from a tuning run.
C = 3.17
log_reg = LogisticRegression(
    C=C,
    solver='liblinear',
    max_iter=5000
)

# Fit on the standardized training split
log_reg.fit(stdX_train, y_train)

# Predict hard labels and the probability of the second class in
# log_reg.classes_ (label 1 when the target is coded 0/1)
y_pred = log_reg.predict(stdX_test)
y_prob = log_reg.predict_proba(stdX_test)[:, 1]

# Performance on the hold-out split. ROC-AUC uses the predicted probabilities
# (previously computed but never used) and matches the metric used by the
# cross-validation cells.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


### Decision tree

In [None]:
# Gini decision tree with no depth limit (max_depth is the obvious knob to tune)
tree = DecisionTreeClassifier(
    random_state=42,
    min_samples_leaf=1,
    min_samples_split=2,
    max_depth=None,
    criterion='gini',
)
tree.fit(stdX_train, y_train)

# Hard-label predictions for the hold-out split
y_pred = tree.predict(stdX_test)

# Hold-out metrics
tree_accuracy = accuracy_score(y_test, y_pred)
tree_report = classification_report(y_test, y_pred)
print("Accuracy:", tree_accuracy)
print("\nClassification Report:\n", tree_report)


### Random forest

In [None]:

# Random forest: 300 unrestricted-depth trees, trained on all available cores
rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    min_samples_leaf=1,
    min_samples_split=2,
    max_depth=None,
    n_estimators=300,
)
rf.fit(stdX_train, y_train)

# Hard-label predictions for the hold-out split
y_pred = rf.predict(stdX_test)

# Hold-out metrics
rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
print("Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_report)


### XGBoost

In [None]:
# Gradient-boosted trees: 300 rounds at learning rate 0.05, depth-6 trees,
# with 80% row and 80% column subsampling per tree
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

# Fit on the standardized training split
xgb.fit(stdX_train, y_train)

# Predict hard labels and the probability in column 1 of predict_proba
# (presumably the positive class, label 1 — confirm against xgb.classes_)
y_pred = xgb.predict(stdX_test)
y_prob = xgb.predict_proba(stdX_test)[:, 1]

# Performance on the hold-out split. ROC-AUC uses the predicted probabilities
# (previously computed but never used) and matches the metric used by the
# cross-validation cells.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


### Quick check before tuning

In [None]:
# 5-fold cross-validated ROC-AUC for the random forest on the full
# standardized feature matrix, with folds evaluated in parallel
rf_scores = cross_val_score(estimator=rf, X=stdX, y=y, scoring="roc_auc", cv=5, n_jobs=-1)

# Per-fold scores, then their average
print("Random Forest CV ROC-AUC:", rf_scores)
rf_mean_auc = rf_scores.mean()
print("Mean ROC-AUC:", rf_mean_auc)

In [None]:
# 5-fold cross-validated ROC-AUC for XGBoost on the full standardized feature
# matrix; folds run one at a time (n_jobs=1) while the model itself uses n_jobs=-1
xgb_scores = cross_val_score(estimator=xgb, X=stdX, y=y, scoring="roc_auc", cv=5, n_jobs=1)

# Per-fold scores, then their average
print("XGBoost CV ROC-AUC:", xgb_scores)
xgb_mean_auc = xgb_scores.mean()
print("Mean ROC-AUC:", xgb_mean_auc)

In [None]:
# Write the balanced dataset to CSV.
# NOTE(review): the relative 'Helper_csv-s' directory must already exist, and the
# DataFrame index is written as the first CSV column — confirm both are intended.
balanced_csv_path = 'Helper_csv-s/data_balanced.csv'
data_balanced.to_csv(balanced_csv_path)