Data Cleaning

In [None]:
import pandas as pd
# Load the raw workbook; header=1 takes the column labels from the sheet's second row.
raw_path = 'default of credit card clients.xls'
df = pd.read_excel(raw_path, header=1)

In [None]:
# Build the old -> new column mapping programmatically rather than listing every pair.
# Columns whose name is unchanged (ID, EDUCATION, MARRIAGE, AGE) need no entry.
month_tags = ['SEP', 'AUG', 'JUL', 'JUN', 'MAY', 'APR']

column_map = {
    'LIMIT_BAL': 'CREDIT_LIMIT',
    'SEX': 'GENDER',
    'default payment next month': 'DEFAULT',
}

# The repayment-status columns are numbered 0, 2, 3, 4, 5, 6 (no PAY_1 is mapped).
for raw_idx, tag in zip([0, 2, 3, 4, 5, 6], month_tags):
    column_map[f'PAY_{raw_idx}'] = f'PAY_{tag}'

# Bill and payment-amount columns are numbered 1..6 in the same month order.
for position, tag in enumerate(month_tags, start=1):
    column_map[f'BILL_AMT{position}'] = f'BILL_{tag}'
    column_map[f'PAY_AMT{position}'] = f'PAY_AMT_{tag}'

df.rename(columns=column_map, inplace=True)

In [None]:
# Collapse category codes: EDUCATION 0/5/6 -> 4 and MARRIAGE 0 -> 3.
# NOTE(review): presumably these are undocumented codes folded into an "other"
# bucket — confirm against the dataset's data dictionary.
df['EDUCATION'] = df['EDUCATION'].replace([0, 5, 6], 4)
df['MARRIAGE'] = df['MARRIAGE'].replace(0, 3)

In [None]:
# Quick structural overview of the cleaned frame.
print("Shape:", df.shape)

# DataFrame.info() writes its report directly to stdout and returns None, so it
# must be called on its own; wrapping it in print() emits a stray "None".
print("\nData info:")
df.info()

print("\nMissing values:\n", df.isnull().sum())
print("\nTarget variable distribution:\n", df['DEFAULT'].value_counts(normalize=True))

Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling for every figure below.
sns.set_theme(style='whitegrid', palette='Set2')

In [None]:
# Class distribution of the target column
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x='DEFAULT', ax=ax)
ax.set_title('Target Class Distribution (Default = 1)')
ax.set_xlabel('Default')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Gender vs. default: row counts per gender, split by the DEFAULT flag.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x='GENDER', hue='DEFAULT', ax=ax)
ax.set_title('Default Rate by Gender')
ax.set_xlabel('Gender (1=Male, 2=Female)')
plt.show()

In [None]:
# Education vs. default: row counts per education code, split by the DEFAULT flag.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='EDUCATION', hue='DEFAULT', ax=ax)
ax.set_title('Default Rate by Education Level')
ax.set_xlabel('Education')
plt.show()

In [None]:
# Marital status vs. default: row counts per marriage code, split by the DEFAULT flag.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='MARRIAGE', hue='DEFAULT', ax=ax)
ax.set_title('Default Rate by Marital Status')
ax.set_xlabel('Marital Status')
plt.show()

In [None]:
# Age distribution: 30-bin histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['AGE'], bins=30, kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()

In [None]:
# Pairwise correlation of every column in df (ID and DEFAULT included),
# drawn as a heatmap with a diverging colour map centred on zero.
corr = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, cmap='coolwarm', center=0, annot=False, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

Feature Preprocessing

In [None]:
# Separate the target from the predictors; ID is excluded from the features too.
y = df['DEFAULT']
X = df.drop(['ID', 'DEFAULT'], axis=1)

In [None]:
# Columns that are one-hot encoded; every other column of X is treated as numeric.
categorical_features = ['GENDER', 'EDUCATION', 'MARRIAGE']

# A boolean mask keeps the numeric columns in their original X order.
is_categorical = X.columns.isin(categorical_features)
numerical_features = X.columns[~is_categorical].tolist()

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 hold-out split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Class proportions of each split before any resampling is applied.
print("Before SMOTE:")
for split_label, split_target in (("Training", y_train), ("Testing", y_test)):
    print(f"{split_label} set class distribution:\n", split_target.value_counts(normalize=True))

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns are standardised; categorical columns are one-hot encoded, and
# categories not seen during fit are ignored at transform time.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# The order of this list fixes the output column order: 'num' block, then 'cat' block.
column_steps = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Fit on the training split only, then apply the same fitted transform to both splits.
preprocessor.fit(X_train)
X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train_preprocessed, y_train)
X_train_resampled, y_train_resampled = resampled_pair

print("\nAfter SMOTE:")
print("Training set class distribution:\n", y_train_resampled.value_counts(normalize=True))
for matrix_label, matrix in (
    ("X_train_resampled", X_train_resampled),
    ("X_test_preprocessed", X_test_preprocessed),
):
    print(f"{matrix_label} shape:", matrix.shape)

Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers keyed by display name; insertion order is the evaluation order.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["XGBoost"] = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

In [None]:
from sklearn.model_selection import cross_validate, StratifiedKFold
import numpy as np

# Metrics collected for every fold.
scoring = 'accuracy precision recall f1'.split()

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(5, shuffle=True, random_state=42)

In [None]:
# Cross-validate every candidate with SMOTE applied *inside* each training fold.
#
# Running CV on the already-resampled matrix is leaky: synthetic rows interpolated
# from samples that land in a validation fold end up in the training folds, and the
# validation folds themselves contain synthetic rows. Both inflate the scores.
# An imblearn Pipeline resamples only while fitting, so each validation fold stays
# made of real, un-resampled observations.
from imblearn.pipeline import Pipeline as ImbPipeline

cv_results = []

for name, model in models.items():
    resample_then_fit = ImbPipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('model', model),
    ])
    scores = cross_validate(
        resample_then_fit,
        X_train_preprocessed,
        y_train,
        cv=cv,
        scoring=scoring,
    )
    # One summary row per model: the mean of each metric across the folds.
    cv_results.append({
        'Model': name,
        'Accuracy': np.mean(scores['test_accuracy']),
        'Precision': np.mean(scores['test_precision']),
        'Recall': np.mean(scores['test_recall']),
        'F1 Score': np.mean(scores['test_f1'])
    })

In [None]:
# Tabulate the per-model CV means and show them ranked by F1, best first.
results_df = pd.DataFrame(data=cv_results)
ranked_results = results_df.sort_values('F1 Score', ascending=False, ignore_index=True)

print("Cross-Validation Results (Mean Scores Across 5 Folds):")
print(ranked_results)

Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve

# The evaluated model is hard-coded to a default Random Forest (it is not picked
# programmatically from the CV table) and trained on the SMOTE-resampled data.
rf_baseline = RandomForestClassifier(random_state=42)
best_model = rf_baseline.fit(X_train_resampled, y_train_resampled)
best_model

In [None]:
# Hard labels, plus the probability of the second class column (index 1), on the test split.
y_pred = best_model.predict(X_test_preprocessed)
test_proba_matrix = best_model.predict_proba(X_test_preprocessed)
y_proba = test_proba_matrix[:, 1]

In [None]:
# Per-class precision / recall / F1 of the baseline model on the test split.
print("Classification Report:\n", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Confusion matrix of the baseline predictions: rows are actual labels, columns predicted.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# ROC curve of the baseline model; the dashed diagonal is the chance line.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.3f})')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_title('ROC-AUC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# Precision-recall trade-off of the baseline model across all score thresholds.
precision, recall, _ = precision_recall_curve(y_test, y_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(recall, precision, color='green')
ax.set_title('Precision-Recall Curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
plt.show()

In [None]:
# Feature labels in the column order produced by `preprocessor`: its transformer
# list puts the 'num' block first and the 'cat' block second, so the numeric names
# come first, followed by the one-hot names reported by the fitted encoder.
# (The former `ohe_feature_names` lookup was dropped: it was never used.)
num_features = list(numerical_features)
cat_features = list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
all_features = num_features + cat_features

# Guard against mislabeled importance plots: one label per preprocessed column.
assert len(all_features) == X_train_preprocessed.shape[1], \
    "feature labels do not match the preprocessed matrix width"

In [None]:
# Rank the baseline model's feature importances and keep the 20 largest.
importances = best_model.feature_importances_
descending_order = np.argsort(importances)[::-1]
indices = descending_order[:20]

feature_labels = np.array(all_features)

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=importances[indices], y=feature_labels[indices], palette='viridis', ax=ax)
ax.set_title('Top 20 Most Important Features')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

In [None]:
# Untuned XGBoost model trained on the same SMOTE-resampled data as the baseline.
xgb_settings = dict(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model = XGBClassifier(**xgb_settings)
xgb_model.fit(X_train_resampled, y_train_resampled)

In [None]:
# Hard labels, plus the probability of the second class column (index 1), on the test split.
y_pred_xgb = xgb_model.predict(X_test_preprocessed)
xgb_proba_matrix = xgb_model.predict_proba(X_test_preprocessed)
y_proba_xgb = xgb_proba_matrix[:, 1]

In [None]:
# Per-class precision / recall / F1 of the untuned XGBoost model on the test split.
print("Classification Report (XGBoost):\n", classification_report(y_test, y_pred_xgb), sep="\n")

In [None]:
# Rank the XGBoost feature importances and keep the 20 largest.
importances_xgb = xgb_model.feature_importances_
descending_order_xgb = np.argsort(importances_xgb)[::-1]
indices_xgb = descending_order_xgb[:20]

feature_labels_xgb = np.array(all_features)

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=importances_xgb[indices_xgb],
    y=feature_labels_xgb[indices_xgb],
    palette='magma',
    ax=ax,
)
ax.set_title('Top 20 Most Important Features - XGBoost')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Randomized hyperparameter search for the Random Forest.
#
# The search cross-validates a SMOTE -> model pipeline on the *un-resampled*
# training matrix. Searching directly on pre-resampled data leaks synthetic
# neighbours of validation rows into the training folds and scores F1 on
# synthetic samples, which biases both the scores and the selected parameters.
# An imblearn Pipeline resamples only during fit, so validation folds stay real.
from imblearn.pipeline import Pipeline as ImbPipeline

rf_model = RandomForestClassifier(random_state=42)

rf_params = {
    'n_estimators': randint(100, 400),
    'max_depth': randint(4, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2', None]
}

rf_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('model', rf_model),
])

rf_search = RandomizedSearchCV(
    rf_pipeline,
    # Route every hyperparameter to the pipeline's 'model' step.
    {f'model__{param}': candidates for param, candidates in rf_params.items()},
    n_iter=20,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# best_estimator_ is the refit pipeline; its predict / predict_proba skip the
# SMOTE step, so it can be applied to X_test_preprocessed like a plain model.
rf_search.fit(X_train_preprocessed, y_train)
print("\nBest Random Forest Parameters:", rf_search.best_params_)

In [None]:
# Evaluate the refit best estimator from the Random Forest search on the test split.
rf_best = rf_search.best_estimator_
y_pred_rf_tuned = rf_best.predict(X_test_preprocessed)

print(
    "\nClassification Report (Tuned Random Forest):\n",
    classification_report(y_test, y_pred_rf_tuned),
    sep="\n",
)

In [None]:
# Randomized hyperparameter search for XGBoost.
#
# As with any resampling step, SMOTE must sit inside the cross-validation loop:
# searching on pre-resampled data leaks synthetic neighbours of validation rows
# into the training folds and scores F1 on synthetic samples. The imblearn
# Pipeline resamples only during fit, so every validation fold stays real.
from imblearn.pipeline import Pipeline as ImbPipeline

# Note: this rebinds `xgb_model` to a fresh, unfitted estimator.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# scipy's uniform(loc, scale) samples from [loc, loc + scale].
xgb_params = {
    'n_estimators': randint(100, 400),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'gamma': uniform(0, 0.3)
}

xgb_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('model', xgb_model),
])

xgb_search = RandomizedSearchCV(
    xgb_pipeline,
    # Route every hyperparameter to the pipeline's 'model' step.
    {f'model__{param}': candidates for param, candidates in xgb_params.items()},
    n_iter=20,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# best_estimator_ is the refit pipeline; its predict / predict_proba skip the
# SMOTE step, so it can be applied to X_test_preprocessed like a plain model.
xgb_search.fit(X_train_preprocessed, y_train)
print("\nBest XGBoost Parameters:", xgb_search.best_params_)

In [None]:
# Evaluate the refit best estimator from the XGBoost search on the test split.
xgb_best = xgb_search.best_estimator_
y_pred_xgb_tuned = xgb_best.predict(X_test_preprocessed)

print(
    "\nClassification Report (Tuned XGBoost):\n",
    classification_report(y_test, y_pred_xgb_tuned),
    sep="\n",
)

In [None]:
# Probability of the second class column (index 1) from each tuned model on the test split.
rf_probs, xgb_probs = (
    tuned_model.predict_proba(X_test_preprocessed)[:, 1]
    for tuned_model in (rf_best, xgb_best)
)

def find_best_threshold(y_true, probs):
    """Find the score threshold that maximises F1 on the given labels.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels.
    probs : array-like
        Positive-class scores, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(best_threshold, precision, recall, f1)`` at the F1-maximising
        point of the precision-recall curve.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, probs)

    # The precision/recall arrays carry one trailing entry with no matching
    # threshold. Drop it so every candidate index maps to a real threshold and
    # the returned precision/recall always belong to the returned threshold.
    precisions = precisions[:len(thresholds)]
    recalls = recalls[:len(thresholds)]

    # Exact F1, defined as 0 where precision + recall == 0; this avoids an
    # epsilon in the denominator, which would bias the reported score.
    denom = precisions + recalls
    f1_scores = np.divide(
        2 * precisions * recalls,
        denom,
        out=np.zeros_like(denom, dtype=float),
        where=denom > 0,
    )

    best_idx = int(f1_scores.argmax())
    return thresholds[best_idx], precisions[best_idx], recalls[best_idx], f1_scores[best_idx]

# Pick the F1-maximising threshold for each tuned model.
# NOTE(review): the thresholds are selected using y_test, so any metric later
# computed on y_test with these thresholds is optimistic — consider choosing
# them on a separate validation split instead.
rf_thr, rf_prec, rf_rec, rf_f1 = find_best_threshold(y_test, rf_probs)
xgb_thr, xgb_prec, xgb_rec, xgb_f1 = find_best_threshold(y_test, xgb_probs)

threshold_summaries = (
    ("Best Threshold (Tuned RF):", rf_thr, rf_prec, rf_rec, rf_f1),
    ("\nBest Threshold (Tuned XGB):", xgb_thr, xgb_prec, xgb_rec, xgb_f1),
)
for heading, best_thr, best_prec, best_rec, best_f1 in threshold_summaries:
    print(heading, best_thr)
    print(f"Precision={best_prec:.2f}, Recall={best_rec:.2f}, F1={best_f1:.2f}")

In [None]:
# Turn probabilities into labels with the selected thresholds (>= threshold -> 1).
rf_preds = (rf_probs >= rf_thr).astype(int)
xgb_preds = (xgb_probs >= xgb_thr).astype(int)

thresholded_outputs = (
    ("Random Forest (Tuned Threshold):", rf_preds),
    ("XGBoost (Tuned Threshold):", xgb_preds),
)
for report_title, thresholded_preds in thresholded_outputs:
    print(report_title)
    print(classification_report(y_test, thresholded_preds))