# Credit Default Prediction with Logistic Regression and SMOTE

This notebook demonstrates how to build a credit default prediction model using Logistic Regression, with SMOTE to address class imbalance. It covers the preprocessing steps (missing value treatment, handling infinite/large values, and log-transforming monetary columns), followed by feature scaling, resampling, model training, and evaluation.

## 1. Install and Import Required Libraries

In [None]:
!pip install imbalanced-learn pandas numpy matplotlib seaborn scikit-learn --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc
from imblearn.over_sampling import SMOTE

## 2. Load and Explore the Dataset

In [None]:
# Read the credit-card dataset; the path is relative to the working directory,
# so the CSV must sit in a `data/` sub-folder.
df = pd.read_csv(filepath_or_buffer='data/UCI_Credit_Card.csv')
# Preview the first five rows
df.head(5)

## 3. Data Cleaning: Handle Missing, Infinite, and Large Values

In [None]:
# NOTE: this cell reassigns `df`; re-running it without reloading the data
# applies the log transform a second time.

# Drop the ID column if present (errors='ignore' makes this a no-op when absent)
df = df.drop(columns='ID', errors='ignore')

# Monetary columns: credit limit, bill statement amounts and payment amounts
money_cols = [col for col in df.columns if 'BILL_AMT' in col or 'PAY_AMT' in col or col == 'LIMIT_BAL']

# Coerce monetary columns to numeric *before* imputation, so any value that
# fails to parse becomes NaN and is filled below instead of surviving as NaN.
for col in money_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Replace inf/-inf with NaN, then fill NaN with the column median.
# numeric_only=True keeps the median well-defined if a non-numeric column exists.
# NOTE: medians are computed over the whole frame, i.e. before the train/test
# split performed later in the notebook.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(df.median(numeric_only=True))

# Log-transform monetary columns to reduce skewness and avoid large values.
# Negative amounts are clipped to 0 first so log1p never sees a value below -1.
for col in money_cols:
    df[col] = np.log1p(df[col].clip(lower=0))

# Check for any remaining NaNs
print('Missing values after cleaning:')
print(df.isnull().sum())

## 4. Prepare Features and Target

In [None]:
# Target: the default flag column; features: every remaining column.
y = df['default.payment.next.month']
X = df.drop(columns=['default.payment.next.month'])

## 5. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## 6. Feature Scaling

In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test-set statistics enter the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 7. Handle Class Imbalance with SMOTE

In [None]:
# Resample the scaled training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X=X_train_scaled, y=y_train)
# Per-class row counts after resampling
print(f"Training data after SMOTE: {np.bincount(y_train_smote)}")

## 8. Train Logistic Regression Model

In [None]:
# Fit the classifier on the SMOTE-resampled training data.
# NOTE(review): class_weight='balanced' is presumably redundant once SMOTE has
# equalised the class counts — confirm before removing.
model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_smote, y_train_smote)

## 9. Evaluate Model

In [None]:
# Predict class labels for the held-out test split (scaled with the scaler
# that was fitted on the training split).
y_pred = model.predict(X_test_scaled)

### Confusion Matrix

In [None]:
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

# Draw the matrix as an annotated heatmap and label the returned axes directly.
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False).set(
    xlabel='Predicted Label',
    ylabel='True Label',
    title='Confusion Matrix',
)
plt.show()

In [None]:
# Per-class precision/recall/F1, followed by overall accuracy on the test split.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy Score:", accuracy_score(y_test, y_pred), sep="\n")

### ROC Curve

In [None]:
# Column 1 of predict_proba holds the predicted probability of the positive class.
y_proba = model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)  # area under the ROC curve

plt.figure(figsize=(7, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.gca().set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic',
)
plt.legend(loc='lower right')
plt.show()