In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Read the raw credit-card dataset from disk
df = pd.read_csv('data/UCI_Credit_Card.csv')

# Discard the ID column when the file has one; errors='ignore' makes the
# call a no-op when the column is absent
df = df.drop(columns='ID', errors='ignore')

# Clean the data: treat +/-inf as missing, then impute every missing value
# with the median of its column.
# numeric_only=True keeps df.median() from raising a TypeError on pandas >= 2.0
# if the CSV ever contains a non-numeric column; for an all-numeric frame the
# imputed values are unchanged.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(df.median(numeric_only=True))

# Log-transform the monetary columns (credit limit, bill amounts, payment
# amounts) to reduce skewness and compress very large values.
money_cols = [
    col for col in df.columns
    if col == 'LIMIT_BAL' or 'BILL_AMT' in col or 'PAY_AMT' in col
]
for col in money_cols:
    # log1p is undefined at or below -1, so negative amounts are clipped to
    # zero before the transform.
    # NOTE(review): clipping maps every negative amount to the same value as
    # zero, so any information carried by negative values is discarded.
    df[col] = np.log1p(df[col].clip(lower=0))

# Split the frame into the label vector y and the feature matrix X
target_col = 'default.payment.next.month'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in the train and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize the features: learn the scaling statistics from the training
# split only, then apply that same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample with SMOTE on the scaled training rows only; the test split is
# never resampled
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Report the per-class row counts after resampling to confirm the balance
class_counts = np.bincount(y_train_smote)
print(f"Training data after SMOTE: {class_counts}")

# Train Logistic Regression model on the SMOTE-resampled training data.
# class_weight='balanced' is intentionally not passed: SMOTE has already
# equalised the class counts, so the 'balanced' weights
# n_samples / (n_classes * class_count) would both be exactly 1.0 and have no
# effect on the fit. Keeping the argument only suggests that two rebalancing
# schemes are stacked. If the resampling step is ever changed so the classes
# are no longer equal, reconsider passing class_weight here.
model = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)
model.fit(X_train_smote, y_train_smote)

# Predict class labels for the scaled test set (which was never resampled)
y_pred = model.predict(X_test_scaled)

# Evaluate the test-set predictions: each heading is printed on its own line,
# followed by the corresponding metric
evaluation_sections = (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
    ("Accuracy Score:", accuracy_score(y_test, y_pred)),
)
for heading, result in evaluation_sections:
    print(heading)
    print(result)


Training data after SMOTE: [18691 18691]
Confusion Matrix:
[[3534 1139]
 [ 514  813]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      4673
           1       0.42      0.61      0.50      1327

    accuracy                           0.72      6000
   macro avg       0.64      0.68      0.65      6000
weighted avg       0.77      0.72      0.74      6000

Accuracy Score:
0.7245
