# 02 - Frequentist Logistic Regression (Baseline)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, RocCurveDisplay, PrecisionRecallDisplay

In [2]:
# Read the preprocessed fraud dataset from disk and preview its first rows
processed_csv = "../data/processed_fraud.csv"
df = pd.read_csv(processed_csv)
df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,False,False,True,False
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,False,False,True,False
2,1,181.0,181.0,0.0,0.0,0.0,1,0,False,False,False,True
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,True,False,False,False
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,False,False,True,False


In [3]:
# Define features and label.
# Columns named "Unnamed: ..." are the row index that was written into the CSV
# (see the first column of df.head()). They are not real features, and because
# the rows are ordered by `step` they would leak row position into the model,
# so they are excluded alongside the label.
index_cols = [col for col in df.columns if str(col).startswith("Unnamed")]
# Cast to float so the boolean one-hot `type_*` columns and the numeric columns
# share a single dtype before scaling.
X = df.drop(columns=["isFraud", *index_cols]).astype("float64")
y = df["isFraud"]

# Train-test split, stratified on the label so both splits keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Standardise the features. The penalised logistic regressions fitted below are
# sensitive to feature scale: the penalty acts on raw coefficient magnitudes and
# the `saga` solver is only guaranteed to converge quickly on similarly scaled
# features. The scaler is fitted on the training split only so no test-set
# statistics reach the model; the frames keep their column names and index.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)


In [4]:
# Report the (rows, columns) dimensions of each split as a sanity check
train_dims, test_dims = X_train.shape, X_test.shape
print("Training set shape:", train_dims)
print("Test set shape:", test_dims)

Training set shape: (5090096, 11)
Test set shape: (1272524, 11)


In [5]:
# Keyword arguments common to all three baselines: iteration cap,
# class re-weighting for the imbalanced label, and a fixed seed.
shared_kwargs = {"max_iter": 1000, "class_weight": "balanced", "random_state": 42}

# L1 model (liblinear solver)
clf_l1 = LogisticRegression(penalty="l1", solver="liblinear", **shared_kwargs)

# L2 model (lbfgs solver)
clf_l2 = LogisticRegression(penalty="l2", solver="lbfgs", **shared_kwargs)

# Elastic Net model (saga solver, equal mix of L1 and L2 via l1_ratio=0.5)
clf_el = LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=0.5, **shared_kwargs)

In [None]:
# Train the L1-penalised model on the training split
clf_l1.fit(X=X_train, y=y_train)

In [None]:
# Train the L2-penalised model on the training split
clf_l2.fit(X=X_train, y=y_train)

In [None]:

# Train the elastic-net model on the training split
clf_el.fit(X=X_train, y=y_train)

In [None]:
# Map a display label to each fitted estimator; insertion order fixes the
# order in which the later cells iterate over them.
model_labels = ("L1", "L2", "ElasticNet")
fitted_estimators = (clf_l1, clf_l2, clf_el)
models = dict(zip(model_labels, fitted_estimators))

In [None]:
# 4. Evaluate Models

def _show_curve(display_cls, y_true, scores, title):
    """Plot one diagnostic curve from true labels and scores, then title, grid and show it."""
    display_cls.from_predictions(y_true, scores)
    plt.title(title)
    plt.grid()
    plt.show()


for name, model in models.items():
    # Positive-class probability (second column) and hard class predictions
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    # Text metrics: AUC uses the probabilities, the other two the hard labels
    auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    print(f"\n===== {name} Penalty =====")
    print("ROC AUC:", auc)
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", matrix)

    # One figure per curve, ROC first and precision-recall second
    _show_curve(RocCurveDisplay, y_test, y_proba, f"ROC Curve - {name}")
    _show_curve(PrecisionRecallDisplay, y_test, y_proba, f"Precision-Recall Curve - {name}")


In [None]:
# 5. Coefficients

# Feature names label each coefficient; they are the same for every model
feature_names = X.columns

for name, model in models.items():
    # Flatten coef_ to one value per feature and label it
    coefs = pd.Series(model.coef_.flatten(), index=feature_names)
    # Order from most positive to most negative coefficient
    ranked = coefs.sort_values(ascending=False)
    print(f"\nTop coefficients for {name} Penalty:")
    print(ranked)