In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [2]:
# Load the CSV into `df`. The path is relative, so it resolves against the
# kernel's current working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/fraud_data_processed.csv"
)


In [3]:
# The "class" column is the label; every other column is a feature.
y = df["class"]
X = df.drop(columns=["class"])

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
# Partition the feature columns by dtype: numeric columns get standardised,
# everything else is treated as categorical and one-hot encoded.
cat_cols = X_train.select_dtypes(exclude="number").columns
num_cols = X_train.select_dtypes(include="number").columns

# handle_unknown="ignore": a category not seen during fit is encoded as all
# zeros at transform time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)


In [5]:
# Logistic-regression model on the preprocessed features.
#
# A Pipeline keeps a reference to the transformer it is given and fits it in
# place (it does not copy it). Cloning the shared, unfitted `preprocessor` gives
# this pipeline its own transformer, so fitting another pipeline built from the
# same `preprocessor` object cannot re-fit the one inside this model.
log_reg = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", LogisticRegression(
        class_weight="balanced",  # reweight classes to offset label imbalance
        max_iter=1000,
        random_state=42
    ))
])

log_reg.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the positive class; it feeds the
# ranking metric (average precision). F1 and the confusion matrix use the hard
# labels returned by predict().
y_proba = log_reg.predict_proba(X_test)[:,1]
y_pred = log_reg.predict(X_test)

print("LogReg AUC-PR:", average_precision_score(y_test, y_proba))
print("LogReg F1:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


LogReg AUC-PR: 0.6543021345975666
LogReg F1: 0.6684503901895206
Confusion Matrix:
 [[27237   156]
 [ 1331  1499]]


In [6]:
# Random-forest model on the preprocessed features.
#
# A Pipeline keeps a reference to the transformer it is given and fits it in
# place (it does not copy it). Cloning the shared, unfitted `preprocessor` gives
# this pipeline its own transformer, so fitting it here does not re-fit the
# transformer held by any other pipeline built from the same object.
rf = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        class_weight="balanced",  # reweight classes to offset label imbalance
        random_state=42,
        n_jobs=-1  # use all available cores
    ))
])

rf.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the positive class; it feeds the
# ranking metric (average precision). F1 and the confusion matrix use the hard
# labels returned by predict().
y_proba_rf = rf.predict_proba(X_test)[:,1]
y_pred_rf = rf.predict(X_test)

print("RF AUC-PR:", average_precision_score(y_test, y_proba_rf))
print("RF F1:", f1_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


RF AUC-PR: 0.6219482984594581
RF F1: 0.5950966955992082
Confusion Matrix:
 [[25610  1783]
 [  876  1954]]


In [7]:
# 5-fold stratified cross-validation of the random-forest pipeline on the
# training split only; the held-out test split is not touched here.
# Shuffling with a fixed seed makes the fold assignment repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One average-precision score per fold.
scores = cross_val_score(rf, X_train, y_train, cv=cv, scoring="average_precision")

print("Mean CV AUC-PR:", np.mean(scores))
print("Std CV AUC-PR:", np.std(scores))


Mean CV AUC-PR: 0.697048281043043
Std CV AUC-PR: 0.03053946714737323
