In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

class FraudFeatureEngineerDL(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.merchant_fraud_rate_map = {}
        self.category_fraud_rate_map = {}
        self.category_encoder = LabelEncoder()
        self.categorical_cols = ['category']

    def fit(self, X, y=None):
        df = X.copy()
        df['fraud'] = y.values
        self.merchant_fraud_rate_map = df.groupby('merchant')['fraud'].mean().to_dict()
        self.category_fraud_rate_map = df.groupby('category')['fraud'].mean().to_dict()
        self.category_encoder.fit(df['category'])
        return self

    def transform(self, X):
        df = X.copy()
        df = df.drop(columns=["zipcodeOri", "zipMerchant"], errors='ignore')
        df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)

        df['age'] = df['age'].replace('U', -1)
        df['age'] = pd.to_numeric(df['age'], errors='coerce').fillna(-1)
        df['step'] = pd.to_numeric(df['step'], errors='coerce')
        df['hour_of_day'] = df['step'] % 24
        df['day'] = df['step'] // 24
        df['is_night'] = df['hour_of_day'].apply(lambda x: 1 if x <= 6 else 0)
        df['gender'] = df['gender'].map({'M': 0, 'F': 1}).fillna(-1)

        df['customer_txn_count'] = df.groupby('customer')['step'].transform('count')
        df['customer_avg_amt'] = df.groupby('customer')['amount'].transform('mean')
        df['customer_std_amt'] = df.groupby('customer')['amount'].transform('std').fillna(0)
        df['relative_amt'] = df['amount'] / df['customer_avg_amt']
        df['amt_zscore'] = (df['amount'] - df['customer_avg_amt']) / df['customer_std_amt'].replace(0, 1)

        df['merchant_txn_count'] = df.groupby('merchant')['step'].transform('count')
        df['merchant_avg_amt'] = df.groupby('merchant')['amount'].transform('mean')
        df['merchant_fraud_rate'] = df['merchant'].map(self.merchant_fraud_rate_map).fillna(0)

        df['category_txn_count'] = df.groupby('category')['step'].transform('count')
        df['category_avg_amt'] = df.groupby('category')['amount'].transform('mean')
        df['category_fraud_rate'] = df['category'].map(self.category_fraud_rate_map).fillna(0)

        df['log_amt'] = np.log1p(df['amount'])
        amt_threshold = df['amount'].quantile(0.95)
        df['is_high_amt'] = (df['amount'] > amt_threshold).astype(int)
        df['is_high_risk_age'] = df['age'].apply(lambda x: 1 if x in [2, 3] else 0)

        df['category'] = self.category_encoder.transform(df['category'])

        drop_cols = ['step', 'customer', 'merchant', 'amount']
        df.drop(columns=drop_cols, inplace=True, errors='ignore')

        return df

In [2]:
import torch
import torch.nn as nn

class MLPWithEmbeddings(nn.Module):
    def __init__(self, num_numerical, category_cardinality, emb_dim=8, hidden_sizes=[64, 32]):
        super().__init__()
        self.embedding = nn.Embedding(category_cardinality, emb_dim)
        self.fc_input_size = num_numerical + emb_dim

        layers = []
        in_dim = self.fc_input_size
        for h in hidden_sizes:
            layers.append(nn.Linear(in_dim, h))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(h))
            in_dim = h
        layers.append(nn.Linear(in_dim, 1))

        self.mlp = nn.Sequential(*layers)

    def forward(self, x_num, x_cat):
        emb = self.embedding(x_cat).squeeze(1)
        x = torch.cat([x_num, emb], dim=1)
        return torch.sigmoid(self.mlp(x)).squeeze()

In [5]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

# Load and clean
def load_clean_csv(path):
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.replace("'", "")
    df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)
    return df

# Prepare
df = load_clean_csv("dataset.csv")
X_raw = df.drop(columns=['fraud'])
y = df['fraud'].astype(int)

# Feature engineering
fe = FraudFeatureEngineerDL()
X_processed = fe.fit_transform(X_raw, y)

# Split
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Split into numerical & categorical
cat_col = 'category'
X_train_cat = torch.tensor(X_train[cat_col].values).long().unsqueeze(1)
X_val_cat = torch.tensor(X_val[cat_col].values).long().unsqueeze(1)

num_cols = X_train.drop(columns=[cat_col]).columns
scaler = StandardScaler()
X_train_num = torch.tensor(scaler.fit_transform(X_train[num_cols])).float()
X_val_num = torch.tensor(scaler.transform(X_val[num_cols])).float()

y_train_t = torch.tensor(y_train.values).float()
y_val_t = torch.tensor(y_val.values).float()

# Dataset & loader
train_ds = TensorDataset(X_train_num, X_train_cat, y_train_t)
val_ds = TensorDataset(X_val_num, X_val_cat, y_val_t)

train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=256)

# Model
model = MLPWithEmbeddings(num_numerical=X_train_num.shape[1], category_cardinality=X_train_cat.max().item()+1)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training loop
for epoch in range(10):
    model.train()
    for xb_num, xb_cat, yb in train_dl:
        preds = model(xb_num, xb_cat)
        loss = loss_fn(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_preds = model(X_val_num, X_val_cat)
        val_loss = loss_fn(val_preds, y_val_t)
        print(f"Epoch {epoch+1} - Val Loss: {val_loss.item():.4f}")

# Save model
torch.save({
    'model_state_dict': model.state_dict(),
    'scaler': scaler,
    'feature_engineer': fe
}, "mlp_fraud_model.pth")
print("✅ Model saved to mlp_fraud_model.pth")

  df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)


Epoch 1 - Val Loss: 0.0108
Epoch 2 - Val Loss: 0.0097
Epoch 3 - Val Loss: 0.0094
Epoch 4 - Val Loss: 0.0098
Epoch 5 - Val Loss: 0.0089
Epoch 6 - Val Loss: 0.0093
Epoch 7 - Val Loss: 0.0090
Epoch 8 - Val Loss: 0.0091
Epoch 9 - Val Loss: 0.0086
Epoch 10 - Val Loss: 0.0085
✅ Model saved to mlp_fraud_model.pth


In [6]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# --- Evaluate on Train Data ---
model.eval()
with torch.no_grad():
    train_preds = model(X_train_num, X_train_cat).numpy()
    train_labels = y_train_t.numpy()
    train_pred_labels = (train_preds > 0.5).astype(int)

print("\n📊 Training Metrics:")
print(classification_report(train_labels, train_pred_labels))
print("Confusion Matrix:\n", confusion_matrix(train_labels, train_pred_labels))
print("ROC-AUC Score:", roc_auc_score(train_labels, train_preds))


📊 Training Metrics:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    422963
         1.0       0.92      0.84      0.88      5179

    accuracy                           1.00    428142
   macro avg       0.96      0.92      0.94    428142
weighted avg       1.00      1.00      1.00    428142

Confusion Matrix:
 [[422605    358]
 [   854   4325]]
ROC-AUC Score: 0.999111838182553


In [8]:
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

# Load test data
def load_clean_csv(path):
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.replace("'", "")
    df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)
    return df

df_test = load_clean_csv("test_hsbc_df.csv")

# Determine if labels are present
has_labels = 'fraud' in df_test.columns
if has_labels:
    y_test = torch.tensor(df_test['fraud'].astype(int).values).float()
    X_test_raw = df_test.drop(columns=['fraud'])
else:
    X_test_raw = df_test

# Load model and preprocessor
checkpoint = torch.load("mlp_fraud_model.pth", weights_only=False)
fe = checkpoint['feature_engineer']
scaler = checkpoint['scaler']

X_test_processed = fe.transform(X_test_raw)

# Separate categorical and numerical
cat_col = 'category'
X_cat = torch.tensor(X_test_processed[cat_col].values).long().unsqueeze(1)
num_cols = X_test_processed.drop(columns=[cat_col]).columns
X_num = torch.tensor(scaler.transform(X_test_processed[num_cols])).float()

# Load model
model = MLPWithEmbeddings(num_numerical=X_num.shape[1], category_cardinality=X_cat.max().item()+1)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Predict
with torch.no_grad():
    y_proba = model(X_num, X_cat).numpy()
    y_pred = (y_proba > 0.5).astype(int)

# If labels are available
if has_labels:
    print("\n📊 Test Metrics:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))
else:
    print("\n⚠️ No labels found — showing top predictions:")
    print(pd.DataFrame({
        "predicted_fraud": y_pred,
        "fraud_probability": y_proba
    }).head(10))

# Save output
df_test['predicted_fraud'] = y_pred
df_test['fraud_probability'] = y_proba
df_test.to_csv("test_mlp_predictions.csv", index=False)
print("\n📄 Predictions saved to test_mlp_predictions.csv")

  df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)



📊 Test Metrics:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     80000
         1.0       0.93      0.66      0.77      1000

    accuracy                           1.00     81000
   macro avg       0.96      0.83      0.89     81000
weighted avg       1.00      1.00      0.99     81000

Confusion Matrix:
 [[79953    47]
 [  339   661]]
ROC-AUC Score: 0.9956460999999999

📄 Predictions saved to test_mlp_predictions.csv
