In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

class FraudFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.merchant_fraud_rate_map = {}
        self.category_fraud_rate_map = {}

    def fit(self, X, y=None):
        df = X.copy()
        df['fraud'] = y.values

        self.merchant_fraud_rate_map = df.groupby('merchant')['fraud'].mean().to_dict()
        self.category_fraud_rate_map = df.groupby('category')['fraud'].mean().to_dict()
        return self

    def transform(self, X):
        df = X.copy()

        df = df.drop(columns=["zipcodeOri", "zipMerchant"], errors='ignore')
        df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)

        df['age'] = df['age'].replace('U', -1)
        df['age'] = pd.to_numeric(df['age'], errors='coerce')
        df['step'] = pd.to_numeric(df['step'], errors='coerce')
        df['hour_of_day'] = df['step'] % 24
        df['day'] = df['step'] // 24
        df['is_night'] = df['hour_of_day'].apply(lambda x: 1 if x <= 6 else 0)
        df['gender'] = df['gender'].map({'M': 0, 'F': 1})

        df['customer_txn_count'] = df.groupby('customer')['step'].transform('count')
        df['customer_avg_amt'] = df.groupby('customer')['amount'].transform('mean')
        df['customer_std_amt'] = df.groupby('customer')['amount'].transform('std').fillna(0)
        df['relative_amt'] = df['amount'] / df['customer_avg_amt']
        df['amt_zscore'] = (df['amount'] - df['customer_avg_amt']) / df['customer_std_amt'].replace(0, 1)

        df['merchant_txn_count'] = df.groupby('merchant')['step'].transform('count')
        df['merchant_avg_amt'] = df.groupby('merchant')['amount'].transform('mean')
        df['merchant_fraud_rate'] = df['merchant'].map(self.merchant_fraud_rate_map).fillna(0)

        df['category_txn_count'] = df.groupby('category')['step'].transform('count')
        df['category_avg_amt'] = df.groupby('category')['amount'].transform('mean')
        df['category_fraud_rate'] = df['category'].map(self.category_fraud_rate_map).fillna(0)

        df['log_amt'] = np.log1p(df['amount'])
        amt_threshold = df['amount'].quantile(0.95)
        df['is_high_amt'] = (df['amount'] > amt_threshold).astype(int)
        df['is_high_risk_age'] = df['age'].apply(lambda x: 1 if x in [2, 3] else 0)

        df['category'] = LabelEncoder().fit_transform(df['category'])

        drop_cols = ['step', 'customer', 'merchant', 'amount']
        df.drop(columns=drop_cols, inplace=True, errors='ignore')

        low_corr_features = ['gender', 'day', 'age', 'hour_of_day', 'is_night', 'is_high_risk_age']
        df.drop(columns=low_corr_features, inplace=True, errors='ignore')

        return df

In [65]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Clean loader
def load_clean_csv(path):
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.replace("'", "")
    df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)
    return df

# Load and clean training data
df_train = load_clean_csv("dataset.csv")

X_train = df_train.drop(columns=['fraud'])
y_train = df_train['fraud'].astype(int)

# Create pipeline
pipeline = Pipeline([
    ('features', FraudFeatureEngineer()),
    ('model', RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# Fit model
pipeline.fit(X_train, y_train)

# Save model
joblib.dump(pipeline, "fraud_model.pkl")
print("✅ Model trained and saved to fraud_model.pkl")

# Predict on training data
y_pred_train = pipeline.predict(X_train)
y_proba_train = pipeline.predict_proba(X_train)[:, 1]

# Report on training
print("\n📊 Training Report:")
print(classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))
print("ROC-AUC Score:", roc_auc_score(y_train, y_proba_train))

✅ Model trained and saved to fraud_model.pkl

📊 Training Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    528686
           1       1.00      1.00      1.00      6492

    accuracy                           1.00    535178
   macro avg       1.00      1.00      1.00    535178
weighted avg       1.00      1.00      1.00    535178

Confusion Matrix:
 [[528684      2]
 [     1   6491]]
ROC-AUC Score: 0.9999999841211086


In [67]:
# Load and clean test data
df_test = load_clean_csv("test_hsbc_df.csv")

if 'fraud' in df_test.columns:
    X_test = df_test.drop(columns=['fraud'])
    y_test = df_test['fraud'].astype(int)
    has_labels = True
else:
    X_test = df_test
    y_test = None
    has_labels = False

# Load saved model
pipeline = joblib.load("fraud_model.pkl")

# Predict
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Evaluation
if has_labels:
    print("\n📊 Test Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))
else:
    print("\n⚠️ No 'fraud' column in test.csv. Showing sample predictions:")
    print(pd.DataFrame({
        "Predicted Fraud": y_pred,
        "Fraud Probability": y_proba
    }).head(10))

# Save results
df_test['predicted_fraud'] = y_pred
df_test['fraud_probability'] = y_proba
df_test.to_csv("test_predictions.csv", index=False)
print("\n📄 Results saved to test_predictions.csv")


📊 Test Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     80000
           1       0.97      0.65      0.78      1000

    accuracy                           1.00     81000
   macro avg       0.98      0.83      0.89     81000
weighted avg       1.00      1.00      1.00     81000

Confusion Matrix:
 [[79979    21]
 [  346   654]]
ROC-AUC Score: 0.9936912625

📄 Results saved to test_predictions.csv


In [None]:
pip install tensorflow scikeras

In [None]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Clean loader
def load_clean_csv(path):
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.replace("'", "")
    df = df.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)
    return df

# Load and clean training data
df_train = load_clean_csv("dataset.csv")

X_train = df_train.drop(columns=['fraud'])
y_train = df_train['fraud'].astype(int)

In [36]:
from scikeras.wrappers import KerasClassifier
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping

# Custom Keras model function
def create_model(input_dim):
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=input_dim))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Get feature count BEFORE pipeline (important!)
input_dim = X_train.shape[1]

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Define the KerasClassifier *first*
clf = KerasClassifier(
    model=create_model,
    model__input_dim=input_dim,   # ✅ now it works
    epochs=30,
    batch_size=1024,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

In [37]:
pipeline = Pipeline([
    ('features', FraudFeatureEngineer()),
    ('model', clf)
])

In [None]:
# Fit model
pipeline.fit(X_train, y_train)

# Save model
joblib.dump(pipeline, "fraud_model_v3.pkl")
print("✅ Model trained and saved to fraud_model_v3.pkl")

# Predict on training data
y_pred_train = pipeline.predict(X_train)
y_proba_train = pipeline.predict_proba(X_train)[:, 1]

# Report on training
print("\n📊 Training Report:")
print(classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))
print("ROC-AUC Score:", roc_auc_score(y_train, y_proba_train))

In [None]:
##############

In [74]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.base import BaseEstimator, ClassifierMixin

class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

class TorchClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, input_dim=None, lr=0.001, epochs=10, batch_size=64, verbose=False):
        self.input_dim = input_dim
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = None

    def fit(self, X, y):
        X_tensor = torch.tensor(X.values, dtype=torch.float32).to(self.device)
        y_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1).to(self.device)

        if self.input_dim is None:
            self.input_dim = X.shape[1]

        self.model = SimpleNN(self.input_dim).to(self.device)
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        criterion = nn.BCELoss()

        dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
        loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            self.model.train()
            for batch_X, batch_y in loader:
                optimizer.zero_grad()
                outputs = self.model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

            if self.verbose:
                print(f"Epoch {epoch+1}/{self.epochs} - Loss: {loss.item():.4f}")

        return self

    def predict_proba(self, X):
        self.model.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X.values, dtype=torch.float32).to(self.device)
            probs = self.model(X_tensor).cpu().numpy()
        return np.hstack([1 - probs, probs])  # shape: [N, 2]

    def predict(self, X):
        proba = self.predict_proba(X)[:, 1]
        return (proba >= 0.5).astype(int)

In [75]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('features', FraudFeatureEngineer()),
    ('model', TorchClassifier(epochs=20, verbose=True))
])