# TESTING CODE FOR Random Forest
### Using features- 

In [None]:
# TESTING MODEL CODE

import pandas as pd
import joblib

# Load test data
test_df = pd.read_csv("test.csv")       # ---> Replace test file here...

# Apply same preprocessing: remove quotes, drop unused, encode age, etc.
test_df = test_df.drop(columns=["zipcodeOri", "zipMerchant", "step"])
# Column-wise Series.map does what DataFrame.applymap did, without relying
# on the deprecated applymap API.
test_df = test_df.apply(
    lambda col: col.map(lambda x: x.replace("'", "") if isinstance(x, str) else x)
)
test_df['age'] = test_df['age'].replace('U', -1)
test_df['age'] = pd.to_numeric(test_df['age'], errors='coerce')

# Frequency encoding for 'customer' (counts are taken from this test file)
test_df["customer_freq"] = test_df["customer"].map(test_df["customer"].value_counts())
test_df = test_df.drop(columns=["customer"])

# Keep the ground-truth label (when the file has one) out of the model input;
# it is only used for the evaluation report at the end.
has_labels = "fraud" in test_df.columns
features = test_df.drop(columns=["fraud"]) if has_labels else test_df

# Load the model
# NOTE(review): joblib.load unpickles arbitrary objects — only load model
# files that come from a trusted source.
model = joblib.load("fraud_detection_model_RandomForest.pkl")

# Predict
preds = model.predict(features)
probs = model.predict_proba(features)[:, 1]

# Save or show predictions
test_df["predicted_fraud"] = preds
test_df["fraud_probability"] = probs
test_df.to_csv("test_predictions.csv", index=False)
print("✅ Predictions saved to test_predictions.csv")

from sklearn.metrics import classification_report

# The report needs true labels; it is skipped for unlabelled test files.
# It compares against this cell's own predictions rather than names that
# are never defined here.
if has_labels:
    print(classification_report(test_df["fraud"].astype(int), preds))

# TRAINING CODE FOR Random Forest
### Using features- 

In [None]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

class FraudFeatureEngineer(BaseEstimator, TransformerMixin):
    """Turn raw transaction rows into the feature frame fed to the model.

    ``fit`` learns, from labelled rows, the mean fraud rate per merchant and
    per category, plus the sorted list of category values used to encode
    ``category`` as integers.

    ``transform`` cleans the raw columns, derives per-customer, per-merchant
    and per-category aggregates, and returns only the engineered columns.
    Note that the count/mean/std aggregates and the 95th-percentile amount
    threshold are computed from the frame passed to ``transform`` itself,
    not from the training data.
    """

    def __init__(self):
        self.merchant_fraud_rate_map = {}
        self.category_fraud_rate_map = {}

    @staticmethod
    def _strip_quotes(value):
        """Remove single quotes from string cells; pass other values through."""
        return value.replace("'", "") if isinstance(value, str) else value

    def fit(self, X, y=None):
        """Learn fraud-rate lookups and the category vocabulary.

        Args:
            X: DataFrame with at least ``merchant`` and ``category`` columns.
            y: Fraud labels aligned row-by-row with ``X`` (Series, array or
                list).

        Returns:
            self

        Raises:
            ValueError: If ``y`` is not provided.
        """
        if y is None:
            raise ValueError("FraudFeatureEngineer.fit requires the fraud labels `y`.")

        df = X[['merchant', 'category']].copy()
        # Normalize keys exactly as transform() does, so the lookups match
        # even when fit() receives data whose values still carry quotes.
        for col in ('merchant', 'category'):
            df[col] = df[col].map(self._strip_quotes)
        # np.asarray accepts Series, ndarray and list labels alike and
        # ignores any index misalignment between X and y.
        df['fraud'] = np.asarray(y)

        self.merchant_fraud_rate_map = df.groupby('merchant')['fraud'].mean().to_dict()
        self.category_fraud_rate_map = df.groupby('category')['fraud'].mean().to_dict()
        # Sorted vocabulary: yields the same codes a LabelEncoder fitted on
        # the training data would produce.
        self.category_classes_ = sorted(df['category'].dropna().unique())
        return self

    def transform(self, X):
        """Build the engineered feature frame for ``X``.

        Args:
            X: Raw transaction DataFrame with ``step``, ``customer``, ``age``,
                ``gender``, ``merchant``, ``category`` and ``amount`` columns.

        Returns:
            A new DataFrame with the engineered columns; ``X`` is not
            modified. ``category`` is integer-encoded with the vocabulary
            learned in ``fit``; values not seen during ``fit`` get code -1.
        """
        df = X.copy()

        df = df.drop(columns=["zipcodeOri", "zipMerchant"], errors='ignore')
        # Column-wise Series.map replaces the deprecated DataFrame.applymap.
        df = df.apply(lambda col: col.map(self._strip_quotes))

        df['age'] = df['age'].replace('U', -1)
        df['age'] = pd.to_numeric(df['age'], errors='coerce')
        df['step'] = pd.to_numeric(df['step'], errors='coerce')
        df['hour_of_day'] = df['step'] % 24
        df['day'] = df['step'] // 24
        df['is_night'] = (df['hour_of_day'] <= 6).astype(int)
        df['gender'] = df['gender'].map({'M': 0, 'F': 1})

        by_customer = df.groupby('customer')
        df['customer_txn_count'] = by_customer['step'].transform('count')
        df['customer_avg_amt'] = by_customer['amount'].transform('mean')
        # std is NaN for customers with a single transaction -> treat as 0.
        df['customer_std_amt'] = by_customer['amount'].transform('std').fillna(0)
        df['relative_amt'] = df['amount'] / df['customer_avg_amt']
        # A zero std is replaced by 1 to avoid dividing by zero.
        df['amt_zscore'] = (df['amount'] - df['customer_avg_amt']) / df['customer_std_amt'].replace(0, 1)

        by_merchant = df.groupby('merchant')
        df['merchant_txn_count'] = by_merchant['step'].transform('count')
        df['merchant_avg_amt'] = by_merchant['amount'].transform('mean')
        # Merchants unseen during fit get a fraud rate of 0.
        df['merchant_fraud_rate'] = df['merchant'].map(self.merchant_fraud_rate_map).fillna(0)

        by_category = df.groupby('category')
        df['category_txn_count'] = by_category['step'].transform('count')
        df['category_avg_amt'] = by_category['amount'].transform('mean')
        # Categories unseen during fit get a fraud rate of 0.
        df['category_fraud_rate'] = df['category'].map(self.category_fraud_rate_map).fillna(0)

        df['log_amt'] = np.log1p(df['amount'])
        amt_threshold = df['amount'].quantile(0.95)
        df['is_high_amt'] = (df['amount'] > amt_threshold).astype(int)
        df['is_high_risk_age'] = df['age'].isin([2, 3]).astype(int)

        classes = getattr(self, 'category_classes_', None)
        if classes is None:
            # Instance fitted (or unpickled) before the vocabulary was
            # stored: keep the previous per-batch encoding.
            df['category'] = LabelEncoder().fit_transform(df['category'])
        else:
            # Encode with the vocabulary learned in fit so the same category
            # always maps to the same integer, whatever the batch contains.
            df['category'] = pd.Categorical(
                df['category'], categories=classes
            ).codes.astype('int64')

        drop_cols = ['step', 'customer', 'merchant', 'amount']
        df.drop(columns=drop_cols, inplace=True, errors='ignore')

        low_corr_features = ['gender', 'day', 'age', 'hour_of_day', 'is_night', 'is_high_risk_age']
        df.drop(columns=low_corr_features, inplace=True, errors='ignore')

        return df
    
# Clean loader
def load_clean_csv(path):
    """Read a CSV and strip stray single quotes from headers and string cells.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like
            object).

    Returns:
        DataFrame whose column names have single quotes and surrounding
        whitespace removed, and whose string cells have single quotes
        removed. Non-string cells are left untouched.
    """
    df = pd.read_csv(path)
    # Drop quotes first, then trim, so whitespace that sat inside the quotes
    # (e.g. "'amount '") is removed as well.
    df.columns = df.columns.str.replace("'", "", regex=False).str.strip()
    # Column-wise Series.map replaces the deprecated DataFrame.applymap.
    df = df.apply(
        lambda col: col.map(lambda x: x.replace("'", "") if isinstance(x, str) else x)
    )
    return df

# Read the training set through the quote-stripping loader
df_train = load_clean_csv("dataset.csv")

# Split the label column off from the predictors
y_train = df_train['fraud'].astype(int)
X_train = df_train.drop(columns=['fraud'])

# Feature engineering and the classifier live in one estimator so the
# same preprocessing is applied at fit and predict time
pipeline = Pipeline(steps=[
    ('features', FraudFeatureEngineer()),
    (
        'model',
        RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

# Train on the full training set
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline
joblib.dump(pipeline, "fraud_model.pkl")
print("✅ Model trained and saved to fraud_model.pkl")

# Score the data the model was trained on
y_pred_train = pipeline.predict(X_train)
y_proba_train = pipeline.predict_proba(X_train)[:, 1]  # probability of class 1

# Metrics below are computed on the training data only
print("\n📊 Training Report:")
print(classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))
print("ROC-AUC Score:", roc_auc_score(y_train, y_proba_train))

# TESTING CODE FOR Deep Learning MLP Embeddings Model
### Using features- 

In [None]:
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

import torch
import torch.nn as nn

class MLPWithEmbeddings(nn.Module):
    """MLP over numerical features concatenated with one category embedding.

    Args:
        num_numerical: Number of numerical input features.
        category_cardinality: Number of rows in the embedding table, i.e. the
            number of distinct category codes the model can look up.
        emb_dim: Size of the category embedding vector.
        hidden_sizes: Width of each hidden layer, in order. Each hidden layer
            is Linear -> ReLU -> BatchNorm1d; a final Linear maps to 1 output.
    """

    def __init__(self, num_numerical, category_cardinality, emb_dim=8, hidden_sizes=(64, 32)):
        super().__init__()
        self.embedding = nn.Embedding(category_cardinality, emb_dim)
        self.fc_input_size = num_numerical + emb_dim

        # Layer order is kept as Linear, ReLU, BatchNorm1d so the positions
        # inside nn.Sequential (and thus state_dict keys) stay the same.
        layers = []
        in_dim = self.fc_input_size
        for h in hidden_sizes:
            layers.append(nn.Linear(in_dim, h))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(h))
            in_dim = h
        layers.append(nn.Linear(in_dim, 1))

        self.mlp = nn.Sequential(*layers)

    def forward(self, x_num, x_cat):
        """Return the sigmoid output for each row.

        Args:
            x_num: Float tensor of shape (batch, num_numerical).
            x_cat: Long tensor of category codes, shape (batch, 1).

        Returns:
            Tensor of shape (batch,) with values in [0, 1].
        """
        emb = self.embedding(x_cat).squeeze(1)
        x = torch.cat([x_num, emb], dim=1)
        # Squeeze only the trailing size-1 output dim; a bare squeeze() would
        # also drop the batch dim for a single-row batch and return a scalar.
        return torch.sigmoid(self.mlp(x)).squeeze(-1)
    
# Load test data
def load_clean_csv(path):
    """Load a CSV and remove stray single quotes from headers and string cells.

    Args:
        path: File path or file-like object, passed straight to
            ``pandas.read_csv``.

    Returns:
        DataFrame with single quotes and surrounding whitespace removed from
        the column names and single quotes removed from every string cell;
        non-string cells are returned unchanged.
    """
    df = pd.read_csv(path)
    # Remove the quotes before trimming so whitespace enclosed by the quotes
    # does not survive in the column name.
    df.columns = df.columns.str.replace("'", "", regex=False).str.strip()
    # Per-column Series.map is used instead of the deprecated
    # DataFrame.applymap.
    df = df.apply(
        lambda col: col.map(lambda x: x.replace("'", "") if isinstance(x, str) else x)
    )
    return df

df_test = load_clean_csv("test.csv")         ### ---> Replace test file here...

# Determine if labels are present
has_labels = 'fraud' in df_test.columns
if has_labels:
    y_test = torch.tensor(df_test['fraud'].astype(int).values).float()
    X_test_raw = df_test.drop(columns=['fraud'])
else:
    X_test_raw = df_test

# Load model and preprocessor
# NOTE(review): weights_only=False unpickles arbitrary Python objects (needed
# here for the stored feature engineer and scaler) — only load checkpoints
# that come from a trusted source.
checkpoint = torch.load("mlp_fraud_model.pth", weights_only=False)
fe = checkpoint['feature_engineer']
scaler = checkpoint['scaler']

X_test_processed = fe.transform(X_test_raw)

# Separate categorical and numerical
cat_col = 'category'
X_cat = torch.tensor(X_test_processed[cat_col].values).long().unsqueeze(1)
num_cols = X_test_processed.drop(columns=[cat_col]).columns
X_num = torch.tensor(scaler.transform(X_test_processed[num_cols])).float()

# Load model
# The embedding table must have the size it was trained with. Deriving it
# from the test data (max code + 1) makes load_state_dict fail whenever the
# test file does not contain the highest category code.
n_categories, emb_dim = checkpoint['model_state_dict']['embedding.weight'].shape
if X_cat.numel() and (X_cat.min().item() < 0 or X_cat.max().item() >= n_categories):
    raise ValueError(
        f"Test data contains category codes outside the trained range "
        f"[0, {n_categories - 1}]"
    )
model = MLPWithEmbeddings(
    num_numerical=X_num.shape[1],
    category_cardinality=n_categories,
    emb_dim=emb_dim,
)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Predict
with torch.no_grad():
    # reshape(-1) keeps a 1-D array even when the test file has a single row
    y_proba = model(X_num, X_cat).numpy().reshape(-1)
    y_pred = (y_proba > 0.5).astype(int)

# If labels are available
if has_labels:
    print("\n📊 Test Metrics:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))
else:
    print("\n⚠️ No labels found — showing top predictions:")
    print(pd.DataFrame({
        "predicted_fraud": y_pred,
        "fraud_probability": y_proba
    }).head(10))

# Save output
df_test['predicted_fraud'] = y_pred
df_test['fraud_probability'] = y_proba
df_test.to_csv("test_mlp_predictions.csv", index=False)
print("\n📄 Predictions saved to test_mlp_predictions.csv")