Ethereum Fraud Detection Using Graph Neural Networks (GNN)

In [None]:
!pip install catboost



In [None]:
import os, warnings, numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from datetime import datetime
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from imblearn.combine import SMOTEENN

# Runtime configuration: silence library warnings and pin the global NumPy seed.
warnings.filterwarnings("ignore")
SEED = 42
np.random.seed(SEED)

# Every persisted artifact (features, scaler, model, metadata) is written here.
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Load data
DATA_PATH = "transaction_dataset.csv"
df = pd.read_csv(DATA_PATH)

# The label is the first known candidate name found in the file; when none of
# them is present, the last column is used instead.
LABEL_CANDIDATES = ['FLAG', 'label', 'Label', 'fraud', 'isFraud']
_present_labels = [name for name in LABEL_CANDIDATES if name in df.columns]
LABEL_COL = _present_labels[0] if _present_labels else df.columns[-1]
df[LABEL_COL] = df[LABEL_COL].astype(int)

# Preprocessing
# Remove identifier columns (when present), exact duplicate rows, and every row
# that contains a missing or infinite value.
id_cols = ['Address', 'Unnamed: 0', 'Index']
df = (
    df.drop(columns=[col for col in id_cols if col in df.columns])
      .drop_duplicates()
      .replace([np.inf, -np.inf], np.nan)
      .dropna()
)

# Separate the target from the feature matrix.
y = df[LABEL_COL].copy()
X = df.drop(columns=[LABEL_COL]).copy()

# Integer-encode text/categorical columns, then discard constant columns.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    X[col] = pd.factorize(X[col])[0]
X = X.loc[:, X.nunique() > 1]

# Train/Validation/Test Split
# Two stratified splits: first hold out the test set, then carve the validation
# set out of the remainder. 0.17647 of the remaining 85% is ~15% of all rows.
TEST_FRACTION = 0.15
VAL_FRACTION_OF_REMAINDER = 0.17647

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=TEST_FRACTION,
    stratify=y,
    random_state=SEED,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=VAL_FRACTION_OF_REMAINDER,
    stratify=y_temp,
    random_state=SEED,
)

# Feature selection
# Step 1: keep the K features with the highest mutual information with the label.
# mutual_info_classif is randomised; its seed is pinned here so the selected
# feature set does not depend on how much of the global NumPy random stream
# earlier code has consumed. (The selector itself is not persisted, so a lambda
# score function is fine.)
K = min(50, X_train.shape[1])
selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=SEED),
    k=K,
)
selector.fit(X_train, y_train)
selected_features = X_train.columns[selector.get_support()].tolist()

# Step 2: greedy correlation pruning, using training data only.
# A feature is dropped only when it is highly correlated with a feature that is
# actually KEPT. Comparing against every earlier column instead would also drop
# features whose only correlated partner has itself been removed.
CORR_THRESHOLD = 0.9
corr_matrix = X_train[selected_features].corr().abs()
kept_features = []
for feature in selected_features:
    # NaN correlations (e.g. a column constant within the training split)
    # compare as False, so such features are kept.
    if not (corr_matrix.loc[feature, kept_features] > CORR_THRESHOLD).any():
        kept_features.append(feature)
high_corr = [feature for feature in selected_features if feature not in kept_features]
selected_features = kept_features

# Apply the same column subset to every split and persist it for inference.
X_train_sel = X_train[selected_features]
X_val_sel = X_val[selected_features]
X_test_sel = X_test[selected_features]
joblib.dump(selected_features, os.path.join(MODEL_DIR, "selected_features.pkl"))

# Scaling
# The scaler is fitted on the training split only and then applied to the
# validation and test splits. Each scaled frame keeps the row index of its
# source split so it stays aligned with y_train / y_val / y_test; without it,
# the frames get a fresh 0..n-1 index and any label-based boolean mask or
# index-aligned operation against the targets would pair the wrong rows.
scaler = RobustScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_sel), columns=selected_features, index=X_train_sel.index
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val_sel), columns=selected_features, index=X_val_sel.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_sel), columns=selected_features, index=X_test_sel.index
)
joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))

# Handle class imbalance
# Resample the scaled TRAINING split only; validation and test keep their
# original class distribution.
smote_enn = SMOTEENN(random_state=SEED)
_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
X_train_bal_np, y_train_bal = _resampled
X_train_bal = pd.DataFrame(X_train_bal_np, columns=selected_features)

# Base models
# `use_label_encoder` is intentionally not passed: the flag is deprecated in
# XGBoost and no longer a recognised parameter in current releases. The labels
# are already 0/1 integers, so no label encoding is needed either way.
xgb_params = dict(
    n_estimators=500, max_depth=10, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, random_state=SEED,
    eval_metric='logloss',
)
lgb_params = dict(n_estimators=500, learning_rate=0.05, random_state=SEED)

base_models = {
    'XGB': xgb.XGBClassifier(**xgb_params),
    'LGB': lgb.LGBMClassifier(**lgb_params),
    'RF': RandomForestClassifier(n_estimators=400, max_depth=20, class_weight='balanced', random_state=SEED, n_jobs=-1),
    'GB': GradientBoostingClassifier(n_estimators=300, max_depth=8, learning_rate=0.05, random_state=SEED)
}

# Fit every base model on the resampled training data. fit() returns the
# estimator itself, so fitted_models maps each name to the same (now fitted)
# object held in base_models.
fitted_models = {}
for name, clf in base_models.items():
    fitted_models[name] = clf.fit(X_train_bal, y_train_bal)

# Evaluate base models
# Score each fitted model on the untouched validation split. Label metrics use
# the model's default decision; AUC uses the positive-class probability.
results = {}
for name, clf in fitted_models.items():
    y_val_pred = clf.predict(X_val_scaled)
    y_val_prob = clf.predict_proba(X_val_scaled)[:, 1]

    scores = {'accuracy': accuracy_score(y_val, y_val_pred)}
    for metric_name, metric_fn in (('precision', precision_score),
                                   ('recall', recall_score),
                                   ('f1', f1_score)):
        scores[metric_name] = metric_fn(y_val, y_val_pred, zero_division=0)
    scores['auc'] = roc_auc_score(y_val, y_val_prob)
    results[name] = scores

# The best single model is the one with the highest validation F1.
best_base_name = max(results, key=lambda model_name: results[model_name]['f1'])
best_base_model = fitted_models[best_base_name]

# Stacking ensemble
# Stack the two base models with the highest validation F1 under a
# class-weighted logistic-regression meta-learner.
ranked_by_f1 = sorted(results.items(), key=lambda item: item[1]['f1'], reverse=True)
top_models = ranked_by_f1[:2]
estimators = [(model_name, fitted_models[model_name]) for model_name, _ in top_models]

meta_learner = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=SEED)
stack_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    passthrough=False,
)
stack_clf.fit(X_train_bal, y_train_bal)

# Keep the stack only when it strictly beats the best single model on validation F1.
y_stack_val_pred = stack_clf.predict(X_val_scaled)
stack_f1 = f1_score(y_val, y_stack_val_pred)
if stack_f1 > results[best_base_name]['f1']:
    final_model, final_model_name = stack_clf, 'Stacking'
else:
    final_model, final_model_name = best_base_model, best_base_name
joblib.dump(final_model, os.path.join(MODEL_DIR, "final_model.pkl"))

# Optimize threshold
# Scan an evenly spaced grid of decision thresholds and keep the one with the
# best validation F1.
#  * The grid spans 0.05-0.95 so the optimum is not clipped by the search range
#    (a narrow grid can only ever report its own edge as "optimal").
#  * np.linspace + rounding gives exact two-decimal thresholds; np.arange with a
#    float step accumulates representation error.
#  * When several thresholds tie on F1 (common when the model outputs
#    probabilities near 0 or 1), the one closest to 0.5 is chosen rather than
#    whichever happens to be scanned first.
y_val_prob = final_model.predict_proba(X_val_scaled)[:, 1]
thresholds = np.round(np.linspace(0.05, 0.95, 91), 2)
f1_by_threshold = np.array([
    f1_score(y_val, (y_val_prob >= t).astype(int), zero_division=0)
    for t in thresholds
])

best_f1_thresh = float(f1_by_threshold.max())
if best_f1_thresh > 0:
    tied_thresholds = thresholds[np.isclose(f1_by_threshold, best_f1_thresh)]
    best_thresh = float(tied_thresholds[np.abs(tied_thresholds - 0.5).argmin()])
else:
    # No threshold produced a positive F1: fall back to the conventional 0.5.
    best_thresh = 0.5
print(f"Optimal threshold based on val F1: {best_thresh:.2f}")

# Test evaluation
# Apply the validation-tuned threshold to the held-out test split and report
# label-based metrics, AUC, the confusion matrix and a per-class report.
y_test_prob = final_model.predict_proba(X_test_scaled)[:, 1]
y_test_pred = (y_test_prob >= best_thresh).astype(int)

test_acc, test_prec, test_rec, test_f1 = (
    metric_fn(y_test, y_test_pred)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
test_auc = roc_auc_score(y_test, y_test_prob)
cm = confusion_matrix(y_test, y_test_pred)

print(f"Test Acc: {test_acc:.4f} | Prec: {test_prec:.4f} | Rec: {test_rec:.4f} | F1: {test_f1:.4f} | AUC: {test_auc:.4f}")
print("Confusion matrix:\n", cm)
per_class_report = classification_report(y_test, y_test_pred, target_names=['Clean','Fraud'])
print("\nClassification report:\n", per_class_report)

# Save metadata
# Persist the chosen model's name, its decision threshold and the test metrics
# (as plain Python floats) together with the time of this run.
metadata = dict(
    final_model_name=final_model_name,
    optimal_threshold=float(best_thresh),
    test_accuracy=float(test_acc),
    test_precision=float(test_prec),
    test_recall=float(test_rec),
    test_f1=float(test_f1),
    test_auc=float(test_auc),
    timestamp=datetime.now().isoformat(),
)
metadata_path = os.path.join(MODEL_DIR, "metadata.pkl")
joblib.dump(metadata, metadata_path)

# Inference function
def predict_fraud(transaction_data):
    """Score one or more transactions with the persisted fraud model.

    Loads the final model, the fitted scaler, the selected-feature list and the
    metadata (for the decision threshold) from ``MODEL_DIR`` on every call,
    scales the selected features and returns hard predictions, fraud
    probabilities and a coarse risk label per row.

    Parameters
    ----------
    transaction_data : dict or pandas.DataFrame
        A single transaction as a ``{feature: value}`` dict, or a DataFrame with
        one row per transaction. Selected features that are absent from the
        input are filled with 0 before scaling; extra columns are ignored.
        The caller's object is never modified.
        NOTE(review): no categorical encoding is applied here, so any feature
        that was integer-encoded during training must already be encoded the
        same way in the input -- confirm against the training pipeline.

    Returns
    -------
    dict
        ``'predictions'``: int array, 1 where probability >= stored threshold;
        ``'fraud_probability'``: positive-class probability per row;
        ``'risk_level'``: list of 'Very Low' (<0.25), 'Low' (<0.5),
        'Medium' (<0.75) or 'High' labels. These cut-offs are fixed and
        independent of the stored decision threshold.

    Raises
    ------
    ValueError
        If ``transaction_data`` is neither a dict nor a DataFrame.
    """
    # joblib.load unpickles arbitrary objects: only load artifacts written by
    # this notebook (or another trusted source).
    model = joblib.load(os.path.join(MODEL_DIR, "final_model.pkl"))
    scaler = joblib.load(os.path.join(MODEL_DIR, "scaler.pkl"))
    features = joblib.load(os.path.join(MODEL_DIR, "selected_features.pkl"))
    metadata = joblib.load(os.path.join(MODEL_DIR, "metadata.pkl"))
    threshold = metadata['optimal_threshold']

    if isinstance(transaction_data, dict):
        transaction_data = pd.DataFrame([transaction_data])
    if not isinstance(transaction_data, pd.DataFrame):
        raise ValueError("Input must be DataFrame or dict")

    # reindex returns a NEW frame limited to the selected features, creating any
    # missing column filled with 0, so the caller's DataFrame is left untouched.
    X_new = transaction_data.reindex(columns=features, fill_value=0)

    # Keep the feature names on the scaled data: the model was fitted on a
    # named DataFrame, and a bare ndarray would drop that column information.
    X_new_scaled = pd.DataFrame(scaler.transform(X_new), columns=features, index=X_new.index)

    probs = model.predict_proba(X_new_scaled)[:,1]
    preds = (probs >= threshold).astype(int)
    risk_levels = ['Very Low' if p<0.25 else 'Low' if p<0.5 else 'Medium' if p<0.75 else 'High' for p in probs]

    return {'predictions': preds, 'fraud_probability': probs, 'risk_level': risk_levels}

Optimal threshold based on val F1: 0.30
Test Acc: 0.9971 | Prec: 1.0000 | Rec: 0.9819 | F1: 0.9909 | AUC: 0.9985
Confusion matrix:
 [[863   0]
 [  3 163]]

Classification report:
               precision    recall  f1-score   support

       Clean       1.00      1.00      1.00       863
       Fraud       1.00      0.98      0.99       166

    accuracy                           1.00      1029
   macro avg       1.00      0.99      0.99      1029
weighted avg       1.00      1.00      1.00      1029



In [None]:
# Build a synthetic inference set: every selected feature is drawn from a normal
# distribution, once for "clean" rows and once for "fraud" rows.
np.random.seed(100)
n_clean, n_fraud = 100, 100
top_features = selected_features[:5]

# Per-feature statistics of the fraud rows in the (unscaled) training split,
# or None when the training split contains no fraud rows.
fraud_mask = y_train == 1
fraud_stats = X_train.loc[fraud_mask, selected_features] if fraud_mask.any() else None

synthetic_data = {}
for feature in selected_features:
    # Clean rows follow the mean/std of the whole training column.
    train_column = X_train[feature]
    mean_val, std_val = train_column.mean(), train_column.std()
    clean_samples = np.random.normal(mean_val, std_val, n_clean)

    # Fraud rows start from the fraud-only statistics when available,
    # otherwise from inflated overall statistics.
    if fraud_stats is not None and feature in fraud_stats.columns:
        fraud_column = fraud_stats[feature]
        fraud_mean, fraud_std = fraud_column.mean(), fraud_column.std()
    else:
        fraud_mean, fraud_std = mean_val*1.5, std_val*2

    # The first five selected features are shifted/widened more than the rest.
    inflation = 1.5 if feature in top_features else 1.2
    fraud_samples = np.random.normal(fraud_mean*inflation, fraud_std*inflation, n_fraud)

    synthetic_data[feature] = np.concatenate([clean_samples, fraud_samples])

synthetic_df = pd.DataFrame(synthetic_data)

def add_noise(df, noise_level=0.05):
    """Return a copy of ``df`` with Gaussian noise scaled per column.

    Each cell receives noise drawn from N(0, noise_level) (global NumPy random
    state) multiplied by the sample standard deviation of its column, so the
    perturbation is proportional to how much that column varies within ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to perturb; it is not modified.
    noise_level : float, default 0.05
        Standard deviation of the noise, expressed as a fraction of each
        column's standard deviation.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape, index and columns as ``df``.
    """
    noise = np.random.normal(0, noise_level, df.shape)
    # The sample std of a column with fewer than two values is NaN; treat it as
    # 0 (no spread -> no noise) so a single-row frame is returned unchanged
    # instead of being turned entirely into NaN.
    col_stds = df.std().fillna(0.0).to_numpy().reshape(1, -1)
    return df + noise * col_stds

# Perturb the two halves separately: light noise on the clean rows, heavier
# noise on the fraud rows (clean rows first, so the random draws keep their order).
noisy_clean_rows = add_noise(synthetic_df.iloc[:n_clean], 0.03)
synthetic_df.iloc[:n_clean] = noisy_clean_rows
noisy_fraud_rows = add_noise(synthetic_df.iloc[n_clean:], 0.1)
synthetic_df.iloc[n_clean:] = noisy_fraud_rows

# Ground truth by construction: the first n_clean rows are clean, the rest fraud.
synthetic_labels = np.array([0]*n_clean + [1]*n_fraud)
synthetic_df['isFraud'] = synthetic_labels

# Persist the synthetic set next to the model artifacts.
SYNTHETIC_PATH = os.path.join(MODEL_DIR, "synthetic_test_dataset.csv")
synthetic_df.to_csv(SYNTHETIC_PATH, index=False)
print(f"Synthetic dataset saved to: {SYNTHETIC_PATH}")

Synthetic dataset saved to: models/synthetic_test_dataset.csv


In [None]:
# Score the synthetic set with the fitted scaler, the final model and the
# validation-tuned threshold.
X_synth_scaled = scaler.transform(synthetic_df[selected_features])
y_synth_prob = final_model.predict_proba(X_synth_scaled)[:,1]
y_synth_pred = (y_synth_prob >= best_thresh).astype(int)

n_total = n_clean + n_fraud
n_fraud_pred = y_synth_pred.sum()
n_clean_pred = n_total - n_fraud_pred

# Rows are ordered clean-first, so the two halves can be judged by position.
pred_on_clean = y_synth_pred[:n_clean]
pred_on_fraud = y_synth_pred[n_clean:]
true_negatives = (pred_on_clean == 0).sum()
true_positives = (pred_on_fraud == 1).sum()

synthetic_accuracy = (true_negatives + true_positives) / n_total
precision = true_positives / max(n_fraud_pred, 1)  # guard: no positive predictions
recall = true_positives / n_fraud

# Bucket each probability into a coarse risk label.
risk_levels = []
for p in y_synth_prob:
    if p < 0.25:
        level = 'Very Low'
    elif p < 0.5:
        level = 'Low'
    elif p < 0.75:
        level = 'Medium'
    else:
        level = 'High'
    risk_levels.append(level)
risk_counts = pd.Series(risk_levels).value_counts(normalize=True) * 100

print("Synthetic Inference Dataset Statistics:")
print(f"Total Transactions: {n_total}")
print(f"Fraud Predictions: {n_fraud_pred} ({n_fraud_pred/n_total*100:.1f}%)")
print(f"Clean Predictions: {n_clean_pred} ({n_clean_pred/n_total*100:.1f}%)")
print(f"Synthetic Accuracy: {synthetic_accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print("Risk Breakdown:")
for r, pct in risk_counts.items():
    print(f"  {r:<9}: {pct:.1f}%")

Synthetic Inference Dataset Statistics:
Total Transactions: 200
Fraud Predictions: 115 (57.5%)
Clean Predictions: 85 (42.5%)
Synthetic Accuracy: 81.50%
Precision: 77.39%
Recall: 89.00%
Risk Breakdown:
  High     : 52.0%
  Very Low : 42.5%
  Medium   : 4.5%
  Low      : 1.0%
