Fraud Detection in Ethereum Dataset using XGBoost Model

In [1]:
# FOR GENERATING REAL TIME DATASET

import pandas as pd
import numpy as np
from datetime import datetime

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples
n_samples = 10000
fraud_ratio = 0.55  # 22% fraud (similar to original dataset)
n_fraud = int(n_samples * fraud_ratio)
n_clean = n_samples - n_fraud

print("=" * 80)
print("ETHEREUM FRAUD DETECTION - DUMMY DATASET GENERATOR")
print("=" * 80)
print(f"Generating {n_samples:,} transactions...")
print(f"   Clean: {n_clean:,} ({(n_clean/n_samples)*100:.1f}%)")
print(f"   Fraud: {n_fraud:,} ({(n_fraud/n_samples)*100:.1f}%)")

# Generate Ethereum addresses (dummy format)
def generate_eth_address(n):
    addresses = []
    for i in range(n):
        address = '0x' + ''.join(np.random.choice(list('0123456789abcdef'), 40))
        addresses.append(address)
    return addresses

# Create base features for CLEAN transactions
clean_data = {
    'Unnamed: 0': np.arange(n_clean),
    'Index': np.arange(1, n_clean + 1),
    'Address': generate_eth_address(n_clean),
    'FLAG': np.zeros(n_clean, dtype=int),

    # Transaction timing patterns (clean transactions are more regular)
    'Avg min between sent tnx': np.random.exponential(5000, n_clean),
    'Avg min between received tnx': np.random.exponential(8000, n_clean),
    'Time Diff between first and last (Mins)': np.random.uniform(10000, 1500000, n_clean),

    # Transaction counts (clean = moderate activity)
    'Sent tnx': np.random.poisson(50, n_clean),
    'Received Tnx': np.random.poisson(80, n_clean),
    'Number of Created Contracts': np.random.poisson(1, n_clean),
    'Unique Received From Addresses': np.random.poisson(15, n_clean),
    'Unique Sent To Addresses': np.random.poisson(8, n_clean),

    # Transaction values (clean = normal distribution)
    'min value received': np.random.exponential(0.1, n_clean),
    'max value received': np.random.exponential(50, n_clean) + 10,
    'avg val received': np.random.exponential(5, n_clean) + 1,
    'min val sent': np.random.exponential(0.1, n_clean),
    'max val sent': np.random.exponential(40, n_clean) + 5,
    'avg val sent': np.random.exponential(4, n_clean) + 0.5,

    # Contract values
    'min value sent to contract': np.random.exponential(0.05, n_clean),
    'max val sent to contract': np.random.exponential(20, n_clean),
    'avg value sent to contract': np.random.exponential(2, n_clean),

    # Total transaction metrics
    'total transactions (including tnx to create contract': np.random.poisson(130, n_clean),
    'total Ether sent': np.random.exponential(200, n_clean) + 10,
    'total ether received': np.random.exponential(300, n_clean) + 20,
    'total ether sent contracts': np.random.exponential(50, n_clean),
    'total ether balance': np.random.uniform(-50, 500, n_clean),
}

# ERC20 Token features (with some missing values)
erc20_mask = np.random.random(n_clean) > 0.08  # 92% have ERC20 data

clean_data[' Total ERC20 tnxs'] = np.where(erc20_mask, np.random.poisson(10, n_clean), np.nan)
clean_data[' ERC20 total Ether received'] = np.where(erc20_mask, np.random.exponential(100000, n_clean), np.nan)
clean_data[' ERC20 total ether sent'] = np.where(erc20_mask, np.random.exponential(80000, n_clean), np.nan)
clean_data[' ERC20 total Ether sent contract'] = np.where(erc20_mask, np.random.exponential(10000, n_clean), np.nan)
clean_data[' ERC20 uniq sent addr'] = np.where(erc20_mask, np.random.poisson(3, n_clean), np.nan)
clean_data[' ERC20 uniq rec addr'] = np.where(erc20_mask, np.random.poisson(8, n_clean), np.nan)
clean_data[' ERC20 uniq sent addr.1'] = np.where(erc20_mask, np.random.poisson(3, n_clean), np.nan)
clean_data[' ERC20 uniq rec contract addr'] = np.where(erc20_mask, np.random.poisson(2, n_clean), np.nan)

clean_data[' ERC20 avg time between sent tnx'] = np.where(erc20_mask, np.random.exponential(15000, n_clean), np.nan)
clean_data[' ERC20 avg time between rec tnx'] = np.where(erc20_mask, np.random.exponential(12000, n_clean), np.nan)
clean_data[' ERC20 avg time between rec 2 tnx'] = np.where(erc20_mask, np.random.exponential(11000, n_clean), np.nan)
clean_data[' ERC20 avg time between contract tnx'] = np.where(erc20_mask, np.random.exponential(20000, n_clean), np.nan)

clean_data[' ERC20 min val rec'] = np.where(erc20_mask, np.random.exponential(1000, n_clean), np.nan)
clean_data[' ERC20 max val rec'] = np.where(erc20_mask, np.random.exponential(500000, n_clean), np.nan)
clean_data[' ERC20 avg val rec'] = np.where(erc20_mask, np.random.exponential(50000, n_clean), np.nan)
clean_data[' ERC20 min val sent'] = np.where(erc20_mask, np.random.exponential(800, n_clean), np.nan)
clean_data[' ERC20 max val sent'] = np.where(erc20_mask, np.random.exponential(400000, n_clean), np.nan)
clean_data[' ERC20 avg val sent'] = np.where(erc20_mask, np.random.exponential(40000, n_clean), np.nan)

clean_data[' ERC20 min val sent contract'] = np.zeros(n_clean)
clean_data[' ERC20 max val sent contract'] = np.zeros(n_clean)
clean_data[' ERC20 avg val sent contract'] = np.zeros(n_clean)

clean_data[' ERC20 uniq sent token name'] = np.where(erc20_mask, np.random.poisson(2, n_clean), np.nan)
clean_data[' ERC20 uniq rec token name'] = np.where(erc20_mask, np.random.poisson(6, n_clean), np.nan)

# Token types (categorical)
token_types = ['USDT', 'USDC', 'DAI', 'LINK', 'UNI', 'AAVE', 'WETH', 'MATIC', None]
clean_data[' ERC20 most sent token type'] = np.random.choice(token_types, n_clean, p=[0.15, 0.12, 0.10, 0.08, 0.08, 0.07, 0.10, 0.05, 0.25])
clean_data[' ERC20_most_rec_token_type'] = np.random.choice(token_types, n_clean, p=[0.18, 0.14, 0.12, 0.09, 0.09, 0.08, 0.12, 0.06, 0.12])

# Create base features for FRAUD transactions
fraud_data = {
    'Unnamed: 0': np.arange(n_clean, n_samples),
    'Index': np.arange(n_clean + 1, n_samples + 1),
    'Address': generate_eth_address(n_fraud),
    'FLAG': np.ones(n_fraud, dtype=int),

    # Fraud transactions have different patterns:
    # 1. Very short time between transactions (automated bots)
    'Avg min between sent tnx': np.random.exponential(100, n_fraud),  # Much shorter
    'Avg min between received tnx': np.random.exponential(500, n_fraud),  # Much shorter
    'Time Diff between first and last (Mins)': np.random.uniform(100, 500000, n_fraud),  # Shorter lifespan

    # 2. High transaction volume (money laundering patterns)
    'Sent tnx': np.random.poisson(200, n_fraud),  # More transactions
    'Received Tnx': np.random.poisson(250, n_fraud),  # More transactions
    'Number of Created Contracts': np.random.poisson(10, n_fraud),  # More contracts
    'Unique Received From Addresses': np.random.poisson(50, n_fraud),  # More sources
    'Unique Sent To Addresses': np.random.poisson(60, n_fraud),  # More destinations

    # 3. Unusual value patterns (round numbers, very small/large amounts)
    'min value received': np.random.exponential(0.001, n_fraud),  # Very small
    'max value received': np.random.exponential(200, n_fraud) + 100,  # Higher
    'avg val received': np.random.exponential(15, n_fraud) + 5,  # Higher average
    'min val sent': np.random.exponential(0.001, n_fraud),  # Very small
    'max val sent': np.random.exponential(150, n_fraud) + 80,  # Higher
    'avg val sent': np.random.exponential(12, n_fraud) + 3,  # Higher average

    # Contract interaction (fraud often involves contracts)
    'min value sent to contract': np.random.exponential(0.01, n_fraud),
    'max val sent to contract': np.random.exponential(100, n_fraud) + 20,
    'avg value sent to contract': np.random.exponential(10, n_fraud) + 2,

    # Total metrics (higher volume for fraud)
    'total transactions (including tnx to create contract': np.random.poisson(450, n_fraud),
    'total Ether sent': np.random.exponential(1000, n_fraud) + 100,
    'total ether received': np.random.exponential(1200, n_fraud) + 150,
    'total ether sent contracts': np.random.exponential(300, n_fraud) + 50,
    'total ether balance': np.random.uniform(-200, 200, n_fraud),  # Often low/negative
}

# ERC20 for fraud (fraud accounts more likely to have ERC20 activity)
erc20_mask_fraud = np.random.random(n_fraud) > 0.05  # 95% have ERC20 data

fraud_data[' Total ERC20 tnxs'] = np.where(erc20_mask_fraud, np.random.poisson(50, n_fraud), np.nan)
fraud_data[' ERC20 total Ether received'] = np.where(erc20_mask_fraud, np.random.exponential(1000000, n_fraud), np.nan)
fraud_data[' ERC20 total ether sent'] = np.where(erc20_mask_fraud, np.random.exponential(900000, n_fraud), np.nan)
fraud_data[' ERC20 total Ether sent contract'] = np.where(erc20_mask_fraud, np.random.exponential(200000, n_fraud), np.nan)
fraud_data[' ERC20 uniq sent addr'] = np.where(erc20_mask_fraud, np.random.poisson(15, n_fraud), np.nan)
fraud_data[' ERC20 uniq rec addr'] = np.where(erc20_mask_fraud, np.random.poisson(25, n_fraud), np.nan)
fraud_data[' ERC20 uniq sent addr.1'] = np.where(erc20_mask_fraud, np.random.poisson(15, n_fraud), np.nan)
fraud_data[' ERC20 uniq rec contract addr'] = np.where(erc20_mask_fraud, np.random.poisson(8, n_fraud), np.nan)

fraud_data[' ERC20 avg time between sent tnx'] = np.where(erc20_mask_fraud, np.random.exponential(1000, n_fraud), np.nan)
fraud_data[' ERC20 avg time between rec tnx'] = np.where(erc20_mask_fraud, np.random.exponential(800, n_fraud), np.nan)
fraud_data[' ERC20 avg time between rec 2 tnx'] = np.where(erc20_mask_fraud, np.random.exponential(750, n_fraud), np.nan)
fraud_data[' ERC20 avg time between contract tnx'] = np.where(erc20_mask_fraud, np.random.exponential(2000, n_fraud), np.nan)

fraud_data[' ERC20 min val rec'] = np.where(erc20_mask_fraud, np.random.exponential(5000, n_fraud), np.nan)
fraud_data[' ERC20 max val rec'] = np.where(erc20_mask_fraud, np.random.exponential(5000000, n_fraud), np.nan)
fraud_data[' ERC20 avg val rec'] = np.where(erc20_mask_fraud, np.random.exponential(500000, n_fraud), np.nan)
fraud_data[' ERC20 min val sent'] = np.where(erc20_mask_fraud, np.random.exponential(4000, n_fraud), np.nan)
fraud_data[' ERC20 max val sent'] = np.where(erc20_mask_fraud, np.random.exponential(4000000, n_fraud), np.nan)
fraud_data[' ERC20 avg val sent'] = np.where(erc20_mask_fraud, np.random.exponential(400000, n_fraud), np.nan)

fraud_data[' ERC20 min val sent contract'] = np.zeros(n_fraud)
fraud_data[' ERC20 max val sent contract'] = np.zeros(n_fraud)
fraud_data[' ERC20 avg val sent contract'] = np.zeros(n_fraud)

fraud_data[' ERC20 uniq sent token name'] = np.where(erc20_mask_fraud, np.random.poisson(8, n_fraud), np.nan)
fraud_data[' ERC20 uniq rec token name'] = np.where(erc20_mask_fraud, np.random.poisson(20, n_fraud), np.nan)

# Fraud uses more obscure tokens
fraud_token_types = ['SHIB', 'PEPE', 'FLOKI', 'DOGE', 'USDT', 'SCAM', 'FAKE', 'PONZI', None]
fraud_data[' ERC20 most sent token type'] = np.random.choice(fraud_token_types, n_fraud, p=[0.15, 0.12, 0.10, 0.12, 0.10, 0.08, 0.08, 0.10, 0.15])
fraud_data[' ERC20_most_rec_token_type'] = np.random.choice(fraud_token_types, n_fraud, p=[0.18, 0.14, 0.12, 0.13, 0.12, 0.09, 0.09, 0.08, 0.05])

# Create DataFrames
df_clean = pd.DataFrame(clean_data)
df_fraud = pd.DataFrame(fraud_data)

# Combine and shuffle
df = pd.concat([df_clean, df_fraud], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Update Unnamed: 0 and Index after shuffling
df['Unnamed: 0'] = np.arange(len(df))
df['Index'] = np.arange(1, len(df) + 1)

# Save to CSV
filename = 'mydata.csv'
df.to_csv(filename, index=False)

print(f"\n‚úì Dataset generated successfully!")
print(f"‚úì Saved to: {filename}")
print(f"‚úì File size: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

# Also create a backup with timestamp for safety
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_filename = f'transaction_dataset_backup_{timestamp}.csv'
df.to_csv(backup_filename, index=False)
print(f"‚úì Backup saved to: {backup_filename}")

# Display statistics
print("\n" + "=" * 80)
print("DATASET STATISTICS")
print("=" * 80)
print(f"Total transactions: {len(df):,}")
print(f"Features: {df.shape[1]}")
print(f"\nClass Distribution:")
print(df['FLAG'].value_counts())
print(f"\nFraud: {df['FLAG'].sum():,} ({df['FLAG'].mean()*100:.2f}%)")
print(f"Clean: {(df['FLAG']==0).sum():,} ({(1-df['FLAG'].mean())*100:.2f}%)")

print(f"\nMissing Values:")
missing_summary = df.isnull().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)
if len(missing_summary) > 0:
    print(f"Columns with missing values: {len(missing_summary)}")
    print(missing_summary.head(10))
else:
    print("No missing values")

print("\n" + "=" * 80)
print("SAMPLE DATA")
print("=" * 80)
print("\nFirst 5 rows:")
print(df.head())

print("\n" + "=" * 80)
print("KEY FEATURE COMPARISONS (Clean vs Fraud)")
print("=" * 80)

comparison_features = [
    'Avg min between sent tnx',
    'Sent tnx',
    'Received Tnx',
    'total Ether sent',
    ' Total ERC20 tnxs'
]

for feature in comparison_features:
    if feature in df.columns:
        clean_mean = df[df['FLAG'] == 0][feature].mean()
        fraud_mean = df[df['FLAG'] == 1][feature].mean()
        print(f"\n{feature}:")
        print(f"   Clean: {clean_mean:.2f}")
        print(f"   Fraud: {fraud_mean:.2f}")
        print(f"   Ratio: {fraud_mean/clean_mean:.2f}x")

ETHEREUM FRAUD DETECTION - DUMMY DATASET GENERATOR
Generating 10,000 transactions...
   Clean: 4,500 (45.0%)
   Fraud: 5,500 (55.0%)

‚úì Dataset generated successfully!
‚úì Saved to: mydata.csv
‚úì File size: 5593.16 KB
‚úì Backup saved to: transaction_dataset_backup_20251109_153947.csv

DATASET STATISTICS
Total transactions: 10,000
Features: 51

Class Distribution:
FLAG
1    5500
0    4500
Name: count, dtype: int64

Fraud: 5,500 (55.00%)
Clean: 4,500 (45.00%)

Missing Values:
Columns with missing values: 22
ERC20 most sent token type         2072
ERC20_most_rec_token_type           811
ERC20 total ether sent              601
ERC20 total Ether sent contract     601
Total ERC20 tnxs                    601
ERC20 total Ether received          601
ERC20 uniq rec addr                 601
ERC20 uniq sent addr                601
ERC20 uniq sent addr.1              601
ERC20 uniq rec contract addr        601
dtype: int64

SAMPLE DATA

First 5 rows:
   Unnamed: 0  Index                        

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score,
    roc_auc_score, roc_curve, confusion_matrix,
    classification_report, precision_recall_curve, auc
)
import warnings
warnings.filterwarnings('ignore')

In [3]:
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("ETHEREUM FRAUD DETECTION - XGBOOST")

df = pd.read_csv("transaction_dataset.csv")
print(f"Dataset Shape: {df.shape}")
print(f"Rows: {df.shape[0]:,} | Columns: {df.shape[1]}")

print("\nEXPLORATORY DATA ANALYSIS")
print("\nDataset Info:")
print(df.info())

print("\nFirst Few Rows:")
print(df.head())

print("\nStatistical Summary:")
print(df.describe())

print("\nMissing Values:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})
print(missing_df[missing_df['Missing Count'] > 0])

fraud_col = 'FLAG'
if fraud_col in df.columns:
    print(f"\nClass Distribution ({fraud_col}):")
    class_dist = df[fraud_col].value_counts()
    print(class_dist)
    fraud_pct = (class_dist.get(1, 0) / len(df)) * 100
    clean_pct = (class_dist.get(0, 0) / len(df)) * 100
    print(f"Fraud Percentage: {fraud_pct:.2f}%")
    print(f"Clean Percentage: {clean_pct:.2f}%")

    imbalance_ratio = class_dist.get(0, 0) / max(class_dist.get(1, 1), 1)
    print(f"Imbalance Ratio: {imbalance_ratio:.2f}:1")
    y = df[fraud_col]
else:
    print("Warning: Could not find fraud label column!")
    y = pd.Series([0] * len(df))

ETHEREUM FRAUD DETECTION - XGBOOST
Dataset Shape: (9841, 51)
Rows: 9,841 | Columns: 51

EXPLORATORY DATA ANALYSIS

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 51 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            9841 non-null   int64  
 1   Index                                                 9841 non-null   int64  
 2   Address                                               9841 non-null   object 
 3   FLAG                                                  9841 non-null   int64  
 4   Avg min between sent tnx                              9841 non-null   float64
 5   Avg min between received tnx                          9841 non-null   float64
 6   Time Diff between first and last (Mins)               9841 non-null   float64
 7   Sent tnx    

In [4]:
print("\nDATA PREPROCESSING")

# 1. Separate features (X) and target (y)
X = df.drop(columns=[fraud_col], errors='ignore')

# 2. FIX: Explicitly drop non-predictive/ID columns (Unnamed: 0, Index, Address)
columns_to_drop = ['Unnamed: 0', 'Index', 'Address']
X = X.drop(columns=columns_to_drop, errors='ignore')

# 3. Continue with the rest of the preprocessing steps
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric Features ({len(numeric_cols)}): {numeric_cols[:5]}{'...' if len(numeric_cols) > 5 else ''}")
print(f"Categorical Features ({len(categorical_cols)}): {categorical_cols}")

if categorical_cols:
    print("Encoding categorical features...")
    le = LabelEncoder()
    # Handle the two remaining categorical features with Label Encoding
    for col in [' ERC20 most sent token type', ' ERC20_most_rec_token_type']:
        if col in X.columns: # Ensure only the relevant categoricals are processed
            X[col] = le.fit_transform(X[col].astype(str))
    print("Categorical encoding complete!")

# Handle missing values
if X.isnull().sum().sum() > 0:
    print("Handling missing values with median imputation...")
    X = X.fillna(X.median())
    print("Missing values filled!")

# Final check
X = X.select_dtypes(include=[np.number])
print(f"Final feature matrix shape: {X.shape}")


DATA PREPROCESSING
Numeric Features (45): ['Avg min between sent tnx', 'Avg min between received tnx', 'Time Diff between first and last (Mins)', 'Sent tnx', 'Received Tnx']...
Categorical Features (2): [' ERC20 most sent token type', ' ERC20_most_rec_token_type']
Encoding categorical features...
Categorical encoding complete!
Handling missing values with median imputation...
Missing values filled!
Final feature matrix shape: (9841, 47)


In [5]:
print("\nFEATURE SELECTION")
print("Computing initial feature correlations and importance...")

X_temp = X.copy()
y_temp = y.copy()

X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
    X_temp, y_temp, test_size=0.3, random_state=42, stratify=y_temp
)

scaler_temp = StandardScaler()
X_train_temp_scaled = scaler_temp.fit_transform(X_train_temp)

imbalance_ratio_val = (y_train_temp == 0).sum() / max((y_train_temp == 1).sum(), 1)

temp_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=imbalance_ratio_val,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)

temp_model.fit(X_train_temp_scaled, y_train_temp)

feature_importance_initial = pd.DataFrame({
    'Feature': X.columns,
    'Importance': temp_model.feature_importances_
}).sort_values('Importance', ascending=False)

importance_threshold = 0.01
selected_features = feature_importance_initial[
    feature_importance_initial['Importance'] >= importance_threshold
]['Feature'].tolist()

print(f"Features selected (importance >= {importance_threshold}): {len(selected_features)} out of {len(X.columns)}")
print("\nTop 15 Selected Features:")
print(feature_importance_initial.head(15).to_string(index=False))

X = X[selected_features]
print(f"\nReduced feature matrix shape: {X.shape}")


FEATURE SELECTION
Computing initial feature correlations and importance...
Features selected (importance >= 0.01): 10 out of 47

Top 15 Selected Features:
                                             Feature  Importance
                                    Total ERC20 tnxs    0.469935
             Time Diff between first and last (Mins)    0.137923
                           ERC20_most_rec_token_type    0.085919
                          ERC20 most sent token type    0.081115
                                  ERC20 avg val sent    0.028989
                                ERC20 uniq sent addr    0.024042
                          ERC20 total Ether received    0.021083
                                   ERC20 max val rec    0.015976
                                    avg val received    0.011557
                      Unique Received From Addresses    0.010864
                                   ERC20 min val rec    0.009879
                                 total ether balance    0.009633

In [6]:
print("\nGENERATING VISUALIZATIONS")

n_features_to_plot = min(6, len(X.columns))
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Feature Distributions', fontsize=16, fontweight='bold')

for idx, col in enumerate(X.columns[:n_features_to_plot]):
    ax = axes[idx // 3, idx % 3]
    ax.hist(X[col], bins=50, edgecolor='black', alpha=0.7, color='steelblue')
    ax.set_title(f'{col}', fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=300, bbox_inches='tight')
print("Saved: feature_distributions.png")
plt.close()

print("Generating correlation heatmap...")
plt.figure(figsize=(14, 12))
correlation_matrix = X.corr()

if len(X.columns) > 20:
    top_features = X.var().nlargest(20).index
    correlation_matrix = X[top_features].corr()

sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm',
            center=0, square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
print("Saved: correlation_heatmap.png")
plt.close()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

class_counts = y.value_counts()
axes[0].bar(['Clean', 'Fraud'], class_counts.values, color=['green', 'red'], alpha=0.7)
axes[0].set_ylabel('Count', fontweight='bold')
axes[0].set_title('Class Distribution', fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)

axes[1].pie(class_counts.values, labels=['Clean', 'Fraud'], autopct='%1.2f%%',
            colors=['green', 'red'], startangle=90)
axes[1].set_title('Class Proportion', fontweight='bold')

plt.tight_layout()
plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
print("Saved: class_distribution.png")
plt.close()


GENERATING VISUALIZATIONS
Saved: feature_distributions.png
Generating correlation heatmap...
Saved: correlation_heatmap.png
Saved: class_distribution.png


In [7]:
print("\nTRAIN/TEST SPLIT")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Clean: {(y_train == 0).sum():,} | Fraud: {(y_train == 1).sum():,}")
print(f"Test set: {X_test.shape[0]:,} samples")
print(f"Clean: {(y_test == 0).sum():,} | Fraud: {(y_test == 1).sum():,}")

print("\nNormalizing features with StandardScaler...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Feature scaling complete!")


TRAIN/TEST SPLIT
Training set: 6,888 samples
Clean: 5,363 | Fraud: 1,525
Test set: 2,953 samples
Clean: 2,299 | Fraud: 654

Normalizing features with StandardScaler...
Feature scaling complete!


In [8]:
print("\nMODEL TRAINING - XGBOOST")

imbalance_ratio = (y_train == 0).sum() / max((y_train == 1).sum(), 1)
print(f"Imbalance Ratio: {imbalance_ratio:.2f}")
print(f"Using scale_pos_weight: {imbalance_ratio:.2f}")

xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=imbalance_ratio,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)

print("\nPerforming 5-fold cross-validation...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_model, X_train_scaled, y_train, cv=cv, scoring='f1')
print(f"Cross-validation F1 scores: {cv_scores}")
print(f"Mean F1 Score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

print("\nTraining final model on full training set...")
xgb_model.fit(X_train_scaled, y_train)
print("Training complete!")


MODEL TRAINING - XGBOOST
Imbalance Ratio: 3.52
Using scale_pos_weight: 3.52

Performing 5-fold cross-validation...
Cross-validation F1 scores: [0.98013245 0.9785832  0.98507463 0.97712418 0.97385621]
Mean F1 Score: 0.9790 (+/- 0.0037)

Training final model on full training set...
Training complete!


In [9]:
print("\nPREDICTIONS AND SCORING")

y_pred = xgb_model.predict(X_test_scaled)
y_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]

fraud_risk_rating = (1 - y_pred_proba) * 9 + 1
fraud_risk_rating = np.round(fraud_risk_rating, 1)

print(f"Prediction Statistics:")
print(f"Predicted Fraud: {y_pred.sum():,} ({y_pred.sum() / len(y_pred) * 100:.2f}%)")
print(f"Predicted Clean: {(y_pred == 0).sum():,} ({(y_pred == 0).sum() / len(y_pred) * 100:.2f}%)")

print(f"\nFraud Probability Statistics:")
print(f"Min: {y_pred_proba.min():.4f}")
print(f"Max: {y_pred_proba.max():.4f}")
print(f"Mean: {y_pred_proba.mean():.4f}")
print(f"Median: {np.median(y_pred_proba):.4f}")


PREDICTIONS AND SCORING
Prediction Statistics:
Predicted Fraud: 652 (22.08%)
Predicted Clean: 2,301 (77.92%)

Fraud Probability Statistics:
Min: 0.0002
Max: 0.9997
Mean: 0.2244
Median: 0.0013


In [10]:
print("\nMODEL EVALUATION")

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Performance Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(f"                 Predicted")
print(f"              Clean  Fraud")
print(f"Actual Clean  {cm[0,0]:5d}  {cm[0,1]:5d}")
print(f"       Fraud  {cm[1,0]:5d}  {cm[1,1]:5d}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred,
                            target_names=['Clean', 'Fraud'],
                            zero_division=0))


MODEL EVALUATION
Performance Metrics:
Accuracy:  0.9925
Precision: 0.9847
Recall:    0.9817
F1-Score:  0.9832
ROC-AUC:   0.9994

Confusion Matrix:
                 Predicted
              Clean  Fraud
Actual Clean   2289     10
       Fraud     12    642

Classification Report:
              precision    recall  f1-score   support

       Clean       0.99      1.00      1.00      2299
       Fraud       0.98      0.98      0.98       654

    accuracy                           0.99      2953
   macro avg       0.99      0.99      0.99      2953
weighted avg       0.99      0.99      0.99      2953



In [11]:
print("\nFEATURE IMPORTANCE ANALYSIS")

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))

plt.figure(figsize=(12, 8))
top_features = feature_importance.head(20)
plt.barh(range(len(top_features)), top_features['Importance'], color='steelblue')
plt.yticks(range(len(top_features)), top_features['Feature'])
plt.xlabel('Importance Score', fontweight='bold', fontsize=12)
plt.ylabel('Features', fontweight='bold', fontsize=12)
plt.title('Top 20 Feature Importances - XGBoost', fontweight='bold', fontsize=14)
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
print("\nSaved: feature_importance.png")
plt.close()

xgb.plot_importance(xgb_model, max_num_features=10, importance_type='weight')
plt.title("Top Features Driving Fraud Predictions", fontweight='bold', fontsize=14)
plt.tight_layout()
plt.savefig('xgboost_feature_importance.png', dpi=300, bbox_inches='tight')
print("Saved: xgboost_feature_importance.png")
plt.close()


FEATURE IMPORTANCE ANALYSIS
Top 10 Most Important Features:
                                Feature  Importance
                       Total ERC20 tnxs    0.318866
              ERC20_most_rec_token_type    0.225169
             ERC20 most sent token type    0.178981
Time Diff between first and last (Mins)    0.118817
                   ERC20 uniq sent addr    0.042051
                      ERC20 max val rec    0.033926
         Unique Received From Addresses    0.028763
                     ERC20 avg val sent    0.022735
                       avg val received    0.020464
             ERC20 total Ether received    0.010228

Saved: feature_importance.png
Saved: xgboost_feature_importance.png


In [12]:
print("\nGENERATING RESULT VISUALIZATIONS")

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
            xticklabels=['Clean', 'Fraud'],
            yticklabels=['Clean', 'Fraud'])
plt.title('Confusion Matrix', fontweight='bold', fontsize=14)
plt.ylabel('Actual Label', fontweight='bold', fontsize=12)
plt.xlabel('Predicted Label', fontweight='bold', fontsize=12)
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
print("Saved: confusion_matrix.png")
plt.close()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].hist(y_pred_proba[y_test == 0], bins=50, alpha=0.7, label='Clean', color='green')
axes[0].hist(y_pred_proba[y_test == 1], bins=50, alpha=0.7, label='Fraud', color='red')
axes[0].set_xlabel('Fraud Probability', fontweight='bold')
axes[0].set_ylabel('Frequency', fontweight='bold')
axes[0].set_title('Fraud Probability Distribution', fontweight='bold')
axes[0].legend()
axes[0].grid(alpha=0.3)

axes[1].hist(fraud_risk_rating[y_test == 0], bins=50, alpha=0.7, label='Clean', color='green')
axes[1].hist(fraud_risk_rating[y_test == 1], bins=50, alpha=0.7, label='Fraud', color='red')
axes[1].set_xlabel('Fraud Risk Rating (1-10)', fontweight='bold')
axes[1].set_ylabel('Frequency', fontweight='bold')
axes[1].set_title('Fraud Risk Rating Distribution', fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig('fraud_probability_distribution.png', dpi=300, bbox_inches='tight')
print("Saved: fraud_probability_distribution.png")
plt.close()

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2,
         label=f'XGBoost (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
         label='Random Classifier (AUC = 0.50)')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontweight='bold', fontsize=12)
plt.ylabel('True Positive Rate', fontweight='bold', fontsize=12)
plt.title('ROC Curve - XGBoost Fraud Detection', fontweight='bold', fontsize=14)
plt.legend(loc="lower right", fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('roc_curve.png', dpi=300, bbox_inches='tight')
print("Saved: roc_curve.png")
plt.close()

precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall_vals, precision_vals)

plt.figure(figsize=(10, 8))
plt.plot(recall_vals, precision_vals, color='darkorange', lw=2,
         label=f'XGBoost (AUC = {pr_auc:.4f})')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall', fontweight='bold', fontsize=12)
plt.ylabel('Precision', fontweight='bold', fontsize=12)
plt.title('Precision-Recall Curve - XGBoost', fontweight='bold', fontsize=14)
plt.legend(loc="lower left", fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('precision_recall_curve.png', dpi=300, bbox_inches='tight')
print("Saved: precision_recall_curve.png")
plt.close()


GENERATING RESULT VISUALIZATIONS
Saved: confusion_matrix.png
Saved: fraud_probability_distribution.png
Saved: roc_curve.png
Saved: precision_recall_curve.png


In [13]:
print("\nFINAL RESULTS OUTPUT")

results_df = pd.DataFrame({
    'Transaction_Index': X_test.index,
    'Fraud_Probability': y_pred_proba,
    'Prediction': ['Fraud' if p == 1 else 'Clean' for p in y_pred],
    'Fraud_Risk_Rating_1_10': fraud_risk_rating,
    'Actual_Label': ['Fraud' if y == 1 else 'Clean' for y in y_test],
    'Correct_Prediction': y_pred == y_test
})

results_df = results_df.sort_values('Fraud_Probability', ascending=False)

print("\nSample Results (Top 10 Highest Risk):")
print(results_df.head(10).to_string(index=False))

print("\nSample Results (Top 10 Lowest Risk):")
print(results_df.tail(10).to_string(index=False))

results_df.to_csv('fraud_detection_results_xgboost.csv', index=False)
print("\nFull results saved to: fraud_detection_results_xgboost.csv")

feature_importance.to_csv('feature_importance_xgboost.csv', index=False)
print("Feature importance saved to: feature_importance_xgboost.csv")

print("\nSUMMARY")
print(f"Total transactions analyzed: {len(results_df):,}")
print(f"Flagged as fraud: {(y_pred == 1).sum():,} ({(y_pred == 1).sum() / len(y_pred) * 100:.2f}%)")
print(f"Flagged as clean: {(y_pred == 0).sum():,} ({(y_pred == 0).sum() / len(y_pred) * 100:.2f}%)")

print(f"\nModel Performance Summary:")
print(f"Accuracy:  {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall:    {recall:.2%}")
print(f"F1-Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")

tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
print(f"Specificity: {specificity:.2%}")

if tp + fn > 0:
    fraud_detection_rate = tp / (tp + fn)
    print(f"Fraud Detection Rate: {fraud_detection_rate:.2%}")

print("\nFRAUD DETECTION ANALYSIS COMPLETE!")
print("Generated Files:")
print("feature_distributions.png")
print("correlation_heatmap.png")
print("class_distribution.png")
print("feature_importance.png")
print("xgboost_feature_importance.png")
print("confusion_matrix.png")
print("fraud_probability_distribution.png")
print("roc_curve.png")
print("precision_recall_curve.png")
print("fraud_detection_results_xgboost.csv")
print("feature_importance_xgboost.csv")


FINAL RESULTS OUTPUT

Sample Results (Top 10 Highest Risk):
 Transaction_Index  Fraud_Probability Prediction  Fraud_Risk_Rating_1_10 Actual_Label  Correct_Prediction
              7682           0.999715      Fraud                     1.0        Fraud                True
              9261           0.999715      Fraud                     1.0        Fraud                True
              7816           0.999712      Fraud                     1.0        Fraud                True
              8797           0.999712      Fraud                     1.0        Fraud                True
              9097           0.999709      Fraud                     1.0        Fraud                True
              9147           0.999709      Fraud                     1.0        Fraud                True
              8590           0.999707      Fraud                     1.0        Fraud                True
              9282           0.999706      Fraud                     1.0        Fraud      

In [15]:
# ============================
# üîç TO TEST USER INPUTS
# ============================

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

print("üß† FRAUD PREDICTION ON NEW DATASET")

try:
    # Load the new dataset
    new_df = pd.read_csv("mydata.csv")

    print(f"\n‚úÖ Dataset loaded successfully!")
    print(f"   Rows: {new_df.shape[0]:,} | Columns: {new_df.shape[1]}")
    print("\nFirst 3 rows of your data:")
    print(new_df.head(3))

    # Check for 'FLAG' column (actual labels)
    has_flag = 'FLAG' in new_df.columns
    if has_flag:
        print("\n‚ö†Ô∏è FLAG column detected ‚Äî removing for prediction.")
        actual_labels = new_df['FLAG'].copy()
        new_df = new_df.drop(columns=['FLAG'])
    else:
        actual_labels = None
        print("\nNo FLAG column found ‚Äî assuming unlabeled test data.")

    # Save identifying info (if any)
    original_indices = new_df.index.copy()
    addresses = new_df['Address'].copy() if 'Address' in new_df.columns else None

    # Preprocessing (same as training)
    print("\nüßπ PREPROCESSING NEW DATA")

    # Drop ID-like columns
    cols_to_drop = ['Unnamed: 0', 'Index', 'Address']
    new_X = new_df.drop(columns=[c for c in cols_to_drop if c in new_df.columns], errors='ignore')
    print("   Removed identifier columns")

    # Encode categorical columns
    cat_cols = new_X.select_dtypes(include=['object', 'category']).columns.tolist()
    if cat_cols:
        print(f"   Encoding {len(cat_cols)} categorical features...")
        le = LabelEncoder()
        for c in cat_cols:
            new_X[c] = le.fit_transform(new_X[c].astype(str))

    # Handle missing values
    if new_X.isnull().sum().sum() > 0:
        print("   Filling missing values with median...")
        new_X = new_X.fillna(new_X.median())

    # Keep only numeric features
    new_X = new_X.select_dtypes(include=[np.number])

    # Match training features
    print(f"\nüìä Aligning with {len(selected_features)} training features...")
    missing_feats = [f for f in selected_features if f not in new_X.columns]
    if missing_feats:
        print(f"   Missing features: {missing_feats}")
        for f in missing_feats:
            new_X[f] = 0
    new_X = new_X[selected_features]

    print(f"   Final feature matrix: {new_X.shape}")

    # Scale features using training scaler
    print("‚öñÔ∏è Scaling features...")
    new_X_scaled = scaler.transform(new_X)

    # Predict
    print("\nüöÄ MAKING PREDICTIONS...")
    new_preds = xgb_model.predict(new_X_scaled)
    new_proba = xgb_model.predict_proba(new_X_scaled)[:, 1]

    # Compute risk rating
    risk_rating = np.round((1 - new_proba) * 9 + 1, 1)

    # Compile results
    results_new = pd.DataFrame({
        'Transaction_Index': original_indices,
        'Fraud_Probability': new_proba,
        'Prediction': ['Fraud' if p == 1 else 'Clean' for p in new_preds],
        'Fraud_Risk_Rating_1_10': risk_rating,
        'Confidence': np.maximum(new_proba, 1 - new_proba) * 100
    })

    if addresses is not None:
        results_new.insert(1, 'Address', addresses.values)

    if actual_labels is not None:
        results_new['Actual_Label'] = ['Fraud' if y == 1 else 'Clean' for y in actual_labels]
        results_new['Correct_Prediction'] = new_preds == actual_labels.values

    results_new = results_new.sort_values('Fraud_Probability', ascending=False)

    # Summary
    print("\n‚úÖ PREDICTIONS COMPLETE")
    print(f"Total Transactions: {len(new_preds):,}")
    print(f"   Predicted Fraud: {(new_preds == 1).sum():,}")
    print(f"   Predicted Clean: {(new_preds == 0).sum():,}")

    print("\nüìà Probability Summary:")
    print(f"   Min: {new_proba.min():.4f} | Max: {new_proba.max():.4f} | Mean: {new_proba.mean():.4f}")

    # If labeled, compute performance
    if actual_labels is not None:
        print("\nüéØ PERFORMANCE ON NEW DATA (with labels)")
        acc = accuracy_score(actual_labels, new_preds)
        prec = precision_score(actual_labels, new_preds, zero_division=0)
        rec = recall_score(actual_labels, new_preds, zero_division=0)
        f1 = f1_score(actual_labels, new_preds, zero_division=0)
        roc = roc_auc_score(actual_labels, new_proba)
        cm = confusion_matrix(actual_labels, new_preds)

        print(f"Accuracy:  {acc:.2%}")
        print(f"Precision: {prec:.2%}")
        print(f"Recall:    {rec:.2%}")
        print(f"F1-Score:  {f1:.4f}")
        print(f"ROC-AUC:   {roc:.4f}")

        print("\nConfusion Matrix:")
        print(cm)

    # Categorize by risk
    high = (new_proba >= 0.7).sum()
    med = ((new_proba >= 0.3) & (new_proba < 0.7)).sum()
    low = (new_proba < 0.3).sum()

    print("\nRISK CATEGORIES:")
    print(f"   HIGH (‚â•70%):  {high:,}")
    print(f"   MEDIUM (30‚Äì70%): {med:,}")
    print(f"   LOW (<30%):   {low:,}")

    # Save output
    results_new.to_csv("new_fraud_predictions.csv", index=False)
    print("\nüíæ Results saved as: new_fraud_predictions.csv")

    # Visualization
    print("\nüìä Generating visualizations...")

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.flatten()

    axes[0].hist(new_proba, bins=50, color='skyblue', edgecolor='black')
    axes[0].axvline(0.5, color='red', linestyle='--')
    axes[0].set_title("Fraud Probability Distribution")

    axes[1].hist(risk_rating, bins=30, color='orange', edgecolor='black')
    axes[1].set_title("Fraud Risk Rating (1=High Risk)")

    pred_counts = pd.Series(new_preds).value_counts()
    axes[2].bar(['Clean', 'Fraud'], [pred_counts.get(0,0), pred_counts.get(1,0)], color=['green','red'])
    axes[2].set_title("Prediction Distribution")

    axes[3].bar(['Low Risk', 'Medium Risk', 'High Risk'], [low, med, high], color=['green','orange','red'])
    axes[3].set_title("Risk Category Distribution")

    plt.tight_layout()
    plt.savefig("new_data_predictions_analysis.png", dpi=300)
    plt.close()

    print("   ‚úÖ Saved: new_data_predictions_analysis.png")

    if actual_labels is not None:
        plt.figure(figsize=(6,5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Clean','Fraud'], yticklabels=['Clean','Fraud'])
        plt.title("Confusion Matrix - New Data")
        plt.savefig("new_data_confusion_matrix.png", dpi=300)
        plt.close()
        print("   ‚úÖ Saved: new_data_confusion_matrix.png")

    print("\nüö® RECOMMENDATION:")
    if high > 0:
        print(f"   ‚ö†Ô∏è {high} HIGH-RISK transactions ‚Äî review immediately!")
    elif med > 0:
        print(f"   üü† {med} medium-risk transactions ‚Äî monitor closely.")
    else:
        print("   ‚úÖ All transactions appear low-risk. Routine monitoring recommended.")

    print("\nFRAUD PREDICTION COMPLETE ‚úÖ")

except Exception as e:
    print(f"\n‚ùå ERROR: {str(e)}")


üß† FRAUD PREDICTION ON NEW DATASET

‚úÖ Dataset loaded successfully!
   Rows: 10,000 | Columns: 51

First 3 rows of your data:
   Unnamed: 0  Index                                     Address  FLAG  \
0           0      1  0xcea1bee6387add3cf4f5ec054c028185bbd5d4fe     1   
1           1      2  0xeb71ffd67842707dd7763011edba9b55b8ef217d     1   
2           2      3  0x72506a6c1439c8bc12c0942bf3e35966edea724b     0   

   Avg min between sent tnx  Avg min between received tnx  \
0                 21.543460                   1778.274159   
1                 29.539527                     28.838359   
2               8992.066377                  11413.995351   

   Time Diff between first and last (Mins)  Sent tnx  Received Tnx  \
0                            106193.687448       200           264   
1                            105590.597403       215           247   
2                            563347.896832        56            83   

   Number of Created Contracts  ...   ERC20 min 