In [None]:
# IMPORTING LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix,
    classification_report
)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set style for better visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# LOADING DATASET
print("=" * 80)
print("ETHEREUM FRAUD DETECTION - ISOLATION FOREST")
print("=" * 80)

# Load the dataset
df = pd.read_csv("/content/transaction_dataset.csv")

print(f"Dataset Shape: {df.shape}")
print(f"   Rows: {df.shape[0]:,} | Columns: {df.shape[1]}")

ETHEREUM FRAUD DETECTION - ISOLATION FOREST
Dataset Shape: (9841, 51)
   Rows: 9,841 | Columns: 51


In [None]:
# EDA
print("\n" + "=" * 80)
print("EXPLORATORY DATA ANALYSIS")
print("=" * 80)

# Display basic info
print("Dataset Info:")
print(df.info())

print("First Few Rows:")
print(df.head())

print("Statistical Summary:")
print(df.describe())

# Check for missing values
print("Missing Values:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})
print(missing_df[missing_df['Missing Count'] > 0])

if missing.sum() == 0:
    print("No missing values found!")

# Check class distribution (assuming 'FLAG' column indicates fraud)
# Common column names: 'FLAG', 'Class', 'isFraud', 'Fraud'
fraud_col = None
for col in ['FLAG', 'Class', 'isFraud', 'Fraud', 'is_fraud']:
    if col in df.columns:
        fraud_col = col
        break

if fraud_col:
    print(f"Class Distribution ({fraud_col}):")
    class_dist = df[fraud_col].value_counts()
    print(class_dist)
    print(f"Fraud Percentage: {(class_dist.get(1, 0) / len(df)) * 100:.2f}%")
    print(f"Clean Percentage: {(class_dist.get(0, 0) / len(df)) * 100:.2f}%")

    # Store the target variable
    y = df[fraud_col]
else:
    print("Warning: Could not find fraud label column!")
    print("Available columns:", df.columns.tolist())
    # Create dummy target for demonstration
    y = pd.Series([0] * len(df))


EXPLORATORY DATA ANALYSIS
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 51 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            9841 non-null   int64  
 1   Index                                                 9841 non-null   int64  
 2   Address                                               9841 non-null   object 
 3   FLAG                                                  9841 non-null   int64  
 4   Avg min between sent tnx                              9841 non-null   float64
 5   Avg min between received tnx                          9841 non-null   float64
 6   Time Diff between first and last (Mins)               9841 non-null   float64
 7   Sent tnx                                              9841 non-null   int64  
 8   Received Tnx     

In [None]:
# DATA PREPROCESSING
print("\n" + "=" * 80)
print("DATA PREPROCESSING")
print("=" * 80)

# Separate features and target
X = df.drop(columns=[fraud_col] if fraud_col else [], errors='ignore')

# Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric Features ({len(numeric_cols)}): {numeric_cols[:5]}{'...' if len(numeric_cols) > 5 else ''}")
print(f"Categorical Features ({len(categorical_cols)}): {categorical_cols}")

# Handle categorical features
if categorical_cols:
    print("Encoding categorical features...")
    le = LabelEncoder()
    for col in categorical_cols:
        X[col] = le.fit_transform(X[col].astype(str))
    print("Categorical encoding complete!")

# Handle missing values (if any)
if X.isnull().sum().sum() > 0:
    print("Handling missing values...")
    X = X.fillna(X.median())
    print("Missing values filled with median!")

# Remove any non-numeric columns that couldn't be processed
X = X.select_dtypes(include=[np.number])

print(f"Final feature matrix shape: {X.shape}")


DATA PREPROCESSING
Numeric Features (47): ['Unnamed: 0', 'Index', 'Avg min between sent tnx', 'Avg min between received tnx', 'Time Diff between first and last (Mins)']...
Categorical Features (3): ['Address', ' ERC20 most sent token type', ' ERC20_most_rec_token_type']
Encoding categorical features...
Categorical encoding complete!
Handling missing values...
Missing values filled with median!
Final feature matrix shape: (9841, 50)


In [None]:
# VISUALISATION - FEATURE DISTRIBUTIONS
print("\n" + "=" * 80)
print("GENERATING VISUALIZATIONS")
print("=" * 80)

# Plot distributions of key features
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Feature Distributions', fontsize=16, fontweight='bold')

for idx, col in enumerate(X.columns[:6]):
    ax = axes[idx // 3, idx % 3]
    ax.hist(X[col], bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(f'{col}', fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=300, bbox_inches='tight')
print("Saved: feature_distributions.png")
plt.close()

# Correlation heatmap
print("Generating correlation heatmap...")
plt.figure(figsize=(14, 12))
correlation_matrix = X.corr()

# For large datasets, show only top correlations
if len(X.columns) > 20:
    # Get top 20 features with highest variance
    top_features = X.var().nlargest(20).index
    correlation_matrix = X[top_features].corr()

sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm',
            center=0, square=True, linewidths=0.5)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
print("Saved: correlation_heatmap.png")
plt.close()


GENERATING VISUALIZATIONS
Saved: feature_distributions.png
Generating correlation heatmap...
Saved: correlation_heatmap.png


In [None]:
# TRAIN/TEST SPLIT
print("\n" + "=" * 80)
print("TRAIN/TEST SPLIT")
print("=" * 80)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y if fraud_col else None
)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")

# Normalize features
print("Normalizing features with StandardScaler...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Feature scaling complete!")


TRAIN/TEST SPLIT
Training set: 6,888 samples
Test set: 2,953 samples
Normalizing features with StandardScaler...
Feature scaling complete!


In [None]:
# MODEL TRAINING - ISOLATION FOREST
print("\n" + "=" * 80)
print("MODEL TRAINING - ISOLATION FOREST")
print("=" * 80)

# Calculate contamination based on actual fraud rate
if fraud_col:
    contamination_rate = y_train.sum() / len(y_train)
    contamination_rate = max(0.01, min(0.5, contamination_rate))  # Bound between 1% and 50%
else:
    contamination_rate = 0.1

print(f"Contamination rate: {contamination_rate:.4f}")

# Train Isolation Forest with parameter tuning
print("Training Isolation Forest with hyperparameter tuning...")

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_samples': [256, 512, 'auto'],
    'contamination': [contamination_rate * 0.8, contamination_rate, contamination_rate * 1.2],
    'max_features': [0.5, 0.75, 1.0]
}

best_f1 = 0
best_params = None
best_model = None

# Manual grid search (since IsolationForest doesn't support GridSearchCV directly)
for n_est in param_grid['n_estimators']:
    for max_samp in param_grid['max_samples']:
        for contam in param_grid['contamination']:
            for max_feat in param_grid['max_features']:
                model = IsolationForest(
                    n_estimators=n_est,
                    max_samples=max_samp,
                    contamination=contam,
                    max_features=max_feat,
                    random_state=42,
                    n_jobs=-1
                )

                model.fit(X_train_scaled)
                y_pred = model.predict(X_test_scaled)
                y_pred = np.where(y_pred == -1, 1, 0)  # Convert to 0/1

                if y_test.sum() > 0:  # Only calculate if there are positive samples
                    f1 = f1_score(y_test, y_pred, zero_division=0)

                    if f1 > best_f1:
                        best_f1 = f1
                        best_params = {
                            'n_estimators': n_est,
                            'max_samples': max_samp,
                            'contamination': contam,
                            'max_features': max_feat
                        }
                        best_model = model

if best_model is None:
    # Fallback to default model
    best_model = IsolationForest(
        n_estimators=200,
        contamination=contamination_rate,
        random_state=42,
        n_jobs=-1
    )
    best_model.fit(X_train_scaled)
    best_params = {'n_estimators': 200, 'contamination': contamination_rate}

print(f"Best parameters found:")
for param, value in best_params.items():
    print(f"   {param}: {value}")


MODEL TRAINING - ISOLATION FOREST
Contamination rate: 0.2214
Training Isolation Forest with hyperparameter tuning...
Best parameters found:
   n_estimators: 100
   max_samples: 512
   contamination: 0.2656794425087108
   max_features: 0.75


In [None]:
# PREDICTIONS AND SCORING
print("\n" + "=" * 80)
print("PREDICTIONS AND SCORING")
print("=" * 80)

# Get anomaly scores and predictions
anomaly_scores = best_model.score_samples(X_test_scaled)
y_pred = best_model.predict(X_test_scaled)
y_pred_binary = np.where(y_pred == -1, 1, 0)  # -1 = anomaly (fraud), 1 = normal

# Create scaled rating (1-10 scale)
# Anomaly scores are typically in range [-0.5, 0.5] but can vary
# Lower score = more anomalous = higher fraud risk = lower rating
score_min = anomaly_scores.min()
score_max = anomaly_scores.max()

# Normalize to [0, 1] then scale to [1, 10]
normalized_scores = (anomaly_scores - score_min) / (score_max - score_min)
scaled_ratings = normalized_scores * 9 + 1  # Scale to 1-10
scaled_ratings = np.round(scaled_ratings, 1)

print(f"Anomaly Score Statistics:")
print(f"   Min: {anomaly_scores.min():.4f}")
print(f"   Max: {anomaly_scores.max():.4f}")
print(f"   Mean: {anomaly_scores.mean():.4f}")
print(f"   Std: {anomaly_scores.std():.4f}")

print(f"Scaled Rating Statistics (1-10):")
print(f"   Min: {scaled_ratings.min():.1f}")
print(f"   Max: {scaled_ratings.max():.1f}")
print(f"   Mean: {scaled_ratings.mean():.1f}")
print(f"   Std: {scaled_ratings.std():.1f}")


PREDICTIONS AND SCORING
Anomaly Score Statistics:
   Min: -0.7489
   Max: -0.3150
   Mean: -0.3476
   Std: 0.0449
Scaled Rating Statistics (1-10):
   Min: 1.0
   Max: 10.0
   Mean: 9.3
   Std: 0.9


In [None]:
# MODEL EVALUATION
print("\n" + "=" * 80)
print("MODEL EVALUATION")
print("=" * 80)

if y_test.sum() > 0:  # Only evaluate if there are positive samples
    precision = precision_score(y_test, y_pred_binary, zero_division=0)
    recall = recall_score(y_test, y_pred_binary, zero_division=0)
    f1 = f1_score(y_test, y_pred_binary, zero_division=0)
    roc_auc = roc_auc_score(y_test, -anomaly_scores)  # Negative because lower = more anomalous

    print(f"Performance Metrics:")
    print(f"   Precision: {precision:.4f}")
    print(f"   Recall:    {recall:.4f}")
    print(f"   F1-Score:  {f1:.4f}")
    print(f"   ROC-AUC:   {roc_auc:.4f}")

    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred_binary)
    print(cm)

    print("Classification Report:")
    print(classification_report(y_test, y_pred_binary,
                                target_names=['Clean', 'Fraud'],
                                zero_division=0))
else:
    print("No fraud samples in test set - skipping performance metrics")
    roc_auc = None


MODEL EVALUATION
Performance Metrics:
   Precision: 0.1407
   Recall:    0.1682
   F1-Score:  0.1532
   ROC-AUC:   0.4743
Confusion Matrix:
[[1627  672]
 [ 544  110]]
Classification Report:
              precision    recall  f1-score   support

       Clean       0.75      0.71      0.73      2299
       Fraud       0.14      0.17      0.15       654

    accuracy                           0.59      2953
   macro avg       0.45      0.44      0.44      2953
weighted avg       0.61      0.59      0.60      2953



In [None]:
# VISUALIZATIONS - RESULTS
print("\n" + "=" * 80)
print("GENERATING RESULT VISUALIZATIONS")
print("=" * 80)

# Anomaly score distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].hist(anomaly_scores[y_test == 0], bins=50, alpha=0.7, label='Clean', color='blue')
axes[0].hist(anomaly_scores[y_test == 1], bins=50, alpha=0.7, label='Fraud', color='red')
axes[0].set_xlabel('Anomaly Score', fontweight='bold')
axes[0].set_ylabel('Frequency', fontweight='bold')
axes[0].set_title('Anomaly Score Distribution', fontweight='bold')
axes[0].legend()
axes[0].grid(alpha=0.3)

# Scaled rating distribution
axes[1].hist(scaled_ratings[y_test == 0], bins=50, alpha=0.7, label='Clean', color='blue')
axes[1].hist(scaled_ratings[y_test == 1], bins=50, alpha=0.7, label='Fraud', color='red')
axes[1].set_xlabel('Scaled Rating (1-10)', fontweight='bold')
axes[1].set_ylabel('Frequency', fontweight='bold')
axes[1].set_title('Scaled Rating Distribution', fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig('anomaly_score_distribution.png', dpi=300, bbox_inches='tight')
print("Saved: anomaly_score_distribution.png")
plt.close()

# ROC Curve
if roc_auc is not None:
    fpr, tpr, thresholds = roc_curve(y_test, -anomaly_scores)

    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontweight='bold', fontsize=12)
    plt.ylabel('True Positive Rate', fontweight='bold', fontsize=12)
    plt.title('ROC Curve - Isolation Forest', fontweight='bold', fontsize=14)
    plt.legend(loc="lower right", fontsize=12)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig('roc_curve.png', dpi=300, bbox_inches='tight')
    print("Saved: roc_curve.png")
    plt.close()


GENERATING RESULT VISUALIZATIONS
Saved: anomaly_score_distribution.png
Saved: roc_curve.png


In [None]:
# OUTPUT RESULTS
print("\n" + "=" * 80)
print("FINAL RESULTS OUTPUT")
print("=" * 80)

# Create results DataFrame
results_df = pd.DataFrame({
    'Transaction_Index': X_test.index,
    'Anomaly_Score': anomaly_scores,
    'Prediction': ['Fraud' if p == 1 else 'Clean' for p in y_pred_binary],
    'Scaled_Rating_1_10': scaled_ratings,
    'Actual_Label': ['Fraud' if y == 1 else 'Clean' for y in y_test]
})

# Sort by fraud risk (lowest rating = highest risk)
results_df = results_df.sort_values('Scaled_Rating_1_10')

print("Sample Results (Top 10 Highest Risk):")
print(results_df.head(10).to_string(index=False))

print("Sample Results (Top 10 Lowest Risk):")
print(results_df.tail(10).to_string(index=False))

# Save results
results_df.to_csv('fraud_detection_results.csv', index=False)
print("Full results saved to: fraud_detection_results.csv")

# Summary statistics
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"Total transactions analyzed: {len(results_df):,}")
print(f"Flagged as fraud: {(y_pred_binary == 1).sum():,} ({(y_pred_binary == 1).sum() / len(y_pred_binary) * 100:.2f}%)")
print(f"Flagged as clean: {(y_pred_binary == 0).sum():,} ({(y_pred_binary == 0).sum() / len(y_pred_binary) * 100:.2f}%)")

if y_test.sum() > 0:
    print(f"Model Performance Summary:")
    print(f"   • Precision: {precision:.2%} (of flagged frauds, how many were correct)")
    print(f"   • Recall: {recall:.2%} (of actual frauds, how many were caught)")
    print(f"   • F1-Score: {f1:.4f} (harmonic mean of precision and recall)")
    print(f"   • ROC-AUC: {roc_auc:.4f} (overall discriminative ability)")

print("\n" + "=" * 80)
print("FRAUD DETECTION ANALYSIS COMPLETE!")
print("=" * 80)
print("Generated Files:")
print("   • feature_distributions.png")
print("   • correlation_heatmap.png")
print("   • anomaly_score_distribution.png")
print("   • roc_curve.png")
print("   • fraud_detection_results.csv")
print("\n" + "=" * 80)


FINAL RESULTS OUTPUT
Sample Results (Top 10 Highest Risk):
 Transaction_Index  Anomaly_Score Prediction  Scaled_Rating_1_10 Actual_Label
               454      -0.748923      Fraud                 1.0        Clean
                54      -0.745756      Fraud                 1.1        Clean
               914      -0.713719      Fraud                 1.7        Clean
               525      -0.706238      Fraud                 1.9        Clean
               145      -0.700745      Fraud                 2.0        Clean
              4702      -0.675440      Fraud                 2.5        Clean
               878      -0.670026      Fraud                 2.6        Clean
              3140      -0.650598      Fraud                 3.0        Clean
               158      -0.646473      Fraud                 3.1        Clean
              2262      -0.649262      Fraud                 3.1        Clean
Sample Results (Top 10 Lowest Risk):
 Transaction_Index  Anomaly_Score Prediction 