# Ethereum Fraud Detection Data Processing

## Introduction
This notebook processes a dataset for detecting fraudulent Ethereum accounts based on transaction patterns, ERC20 token activity, and account behavior. The dataset is class-imbalanced; we'll explore it, preprocess it, rebalance it with SMOTE, and split it into per-client train/test sets for federated learning experiments.

## Objective
The goal is to predict whether an Ethereum account is fraudulent from its historical transaction data. This is a binary classification problem where:
- **FLAG = 1** indicates fraud
- **FLAG = 0** indicates non-fraud

## Federated Learning Setup
We'll split the data into N non-IID datasets with different fraud ratios to simulate real-world client data distribution.

## Import Libraries

In [None]:
# NOTE(review): `!` hands the command to a separate shell process, so activating the
# virtualenv here does not change the interpreter this kernel is running on. Start
# Jupyter from the activated venv (or select the venv's kernel) instead.
!source venv/bin/activate

: 

In [None]:
# Upgrade scikit-learn in the kernel's environment, then print the version in use.
# NOTE(review): the upgrade is unpinned, so the installed version can differ between
# runs; if sklearn was already imported in this kernel, restart it so the upgraded
# package is the one that gets loaded.
%pip install --upgrade scikit-learn
import sklearn
print(sklearn.__version__)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator
from imblearn.over_sampling import SMOTE
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
# NOTE(review): the 'seaborn-v0_8-*' style names are the ones shipped with
# matplotlib >= 3.6 -- confirm the installed matplotlib provides this style.
plt.style.use('seaborn-v0_8-whitegrid')
# Seaborn "notebook" context with fonts enlarged by 20% for every figure below.
sns.set_context("notebook", font_scale=1.2)

## Configuration

Define key parameters for data processing and federated learning setup.

In [None]:
# Configuration
# The paths default to the original author's local layout; set FL_DATASET_PATH /
# FL_OUTPUT_DATA_DIR in the environment to run the notebook on another machine
# without editing this cell.
DATASET_PATH = os.environ.get(
    'FL_DATASET_PATH',
    '/Users/ayushpatne/Developer/FL_Major/transaction_dataset.csv'
)
OUTPUT_DATA_DIR = os.environ.get(
    'FL_OUTPUT_DATA_DIR',
    '/Users/ayushpatne/Developer/FL_Major/project/data/'
)
NUM_CLIENTS = 3

# Client fraud ratios: different proportions to create non-IID distribution
# These are target fraud ratios for the *entirety* of each client's data (train+test)
CLIENT_FRAUD_CONFIG = {
    0: {'fraud_ratio': 0.05, 'name': 'client_1'},  # 5% fraud
    1: {'fraud_ratio': 0.10, 'name': 'client_2'},  # 10% fraud
    2: {'fraud_ratio': 0.20, 'name': 'client_3'}   # 20% fraud
}

TEST_SIZE = 0.2  # Test set size for each client's data
RANDOM_STATE = 42  # For reproducibility

# The split code looks up CLIENT_FRAUD_CONFIG[0 .. NUM_CLIENTS-1]; fail here, with a
# readable message, rather than deep inside the split if the two ever drift apart.
_missing_client_ids = [cid for cid in range(NUM_CLIENTS) if cid not in CLIENT_FRAUD_CONFIG]
if _missing_client_ids:
    raise ValueError(
        f"CLIENT_FRAUD_CONFIG has no entry for client id(s) {_missing_client_ids} "
        f"(NUM_CLIENTS = {NUM_CLIENTS})"
    )

# Create output directory if it doesn't exist (exist_ok avoids the check-then-create race)
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

print("Configuration Summary:")
print(f"- Number of Clients: {NUM_CLIENTS}")
# Percent format specs avoid float artefacts such as 7.000000000000001 from `x * 100`.
print(f"- Test Set Size: {TEST_SIZE:.1%}")
print("- Client Fraud Distribution:")
for client_id, config in CLIENT_FRAUD_CONFIG.items():
    print(f"  * {config['name']}: {config['fraud_ratio']:.1%} fraud")

## Data Loading and Preprocessing

This section loads the Ethereum transaction dataset and performs necessary preprocessing steps.

In [None]:
def _clean_column_name(name):
    """Strip surrounding whitespace from `name` and replace each non-alphanumeric character with '_'."""
    return "".join(ch if ch.isalnum() else "_" for ch in str(name).strip())


def load_and_preprocess_data(file_path):
    """Load the Ethereum fraud CSV and return scaled features and integer labels.

    Steps, in order:
      1. read the CSV and print its shape, dtypes and raw class balance;
      2. drop the 'Index' and 'Address' columns if present;
      3. normalise column names (trimmed, non-alphanumerics replaced by '_');
      4. label-encode the two ERC20 token-type columns, mapping missing values to
         the category 'Unknown';
      5. fill missing numeric values with the column mean;
      6. convert any remaining text column to numbers (thousands separators removed,
         unparseable values set to 0);
      7. drop rows without a FLAG, cast FLAG to int, drop duplicate rows;
      8. standardise every numeric feature with a StandardScaler.

    Side effects: writes 'scaler.joblib', 'feature_columns.joblib' and
    'label_encoders.joblib' into OUTPUT_DATA_DIR.

    NOTE(review): the scaler (and the mean imputation) are fitted on the whole
    dataset, i.e. before any train/test split is made from the returned frame, so
    test rows influence the statistics applied to training rows.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        (X, y): DataFrame of features and Series of 0/1 integer labels.

    Raises:
        KeyError: if the file has no 'FLAG' column.
    """
    print(f"Loading data from {file_path}...")
    df = pd.read_csv(file_path)

    # Initial data exploration
    print(f"\nInitial dataset shape: {df.shape}")
    print(f"\nData types:\n{df.dtypes}")

    # Check class distribution before preprocessing
    if 'FLAG' in df.columns:
        fraud_count = df['FLAG'].sum()
        total_count = len(df)
        fraud_ratio = fraud_count / total_count
        print(f"\nInitial class distribution:")
        print(f"- Total records: {total_count}")
        print(f"- Fraud records: {fraud_count} ({fraud_ratio:.2%})")
        print(f"- Non-fraud records: {total_count - fraud_count} ({1-fraud_ratio:.2%})")

    # Drop unnecessary columns
    df = df.drop(columns=['Index', 'Address'], errors='ignore')

    # Clean column names. The characters are joined with "" -- joining them with "_"
    # would insert an underscore between every character ('FLAG' -> 'F_L_A_G') and
    # no later column lookup would match.
    df.columns = [_clean_column_name(col) for col in df.columns]
    if 'FLAG' not in df.columns:
        raise KeyError("Target column 'FLAG' not found in the dataset")

    # Identify potential categorical columns
    categorical_cols = [
        'ERC20_most_sent_token_type',
        'ERC20_most_rec_token_type'
    ]

    # Label-encode the categorical columns, keeping each fitted encoder so the same
    # mapping can be re-applied to new data.
    label_encoders = {}
    for col in categorical_cols:
        if col not in df.columns:
            print(f"Categorical column {col} not found in dataframe.")
            continue

        is_text_like = (
            pd.api.types.is_object_dtype(df[col])
            or pd.api.types.is_string_dtype(df[col])
            or isinstance(df[col].dtype, pd.CategoricalDtype)
        )
        if not is_text_like:
            print(f"Column {col} is not of object/category type, it's {df[col].dtype}. Skipping label encoding.")
            continue

        # Fill missing values BEFORE casting to str; after the cast NaN is the
        # literal string 'nan' and fillna() has nothing left to fill.
        df[col] = df[col].astype(object).fillna('Unknown').astype(str)
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
        print(f"Encoded {col} with {len(le.classes_)} unique values")

    # Fill NaN values for numeric columns with mean
    numeric_cols = [c for c in df.select_dtypes(include=np.number).columns if c != 'FLAG']

    # Count NaNs before filling
    nan_counts = df[numeric_cols].isna().sum()
    print(f"\nNaN counts before filling:")
    print(nan_counts[nan_counts > 0])

    # Assign the result back: `df[col].fillna(..., inplace=True)` acts on a temporary
    # and leaves df untouched when pandas Copy-on-Write is active.
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].mean())

    # Ensure all remaining text columns that should be numeric are converted
    for col in df.columns:
        if col == 'FLAG':
            continue
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
            # astype(str) keeps non-string objects (e.g. ints stored in an object
            # column) parseable; `.str` alone would turn them into NaN and then 0.
            as_text = df[col].astype(str).str.replace(',', '', regex=False)
            df[col] = pd.to_numeric(as_text, errors='coerce').fillna(0)

    # Drop rows where FLAG is NaN if any (should not happen if FLAG is 0/1)
    df = df.dropna(subset=['FLAG']).astype({'FLAG': int})

    # Remove duplicate rows
    rows_before = len(df)
    df = df.drop_duplicates()
    print(f"\nRemoved {rows_before - len(df)} duplicate rows")

    # Separate features and target
    X = df.drop(columns=['FLAG'])
    y = df['FLAG']

    # Identify numeric features for scaling (this includes the label-encoded columns)
    numeric_features_to_scale = X.select_dtypes(include=np.number).columns.tolist()

    # Scale numerical features
    scaler = StandardScaler()
    X[numeric_features_to_scale] = scaler.fit_transform(X[numeric_features_to_scale])

    # Save the scaler, the feature columns and the label encoders
    os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)
    joblib.dump(scaler, os.path.join(OUTPUT_DATA_DIR, 'scaler.joblib'))
    joblib.dump(X.columns.tolist(), os.path.join(OUTPUT_DATA_DIR, 'feature_columns.joblib'))
    joblib.dump(label_encoders, os.path.join(OUTPUT_DATA_DIR, 'label_encoders.joblib'))

    print(f"\nPreprocessed data summary:")
    print(f"- Final dataset shape: {X.shape[0]} samples, {X.shape[1]} features")
    print(f"- Features: {X.shape[1]}")
    return X, y

In [None]:
# Run the preprocessing pipeline on the raw CSV. X receives the scaled feature
# columns and y the integer FLAG labels; as a side effect the fitted scaler and the
# list of feature columns are written to OUTPUT_DATA_DIR.
X, y = load_and_preprocess_data(DATASET_PATH)

## Data Exploration

Let's explore the preprocessed data to better understand its characteristics.

In [None]:
# Summary statistics of the preprocessed features, transposed so that each feature
# is a row and each statistic a column.
print("Feature statistics:")
X.describe().transpose()

In [None]:
# Class distribution visualization
fig, ax = plt.subplots(figsize=(10, 6))

# Fix the bar order explicitly so bars and text labels share one ordering.
class_order = sorted(y.unique())
sns.countplot(x=y, order=class_order, ax=ax)
ax.set_title('Class Distribution Before Balancing')
ax.set_xlabel('Fraud Flag (0=Non-Fraud, 1=Fraud)')
ax.set_ylabel('Count')

# Add count labels. value_counts() is sorted by frequency, not by class value, so
# it is re-ordered to the bar order; otherwise the labels land on the wrong bars
# whenever class 1 is the more frequent class.
class_counts = y.value_counts().reindex(class_order)
for i, count in enumerate(class_counts):
    ax.text(i, count + 50, f"{count} ({count/len(y):.1%})", ha='center')

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap limited to the columns most strongly correlated with FLAG.
# Features and target are joined into one frame, columns are ranked by the absolute
# value of their correlation with FLAG, and the first 16 are kept (FLAG itself plus
# the top 15 features).
full_df = pd.concat([X, y], axis=1)
corr_with_fraud = full_df.corr()['FLAG'].abs().sort_values(ascending=False)
top_corr_features = corr_with_fraud.index[:16]

# Pairwise correlations among the selected columns only
corr_matrix = full_df[top_corr_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=plt.gca())
plt.title('Correlation Heatmap (Top 15 Features)')
plt.tight_layout()
plt.show()

In [None]:
# Confirm that preprocessing left no gaps: draw the null mask of the combined
# feature/target frame as a heatmap, then print the total number of missing cells.
missing_mask = full_df.isnull()

plt.figure(figsize=(12, 8))
plt.title('Missing Values Heatmap')
sns.heatmap(missing_mask, cbar=False, cmap='viridis', yticklabels=False)
plt.tight_layout()
plt.show()

total_missing = missing_mask.sum().sum()
print(f"Total missing values: {total_missing}")

In [None]:
# Distribution of a few important numeric features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Distribution of Key Features by Fraud Class')

# The four features most correlated with FLAG. FLAG is removed by name rather than
# by assuming it is the first entry of the ranking.
top_features = corr_with_fraud.drop('FLAG').index[:4]

# Give the hue levels readable names up front so seaborn builds the legend itself.
# Calling ax.legend(['Non-Fraud', 'Fraud']) afterwards attaches the labels to
# whichever artists matplotlib finds first, which need not match the hue levels.
class_names = {0: 'Non-Fraud', 1: 'Fraud'}
plot_df = full_df.assign(Class=full_df['FLAG'].map(class_names))

for ax, feature in zip(axes.flat, top_features):
    sns.histplot(
        data=plot_df,
        x=feature,
        hue='Class',
        hue_order=list(class_names.values()),
        bins=30,
        kde=True,
        ax=ax,
    )
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()

## Data Balancing with SMOTE

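The dataset is imbalanced with respect to the fraud class. We'll use SMOTE (Synthetic Minority Over-sampling Technique) to balance it. Note that SMOTE is applied here to the full dataset, before any train/test split, so synthetic samples interpolated from a given row can end up on the other side of the split from that row; keep this in mind when interpreting test metrics.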

In [None]:
def balance_data(X, y):
    """Oversample with SMOTE and print the class balance before and after.

    Args:
        X: feature matrix passed to SMOTE.
        y: 0/1 labels aligned with X.

    Returns:
        (X_balanced, y_balanced) as returned by SMOTE.fit_resample, seeded with
        RANDOM_STATE.
    """
    def _report(labels):
        # One line per class: absolute count and share of all labels.
        for cls, cls_name in ((0, 'Non-Fraud'), (1, 'Fraud')):
            print(f"- Class {cls} ({cls_name}): {sum(labels == cls)} samples ({sum(labels == cls)/len(labels):.2%})")

    print(f"Original class distribution:")
    _report(y)

    X_balanced, y_balanced = SMOTE(random_state=RANDOM_STATE).fit_resample(X, y)

    print(f"\nBalanced class distribution:")
    _report(y_balanced)
    print(f"- Total samples after balancing: {len(y_balanced)}")
    return X_balanced, y_balanced

In [None]:
# Apply SMOTE to balance the dataset
# NOTE(review): this resamples the full preprocessed dataset; the per-client
# train/test split is made afterwards from X_balanced / y_balanced, so synthetic
# rows can end up in the test sets.
X_balanced, y_balanced = balance_data(X, y)

In [None]:
# Plot class distribution before and after balancing
def _plot_class_counts(labels, ax, title):
    """Draw one bar per class on `ax` and write the count and share above each bar."""
    labels = pd.Series(labels)
    # One explicit ordering shared by bars and text labels. value_counts() alone is
    # sorted by frequency, which puts the labels on the wrong bars whenever class 1
    # is the more frequent class.
    class_order = sorted(labels.unique())
    sns.countplot(x=labels, order=class_order, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Fraud Flag (0=Non-Fraud, 1=Fraud)')
    ax.set_ylabel('Count')
    class_counts = labels.value_counts().reindex(class_order)
    for position, count in enumerate(class_counts):
        ax.text(position, count + 50, f"{count} ({count/len(labels):.1%})", ha='center')


fig, ax = plt.subplots(1, 2, figsize=(15, 6))
_plot_class_counts(y, ax[0], 'Class Distribution Before Balancing')
_plot_class_counts(y_balanced, ax[1], 'Class Distribution After Balancing (SMOTE)')

plt.tight_layout()
plt.show()

## Creating Non-IID Data Splits for Federated Learning

This function creates heterogeneous data splits for federated learning clients with varying fraud ratios.

In [None]:
def create_non_iid_splits(X, y, num_clients, client_fraud_config, test_size, random_state):
    """Split (X, y) into per-client train/test sets with configured fraud ratios.

    Fraud and non-fraud rows are shuffled separately and handed out in disjoint
    slices, so no row is given to two clients. All clients receive the same number
    of rows: the largest size (at most len(X) // num_clients) for which both class
    pools can satisfy every client's target ratio. Rows that cannot be placed
    without distorting a target ratio stay unassigned and are reported.

    Each client's rows are then split into train and test parts, stratified on
    FLAG when scikit-learn allows it. A sub-directory named after the client is
    created under OUTPUT_DATA_DIR.

    Args:
        X: feature DataFrame.
        y: 0/1 labels aligned with X.
        num_clients: number of clients; ids 0 .. num_clients-1 are looked up in
            client_fraud_config.
        client_fraud_config: mapping client id -> {'fraud_ratio': float, 'name': str}.
        test_size: fraction of each client's rows used for its test set.
        random_state: seed for shuffling and for the train/test split.

    Returns:
        List of dicts {'name', 'train', 'test'}, one per client, where 'train' and
        'test' are DataFrames that include the FLAG column.

    Raises:
        ValueError: if num_clients < 1 or a fraud_ratio lies outside [0, 1].
        KeyError: if a client id is missing from client_fraud_config.
    """
    if num_clients < 1:
        raise ValueError(f"num_clients must be at least 1, got {num_clients}")

    target_ratios = []
    for client_id in range(num_clients):
        ratio = client_fraud_config[client_id]['fraud_ratio']
        if not 0 <= ratio <= 1:
            raise ValueError(
                f"fraud_ratio for client {client_id} must be within [0, 1], got {ratio}"
            )
        target_ratios.append(ratio)

    data = X.copy()
    data['FLAG'] = y

    # Shuffle each class once. Rows are taken from these pools directly: positions
    # in a shuffled pool say nothing about positions in `data`, so they must not be
    # used to index `data`.
    fraud_pool = data[data['FLAG'] == 1].sample(frac=1, random_state=random_state).reset_index(drop=True)
    non_fraud_pool = data[data['FLAG'] == 0].sample(frac=1, random_state=random_state).reset_index(drop=True)

    print(f"Total data distribution:")
    print(f"- Fraud samples: {len(fraud_pool)}")
    print(f"- Non-fraud samples: {len(non_fraud_pool)}")
    print(f"- Total samples: {len(data)}")

    # Largest common client size for which neither class pool runs dry. Without this
    # cap a later client would get one class truncated and miss its target ratio.
    samples_per_client = len(data) // num_clients
    fraud_demand = sum(target_ratios)                     # fraud rows needed per unit of client size
    non_fraud_demand = sum(1 - r for r in target_ratios)  # same for non-fraud rows
    if fraud_demand > 0:
        samples_per_client = min(samples_per_client, int(len(fraud_pool) / fraud_demand))
    if non_fraud_demand > 0:
        samples_per_client = min(samples_per_client, int(len(non_fraud_pool) / non_fraud_demand))

    fraud_used = 0
    non_fraud_used = 0
    client_datasets = []
    for client_id, target_fraud_ratio in enumerate(target_ratios):
        client_name = client_fraud_config[client_id]['name']

        num_fraud = int(round(samples_per_client * target_fraud_ratio))
        num_non_fraud = samples_per_client - num_fraud

        # iloc slices stop at the end of the pool, so rounding can never over-draw.
        fraud_part = fraud_pool.iloc[fraud_used:fraud_used + num_fraud]
        non_fraud_part = non_fraud_pool.iloc[non_fraud_used:non_fraud_used + num_non_fraud]
        fraud_used += len(fraud_part)
        non_fraud_used += len(non_fraud_part)

        parts = [part for part in (fraud_part, non_fraud_part) if len(part) > 0]
        client_df = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=data.columns)

        if len(client_df) == 0:
            print(f"Warning: Client {client_id} has no data.")
            client_train_df = pd.DataFrame(columns=data.columns)
            client_test_df = pd.DataFrame(columns=data.columns)
        elif len(client_df) < 2:
            print(f"Warning: Client {client_id} has only {len(client_df)} samples. Using all for training.")
            client_train_df = client_df
            client_test_df = pd.DataFrame(columns=data.columns)  # Empty test set
        else:
            client_X = client_df.drop(columns=['FLAG'])
            client_y = client_df['FLAG']
            # Stratify if possible, otherwise random split
            try:
                X_train, X_test, y_train, y_test = train_test_split(
                    client_X, client_y, test_size=test_size, random_state=random_state, stratify=client_y
                )
            except ValueError:  # Happens if a class has too few samples for stratification
                X_train, X_test, y_train, y_test = train_test_split(
                    client_X, client_y, test_size=test_size, random_state=random_state
                )

            client_train_df = X_train.copy()
            client_train_df['FLAG'] = y_train
            client_test_df = X_test.copy()
            client_test_df['FLAG'] = y_test

        client_datasets.append({
            'name': client_name,
            'train': client_train_df,
            'test': client_test_df
        })

        # Create output directory for client data visualizations
        os.makedirs(os.path.join(OUTPUT_DATA_DIR, client_name), exist_ok=True)

        # Calculate and print client data statistics
        actual_fraud_ratio_train = client_train_df['FLAG'].mean() if len(client_train_df) > 0 else 0
        actual_fraud_ratio_test = client_test_df['FLAG'].mean() if len(client_test_df) > 0 else 0

        print(f"\nClient {client_id} ({client_name}):")
        print(f"- Train size: {len(client_train_df)} samples")
        print(f"- Test size: {len(client_test_df)} samples")
        print(f"- Train Fraud Ratio: {actual_fraud_ratio_train:.3f} ({actual_fraud_ratio_train:.1%})")
        print(f"- Test Fraud Ratio: {actual_fraud_ratio_test:.3f} ({actual_fraud_ratio_test:.1%})")

    # Leftover rows are reported, not appended to the last client: appending them
    # would pull that client's fraud ratio away from its configured target.
    unused_fraud = len(fraud_pool) - fraud_used
    unused_non_fraud = len(non_fraud_pool) - non_fraud_used
    if unused_fraud or unused_non_fraud:
        print(f"\nUnassigned samples (left out to keep the target ratios): "
              f"{unused_fraud} fraud, {unused_non_fraud} non-fraud")

    return client_datasets

In [None]:
# Create non-IID client splits
# The SMOTE-balanced data is the input; client_datasets is a list with one dict per
# client holding its 'name', 'train' frame and 'test' frame.
print("Creating non-IID client splits...")
client_datasets = create_non_iid_splits(X_balanced, y_balanced, NUM_CLIENTS, CLIENT_FRAUD_CONFIG, TEST_SIZE, RANDOM_STATE)

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Collect, per client, its name, the fraud share of its train and test split (0 for
# an empty split) and the size of each split; then chart the two sets of ratios.
client_names, train_fraud_ratios, test_fraud_ratios = [], [], []
train_sizes, test_sizes = [], []

for client in client_datasets:
    train_part, test_part = client['train'], client['test']
    client_names.append(client['name'])
    train_fraud_ratios.append(train_part['FLAG'].mean() if len(train_part) > 0 else 0)
    test_fraud_ratios.append(test_part['FLAG'].mean() if len(test_part) > 0 else 0)
    train_sizes.append(len(train_part))
    test_sizes.append(len(test_part))

# One panel per split: training ratios on the left, test ratios on the right
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (ax[0], train_fraud_ratios, 'blue', 'Training Fraud Ratios'),
    (ax[1], test_fraud_ratios, 'green', 'Testing Fraud Ratios'),
)
for panel, ratios, bar_color, panel_title in panels:
    panel.bar(client_names, ratios, color=bar_color)
    panel.set_title(panel_title)
    panel.set_xlabel('Client')
    panel.set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.

In [None]:
# Visualize fraud ratio distribution across clients
# NOTE(review): this cell repeats the previous "Visualize Client Data Distributions"
# cell -- it rebuilds the same five lists from client_datasets and redraws the same
# two bar charts -- so it adds no new output and is a candidate for deletion.
client_names = []
train_fraud_ratios = []
test_fraud_ratios = []
train_sizes = []  # filled below but not plotted in this cell
test_sizes = []  # filled below but not plotted in this cell

for client in client_datasets:
    client_names.append(client['name'])
    train_fraud_ratios.append(client['train']['FLAG'].mean() if len(client['train']) > 0 else 0)
    test_fraud_ratios.append(client['test']['FLAG'].mean() if len(client['test']) > 0 else 0)
    train_sizes.append(len(client['train']))
    test_sizes.append(len(client['test']))

# Plotting
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ax[0].bar(client_names, train_fraud_ratios, color='blue')
ax[0].set_title('Training Fraud Ratios')
ax[0].set_xlabel('Client')
ax[0].set_ylabel('Fraud Ratio')

ax[1].bar(client_names, test_fraud_ratios, color='green')
ax[1].set_title('Testing Fraud Ratios')
ax[1].set_xlabel('Client')
ax[1].set_ylabel('Fraud Ratio')

plt.tight_layout()
plt.show()

## Visualize Client Data Distributions

Let's visualize how the data is distributed across clients.