# Fraud Detection Synthetic Data Generation (Generator–Discriminator)

This notebook implements a pipeline that generates synthetic fraud transaction data using a Conditional Tabular GAN (CTGAN).

## 1. Setup Environment & Load Data

In [None]:
# Install necessary libraries
!pip install ctgan pandas numpy scikit-learn matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ctgan import CTGAN
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# Read the raw transaction table from the CSV file into `df`.
file_path = 'crypto_scam_transaction_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Report the (rows, columns) shape, then preview the first five rows.
print("Dataset Shape:", df.shape)
df.head(n=5)

## 2. Data Preprocessing

In [None]:
# Show per-column null counts so the effect of the imputation is visible.
print("Missing values before imputation:\n", df.isnull().sum())

# Columns to impute, grouped by the strategy applied to them.
numerical_cols_with_na = ['gas_fee_usd', 'avg_txn_interval_sender_min']
categorical_cols_with_na = ['platform']

# Numerical gaps are filled with the column's own median.
for numeric_name in numerical_cols_with_na:
    if numeric_name not in df.columns:
        continue  # column absent from this dataset; nothing to fill
    df[numeric_name] = df[numeric_name].fillna(df[numeric_name].median())

# Categorical gaps are filled with the placeholder label 'Unknown'.
for label_name in categorical_cols_with_na:
    if label_name not in df.columns:
        continue  # column absent from this dataset; nothing to fill
    df[label_name] = df[label_name].fillna('Unknown')

print("Missing values after imputation:\n", df.isnull().sum())

In [None]:
# Name of the label column.
target = 'is_scam'

# transaction_id is not a feature for generation; remove it when present.
df = df.drop(columns=['transaction_id'], errors='ignore')

# Replace the raw timestamp with derived numeric features.
# NOTE(review): the values are interpreted as Unix seconds (unit='s') -- confirm
# against the dataset.
if 'timestamp' in df.columns:
    event_time = pd.to_datetime(df['timestamp'], unit='s')
    df['hour'] = event_time.dt.hour
    df['day_of_week'] = event_time.dt.dayofweek

    # Order rows chronologically so each gap is measured against the previous
    # transaction; the first row has no predecessor and gets a gap of 0.
    df = df.sort_values(by='timestamp')
    df['time_gap'] = df['timestamp'].diff().fillna(0)

    # The raw timestamp (and any 'datetime' helper column) is no longer needed.
    df = df.drop(columns=['timestamp', 'datetime'], errors='ignore')

print("Columns after timestamp conversion:", df.columns)
df.head()

In [None]:
# Build the categorical and numerical feature lists, keeping only the candidate
# names that are actually present in the dataframe.
# 'is_cross_chain' is binary and is intentionally left out of both lists here.
existing_cols = df.columns.tolist()

categorical_features = [
    name
    for name in ('blockchain', 'transaction_type', 'token_type', 'platform')
    if name in existing_cols
]

numerical_features = [
    name
    for name in (
        'sender_wallet_age_days', 'receiver_wallet_age_days',
        'transaction_amount_usd', 'gas_fee_usd',
        'num_prev_transactions_sender', 'num_prev_transactions_receiver',
        'avg_txn_interval_sender_min', 'failed_txn_ratio_sender',
        'velocity_score', 'anomaly_score', 'time_gap',
    )
    if name in existing_cols
]

print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)

In [None]:
# Categorical handling:
# The categorical columns are NOT label-encoded. They are passed to CTGAN as
# discrete columns later on, so here they are only cast to strings so that
# every category is a plain string value.
df = df.astype({name: str for name in categorical_features})

# Numerical handling:
# Min-max scale the numerical features. The fitted scaler is kept in `scaler`
# so the same columns can be inverse-transformed back to their original units
# after synthetic data has been generated.
scaler = MinMaxScaler()
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

print("Data scaled. Example:")
df.head()

## 3. Train Generator–Discriminator (CTGAN)

In [None]:
# Discrete columns = the categorical features plus the binary / small-integer
# columns that should not be modelled as continuous. Names that are missing
# from the dataframe are filtered out.
discrete_columns = [
    name
    for name in [*categorical_features, 'is_cross_chain', 'is_scam', 'hour', 'day_of_week']
    if name in df.columns
]

# epochs=10 keeps the demonstration quick; raise it for better sample quality.
ctgan = CTGAN(epochs=10, verbose=True)

print("Training CTGAN on real data...")
ctgan.fit(df, discrete_columns=discrete_columns)
print("Training Complete.")

## 4. Generate Synthetic Data

In [None]:
# Generate synthetic scam transactions from the model trained on the FULL data.
num_samples = 1000  # number of synthetic scam transactions to keep

# Condition sampling on is_scam == 1 instead of drawing from the unconditional
# distribution: with a rare positive class, unconditional sampling yields too
# few scam rows. Per the CTGAN API, conditioning raises the probability of the
# requested category but does not guarantee it, so we still oversample (5x) and
# filter explicitly below.
# NOTE(review): assumes 'is_scam' was passed as a discrete column at fit time
# and is stored as integer 0/1 -- confirm against the dataset.
synthetic_data = ctgan.sample(
    num_samples * 5,
    condition_column='is_scam',
    condition_value=1,
)

# Keep only the rows the generator actually labelled as scam.
synthetic_scam_data = synthetic_data[synthetic_data['is_scam'] == 1]

# Either report the shortfall or down-sample to exactly `num_samples` rows.
if len(synthetic_scam_data) < num_samples:
    print(f"Warning: Only generated {len(synthetic_scam_data)} scam samples. You may need to train longer or generate more samples.")
else:
    synthetic_scam_data = synthetic_scam_data.sample(num_samples)

print(f"Generated {len(synthetic_scam_data)} synthetic scam transactions.")
synthetic_scam_data.head()

## 5. Combine & Validate

In [None]:
# Bring the real data back to its original units for validation.
real_data_rescaled = df.copy()
real_data_rescaled[numerical_features] = scaler.inverse_transform(real_data_rescaled[numerical_features])

# Bring the synthetic scam rows back to original units as well.
synthetic_scam_rescaled = synthetic_scam_data.copy()
if synthetic_scam_rescaled.empty:
    # inverse_transform rejects an input with zero rows, so skip it instead of
    # crashing when generation produced no scam samples.
    print("Warning: no synthetic scam samples available; final dataset will contain only real non-scam rows.")
else:
    # Nothing constrains generated values to the scaler's feature range; clip
    # first so the inverse transform cannot produce values beyond the
    # minimum/maximum observed in the real data.
    range_low, range_high = scaler.feature_range
    synthetic_scaled = synthetic_scam_rescaled[numerical_features].clip(lower=range_low, upper=range_high)
    synthetic_scam_rescaled[numerical_features] = scaler.inverse_transform(synthetic_scaled)

# Hybrid dataset: real non-scam rows + synthetic scam rows
# (real scam rows are intentionally left out).
real_non_scam = real_data_rescaled[real_data_rescaled['is_scam'] == 0]
final_dataset = pd.concat([real_non_scam, synthetic_scam_rescaled], axis=0)

# Shuffle so the two sources are interleaved, and rebuild a clean 0..n-1 index.
final_dataset = final_dataset.sample(frac=1).reset_index(drop=True)

print("Final Dataset Shape:", final_dataset.shape)
print("Class Distribution:\n", final_dataset['is_scam'].value_counts())

In [None]:
# Optional visual check: compare the per-class transaction amount distribution
# of the real data against that of the hybrid (real normal + synthetic fraud)
# dataset. Each class is normalised independently (common_norm=False).
hist_options = dict(
    x='transaction_amount_usd',
    hue='is_scam',
    element='step',
    stat='density',
    common_norm=False,
)

for frame, title in (
    (real_data_rescaled, 'Real Data: Transaction Amount Distribution'),
    (final_dataset, 'Hybrid Data (Real Normal + Synthetic Fraud): Transaction Amount Distribution'),
):
    plt.figure(figsize=(10, 6))
    sns.histplot(data=frame, **hist_options)
    plt.title(title)
    plt.show()

In [None]:
# Persist the hybrid dataset to CSV; the dataframe index is not written.
final_dataset.to_csv(path_or_buf='synthetic_fraud_dataset.csv', index=False)
print("Saved final dataset to 'synthetic_fraud_dataset.csv'")