In [None]:
# Cell 1: Imports

import pandas as pd
import numpy as np
import kuzu
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim
import os
import shutil


In [None]:
# Cell 2: Autoencoder

# Autoencoder model with increased dropout and deeper architecture
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a 16-dimensional bottleneck.

    Encoder: ``input_dim -> 128 -> 32 -> 16`` with ReLU activations and a
    dropout layer (p=0.4) after the first hidden layer.
    Decoder: ``16 -> 32 -> 128 -> input_dim`` with ReLU activations and no
    dropout. The final layer is linear, so reconstructions are unbounded.

    Args:
        input_dim: Size of the trailing (feature) dimension of the inputs.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are built encoder-first so parameter initialisation consumes
        # the torch RNG in a fixed order.
        encoder_layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
        ]
        decoder_layers = [
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.decoder(self.encoder(x))


In [None]:
import pandas as pd
import numpy as np
import networkx as nx

def generate_synthetic_data(num_entities=5000, num_transactions=25000):
    """Generate synthetic entities and transactions and write them to CSV.

    Entities get a profile type, agent id, KYC risk score and dormancy period.
    Roughly 1% are marked suspicious and roughly 2% potential launderers; both
    groups have their attributes and outgoing transactions overwritten with
    riskier patterns. Transactions follow the edges of a Barabasi-Albert graph
    (m=5), cycling through the edge list until ``num_transactions`` rows exist.
    For a 10% sample of ML-flagged transactions a two-hop "out and back" pair
    through a random intermediate entity is appended, so the final table can
    hold more than ``num_transactions`` rows.

    The NumPy global RNG is seeded with 42. The order of the random draws below
    is significant: reordering them changes the generated data.

    Side effects:
        Writes ``entities.csv`` and ``transactions.csv`` to the working
        directory and prints summary statistics.

    Args:
        num_entities: Number of entities (graph nodes) to create.
        num_transactions: Number of base transactions to create.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(entities, transactions)`` exactly
        as written to disk.
    """
    np.random.seed(42)
    entities = pd.DataFrame({
        'entity_id': [f'E{i:04d}' for i in range(num_entities)],
        'profile_type': np.random.choice(['individual_low', 'individual_high', 'business_small', 'business_large'], num_entities, p=[0.4, 0.1, 0.3, 0.2]),
        'agent_id': np.random.randint(1, 100, num_entities),
        'kyc_risk_score': np.random.uniform(0, 0.3, num_entities),
        'dormancy_period': np.random.randint(0, 365, num_entities)
    })
    suspicious = np.random.choice([0, 1], num_entities, p=[0.99, 0.01])
    potential_launderers = np.random.choice([0, 1], num_entities, p=[0.98, 0.02])
    suspicious_indices = np.where(suspicious)[0]
    # Explicit comparisons instead of `&`/`~` on 0/1 integers, which only
    # happen to give the right answer through two's-complement arithmetic.
    potential_indices = np.where((potential_launderers == 1) & (suspicious == 0))[0]

    # Suspicious individuals get a higher KYC score band than suspicious businesses.
    entities.loc[suspicious_indices, 'kyc_risk_score'] = np.where(
        entities.loc[suspicious_indices, 'profile_type'].isin(['individual_low', 'individual_high']),
        np.random.uniform(0.6, 0.9, len(suspicious_indices)),
        np.random.uniform(0.4, 0.7, len(suspicious_indices))
    )
    entities.loc[suspicious_indices, 'dormancy_period'] = np.random.randint(200, 365, len(suspicious_indices))
    entities.loc[potential_indices, 'kyc_risk_score'] = np.random.uniform(0.3, 0.6, len(potential_indices))

    G = nx.barabasi_albert_graph(num_entities, 5, seed=42)
    edges = [(f'E{u:04d}', f'E{v:04d}') for u, v in G.edges()]
    # Cycle through the edge list so every transaction follows a graph edge.
    tx_edges = [edges[i % len(edges)] for i in range(num_transactions)]
    transactions = pd.DataFrame({
        'sender_id': [sender for sender, _ in tx_edges],
        'receiver_id': [receiver for _, receiver in tx_edges],
        'amount': np.random.exponential(50000, num_transactions).clip(100, 1500000),
        'is_cross_border': np.random.choice([True, False], num_transactions, p=[0.08, 0.92]),
        'timestamp': np.random.randint(0, 30, num_transactions),
        'ml_flag': np.zeros(num_transactions, dtype=int),
        'flagged_receiver': np.random.choice([True, False], num_transactions, p=[0.02, 0.98]),
        'high_risk_jurisdiction': np.random.choice([True, False], num_transactions, p=[0.03, 0.97])
    })

    # Only flagged entities are rewritten, so skip the full-table scan for the
    # rest; unflagged entities never consumed random numbers in this loop.
    flagged_positions = np.flatnonzero((suspicious == 1) | (potential_launderers == 1))
    for i in flagged_positions:
        mask = transactions['sender_id'] == entities.at[i, 'entity_id']
        num_tx = int(mask.sum())
        is_suspicious = bool(suspicious[i])
        if is_suspicious:
            transactions.loc[mask, 'ml_flag'] = np.random.choice([0, 1], num_tx, p=[0.9, 0.1])
        # Two candidate amounts (one small, one large) are drawn once per
        # entity; each of its transactions then picks one of the two.
        transactions.loc[mask, 'amount'] = np.random.choice(
            [np.random.uniform(5000, 50000), np.random.uniform(200000, 800000)],
            size=num_tx, p=[0.6, 0.4]
        )
        # Compress activity into the first three time steps.
        transactions.loc[mask, 'timestamp'] = np.random.randint(0, 3, num_tx)
        transactions.loc[mask, 'is_cross_border'] = np.random.choice([True, False], num_tx, p=[0.7, 0.3])
        if is_suspicious:
            transactions.loc[mask, 'high_risk_jurisdiction'] = np.random.choice([True, False], num_tx, p=[0.5, 0.5])
            transactions.loc[mask, 'flagged_receiver'] = np.random.choice([True, False], num_tx, p=[0.3, 0.7])

    # Round-trip pairs: sender -> intermediate -> sender, one and two steps
    # after the sampled flagged transaction. Rows are collected first and
    # appended once, avoiding a quadratic concat-in-a-loop.
    suspicious_transactions = transactions[transactions['ml_flag'] == 1].sample(frac=0.1)
    layering_rows = []
    for idx in suspicious_transactions.index:
        sender = transactions.loc[idx, 'sender_id']
        intermediate = np.random.choice(entities['entity_id'], 1)[0]
        amount = transactions.loc[idx, 'amount']
        timestamp = transactions.loc[idx, 'timestamp']
        layering_rows.append({
            'sender_id': sender, 'receiver_id': intermediate, 'amount': amount,
            'is_cross_border': False, 'timestamp': timestamp + 1,
            'ml_flag': 0, 'flagged_receiver': False, 'high_risk_jurisdiction': False
        })
        layering_rows.append({
            'sender_id': intermediate, 'receiver_id': sender, 'amount': amount,
            'is_cross_border': False, 'timestamp': timestamp + 2,
            'ml_flag': 0, 'flagged_receiver': False, 'high_risk_jurisdiction': False
        })
    if layering_rows:
        transactions = pd.concat([transactions, pd.DataFrame(layering_rows)], ignore_index=True)

    # Noise: a random 10% draw of entities (with replacement, so repeats are
    # possible) gets elevated cross-border rates and mid-range amounts
    # regardless of their label.
    normal_high_risk = np.random.choice(entities.index, size=int(0.1 * num_entities))
    for i in normal_high_risk:
        mask = transactions['sender_id'] == entities.loc[i, 'entity_id']
        num_tx = int(mask.sum())
        transactions.loc[mask, 'is_cross_border'] = np.random.choice([True, False], num_tx, p=[0.4, 0.6])
        transactions.loc[mask, 'amount'] = np.random.uniform(10000, 100000, num_tx)

    entities.to_csv('entities.csv', index=False)
    transactions.to_csv('transactions.csv', index=False)
    # Report the rows actually written; the round-trip pairs add to the
    # requested transaction count.
    print(f"Generated {len(entities)} entities and {len(transactions)} transactions")
    print(transactions['amount'].describe())
    print(transactions[['is_cross_border', 'ml_flag', 'flagged_receiver', 'high_risk_jurisdiction']].value_counts())
    print(entities[['kyc_risk_score', 'dormancy_period']].describe())
    print(f"Proportion of ml_flag = 1: {transactions['ml_flag'].mean():.4f}")
    return entities, transactions

# Keep both frames at notebook scope: later cells fall back to
# `entities['entity_id']` when a graph query fails.
entities, transactions = generate_synthetic_data()

Generated 5000 entities and 25000 transactions
count     25004.000000
mean      56483.247940
std       76459.887197
min         100.000000
25%       15833.933555
50%       36874.917196
75%       71979.541353
max      795934.101611
Name: amount, dtype: float64
is_cross_border  ml_flag  flagged_receiver  high_risk_jurisdiction
False            0        False             False                     20668
True             0        False             False                      2989
False            0        False             True                        650
                          True              False                       439
True             0        False             True                        120
                          True              False                        65
False            0        True              True                         36
True             0        True              True                         15
                 1        False             False                

In [None]:
# Cell 4: Clear and recreate Kuzu database
db_path = 'kuzu_db'
# Start from an empty store so the COPY statements below never append to old data.
# The on-disk database may be a directory or a single file, and shutil.rmtree()
# raises on a plain file, so both shapes are handled.
# NOTE(review): newer Kuzu releases appear to use a single file plus a `.wal`
# sidecar — confirm against the installed version.
for stale_path in (db_path, f"{db_path}.wal"):
    if os.path.isdir(stale_path):
        shutil.rmtree(stale_path)
    elif os.path.exists(stale_path):
        os.remove(stale_path)
print("Creating new Kuzu database")
db = kuzu.Database(db_path)
conn = kuzu.Connection(db)
# Schema: Entity nodes keyed by entity_id, Transaction relationships between entities.
conn.execute("CREATE NODE TABLE Entity(entity_id STRING, profile_type STRING, agent_id INT, kyc_risk_score DOUBLE, dormancy_period INT, PRIMARY KEY(entity_id))")
conn.execute("CREATE REL TABLE Transaction(FROM Entity TO Entity, amount DOUBLE, is_cross_border BOOLEAN, timestamp INT, ml_flag INT, flagged_receiver BOOLEAN, high_risk_jurisdiction BOOLEAN)")
# Bulk-load the CSV files written by the data-generation cell.
conn.execute("COPY Entity FROM 'entities.csv' (HEADER=TRUE, DELIM=',', QUOTE='\"')")
conn.execute("COPY Transaction FROM 'transactions.csv' (HEADER=TRUE, DELIM=',', QUOTE='\"')")

# Verify database
result = conn.execute("MATCH (e:Entity) RETURN COUNT(e)")
num_entities = result.get_next()[0]
result = conn.execute("MATCH ()-[t:Transaction]->() RETURN COUNT(t)")
num_transactions = result.get_next()[0]
print(f"Number of entities: {num_entities}")
print(f"Number of transactions: {num_transactions}")
# Fail fast: every later feature query would silently return nothing otherwise.
if num_entities == 0 or num_transactions == 0:
    raise ValueError("Database is empty.")


Creating new Kuzu database
Number of entities: 5000
Number of transactions: 25004


In [None]:
# Cell 5

# Query 1: Degree
# Counts outgoing Transaction relationships per entity. Entities without any
# outgoing transaction do not match the pattern and are absent from the result.
try:
    result = conn.execute("""
        MATCH (e:Entity)-[t:Transaction]->()
        RETURN e.entity_id, COUNT(t) AS degree
    """)
    degree_rows = []
    for _ in range(result.get_num_tuples()):
        degree_rows.append(result.get_next())
    degree_df = pd.DataFrame(degree_rows, columns=['entity_id', 'degree'])
    degree_numeric = pd.to_numeric(degree_df['degree'], errors='coerce')
    degree_df['degree'] = degree_numeric.fillna(0)
    print(f"Degree query returned {len(degree_df)} rows")
except Exception as exc:
    # Best-effort fallback: a zero degree for every id in the notebook-level `entities` frame.
    print(f"Error executing degree query: {exc}")
    degree_df = pd.DataFrame({'entity_id': entities['entity_id'], 'degree': 0})

# Query 3: Small transaction count
# Number of outgoing transactions below 1000 per entity; entities with none are absent.
try:
    result = conn.execute("""
        MATCH (e:Entity)-[t:Transaction]->()
        WHERE t.amount < 1000
        RETURN e.entity_id, COUNT(t) AS small_tx_count
    """)
    small_tx_rows = []
    for _ in range(result.get_num_tuples()):
        small_tx_rows.append(result.get_next())
    small_tx_df = pd.DataFrame(small_tx_rows, columns=['entity_id', 'small_tx_count'])
    small_tx_numeric = pd.to_numeric(small_tx_df['small_tx_count'], errors='coerce')
    small_tx_df['small_tx_count'] = small_tx_numeric.fillna(0)
    print(f"Small transaction query returned {len(small_tx_df)} rows")
except Exception as exc:
    # Best-effort fallback: zero small transactions for every known entity id.
    print(f"Error executing small transaction query: {exc}")
    small_tx_df = pd.DataFrame({'entity_id': entities['entity_id'], 'small_tx_count': 0})

# Query 4: Clustering coefficient
# The value stored as `clustering_coeff` is the number of distinct counterparties m
# for which both e -> m and m -> e transactions exist (a count, not a ratio).
try:
    result = conn.execute("""
        MATCH (e:Entity)-[t1:Transaction]->(m:Entity)-[t2:Transaction]->(n:Entity)
        WHERE n.entity_id = e.entity_id AND m.entity_id <> e.entity_id
        RETURN e.entity_id, COUNT(DISTINCT m) AS clustering_coeff
    """)
    cluster_rows = []
    for _ in range(result.get_num_tuples()):
        cluster_rows.append(result.get_next())
    cluster_df = pd.DataFrame(cluster_rows, columns=['entity_id', 'clustering_coeff'])
    cluster_numeric = pd.to_numeric(cluster_df['clustering_coeff'], errors='coerce')
    cluster_df['clustering_coeff'] = cluster_numeric.fillna(0)
    print(f"Clustering coefficient query returned {len(cluster_df)} rows")
except Exception as exc:
    # Best-effort fallback: zero for every known entity id.
    print(f"Error executing clustering coefficient query: {exc}")
    cluster_df = pd.DataFrame({'entity_id': entities['entity_id'], 'clustering_coeff': 0})

# Query 5: Transaction frequency variance
# The query yields one outgoing-transaction count per entity. The stored
# `tx_freq_variance` is the square root of that count's squared deviation from
# the mean count across entities, i.e. its absolute deviation from the mean.
try:
    result = conn.execute("""
        MATCH (e:Entity)-[t:Transaction]->()
        RETURN e.entity_id, COUNT(t) AS tx_count
    """)
    tx_count_rows = []
    for _ in range(result.get_num_tuples()):
        tx_count_rows.append(result.get_next())
    tx_count_df = pd.DataFrame(tx_count_rows, columns=['entity_id', 'tx_count'])
    tx_count_df['tx_count'] = pd.to_numeric(tx_count_df['tx_count'], errors='coerce').fillna(0)
    mean_tx_count = tx_count_df['tx_count'].mean()
    squared_deviation = (tx_count_df['tx_count'] - mean_tx_count) ** 2
    tx_count_df['tx_freq_variance'] = squared_deviation
    tx_freq_df = tx_count_df.groupby('entity_id')['tx_freq_variance'].mean().reset_index()
    tx_freq_df['tx_freq_variance'] = np.sqrt(tx_freq_df['tx_freq_variance']).fillna(0)
    print(f"Transaction frequency query returned {len(tx_count_df)} rows")
except Exception as exc:
    # Best-effort fallback: zero for every known entity id.
    print(f"Error computing tx frequency variance: {exc}")
    tx_freq_df = pd.DataFrame({'entity_id': entities['entity_id'], 'tx_freq_variance': 0})

# Query 6: Amount variance and skewness
def _amount_variance(amounts):
    """Sample variance (ddof=1) of a group's amounts; 0 for a single amount."""
    return np.var(amounts, ddof=1) if len(amounts) > 1 else 0


def _amount_skewness(amounts):
    """Third central moment over the cubed sample std (1e-7 guards a zero std)."""
    if len(amounts) > 1:
        return ((amounts - amounts.mean()) ** 3).mean() / (amounts.std(ddof=1) ** 3 + 1e-7)
    return 0


def _high_value_ratio(amounts):
    """Share of a group's amounts above 100000."""
    return (amounts > 100000).sum() / len(amounts) if len(amounts) > 0 else 0


try:
    result = conn.execute("""
        MATCH (e:Entity)-[t:Transaction]->()
        RETURN e.entity_id, t.amount AS amount
    """)
    amount_rows = []
    for _ in range(result.get_num_tuples()):
        amount_rows.append(result.get_next())
    amount_df = pd.DataFrame(amount_rows, columns=['entity_id', 'amount'])
    amount_df['amount'] = pd.to_numeric(amount_df['amount'], errors='coerce')
    # (output column name, aggregation) pairs applied to each sender's amounts.
    amount_aggregations = [
        ('avg_amount', 'mean'),
        ('amount_variance', _amount_variance),
        ('amount_skewness', _amount_skewness),
        ('high_value_ratio', _high_value_ratio),
    ]
    var_skew_df = (
        amount_df.groupby('entity_id')
        .agg({'amount': amount_aggregations})
        .droplevel(0, axis=1)
        .reset_index()
    )
    amount_stat_cols = ['amount_variance', 'amount_skewness', 'high_value_ratio']
    var_skew_df[amount_stat_cols] = var_skew_df[amount_stat_cols].fillna(0)
except Exception as exc:
    # Best-effort fallback: zeros for every known entity id.
    print(f"Error computing amount variance/skewness: {exc}")
    var_skew_df = pd.DataFrame({'entity_id': entities['entity_id'], 'avg_amount': 0, 'amount_variance': 0, 'amount_skewness': 0, 'high_value_ratio': 0})

# Query 7: Transaction velocity, burstiness, and temporal concentration
def _tx_velocity(timestamps):
    """Transactions per unit of time span.

    The span is only padded by 1e-7, so a group whose timestamps are all equal
    yields a value on the order of ``len(timestamps) * 1e7``.
    """
    return len(timestamps) / (timestamps.max() - timestamps.min() + 1e-7)


def _burstiness(timestamps):
    """Ten times the sample variance of the timestamps; 0 for a single timestamp."""
    return np.var(timestamps, ddof=1) * 10 if len(timestamps) > 1 else 0


def _temporal_concentration(timestamps):
    """1.0 when the whole group spans fewer than 3 time steps, else 0.0."""
    return np.mean((timestamps.max() - timestamps.min()) < 3) if len(timestamps) > 0 else 0


try:
    result = conn.execute("""
        MATCH (e:Entity)-[t:Transaction]->()
        RETURN e.entity_id, t.timestamp AS timestamp
    """)
    time_rows = []
    for _ in range(result.get_num_tuples()):
        time_rows.append(result.get_next())
    time_df = pd.DataFrame(time_rows, columns=['entity_id', 'timestamp'])
    time_df['timestamp'] = pd.to_numeric(time_df['timestamp'], errors='coerce')
    # (output column name, aggregation) pairs applied to each sender's timestamps.
    temporal_aggregations = [
        ('tx_velocity', _tx_velocity),
        ('burstiness', _burstiness),
        ('temporal_concentration', _temporal_concentration),
    ]
    time_agg_df = (
        time_df.groupby('entity_id')
        .agg({'timestamp': temporal_aggregations})
        .droplevel(0, axis=1)
        .reset_index()
    )
    temporal_cols = ['tx_velocity', 'burstiness', 'temporal_concentration']
    time_agg_df[temporal_cols] = time_agg_df[temporal_cols].fillna(0)
except Exception as exc:
    # Best-effort fallback: zeros for every known entity id.
    print(f"Error computing temporal features: {exc}")
    time_agg_df = pd.DataFrame({'entity_id': entities['entity_id'], 'tx_velocity': 0, 'burstiness': 0, 'temporal_concentration': 0})

# Query 8: Additional features
# Outgoing and incoming transactions are aggregated in separate queries.
# Chaining two OPTIONAL MATCH clauses on the same entity pairs every outgoing
# row with every incoming row before aggregation, which multiplies both counts
# and the cross-border sum and forces the directionality ratio towards 0.5.
def _fetch_frame(query, columns):
    """Run `query` on the notebook-level Kuzu connection and return all rows as a DataFrame."""
    query_result = conn.execute(query)
    rows = [query_result.get_next() for _ in range(query_result.get_num_tuples())]
    return pd.DataFrame(rows, columns=columns)


try:
    entity_attr_df = _fetch_frame("""
        MATCH (e:Entity)
        RETURN e.entity_id,
               e.kyc_risk_score AS kyc_risk_score,
               e.dormancy_period AS dormancy_period
    """, ['entity_id', 'kyc_risk_score', 'dormancy_period'])
    outgoing_df = _fetch_frame("""
        MATCH (e:Entity)-[t:Transaction]->()
        RETURN e.entity_id,
               SUM(CASE WHEN t.is_cross_border THEN t.amount ELSE 0 END) AS cross_border_amount,
               COUNT(t) AS tx_frequency
    """, ['entity_id', 'cross_border_amount', 'tx_frequency'])
    incoming_df = _fetch_frame("""
        MATCH ()-[t_in:Transaction]->(e:Entity)
        RETURN e.entity_id, COUNT(t_in) AS in_degree
    """, ['entity_id', 'in_degree'])

    # Left joins keep every entity; entities with no transactions in a
    # direction get NaN here and 0 after the fill below.
    features_df = (
        entity_attr_df
        .merge(outgoing_df, on='entity_id', how='left')
        .merge(incoming_df, on='entity_id', how='left')
    )
    activity_cols = ['cross_border_amount', 'in_degree', 'tx_frequency']
    features_df[activity_cols] = features_df[activity_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
    # Share of an entity's transactions that are incoming (1e-7 avoids 0/0).
    features_df['directionality_ratio'] = features_df['in_degree'] / (
        features_df['in_degree'] + features_df['tx_frequency'] + 1e-7
    )
    numeric_cols = ['kyc_risk_score', 'dormancy_period', 'cross_border_amount', 'in_degree', 'tx_frequency', 'directionality_ratio']
    features_df[numeric_cols] = features_df[numeric_cols].apply(pd.to_numeric, errors='coerce', downcast='float').fillna(0)
    # Same column order as the original single-query result.
    features_df = features_df[['entity_id'] + numeric_cols]
except Exception as e:
    # Best-effort fallback: zeros for every known entity id.
    print(f"Error executing features query: {e}")
    features_df = pd.DataFrame({'entity_id': entities['entity_id'], 'kyc_risk_score': 0, 'dormancy_period': 0, 'cross_border_amount': 0, 'in_degree': 0, 'tx_frequency': 0, 'directionality_ratio': 0})

# Query 9: Round-tripping (multi-hop)
# Per entity, the number of distinct outgoing transactions e -> m for which
# the counterparty m also has a transaction back to e.
try:
    result = conn.execute("""
        MATCH (e:Entity)-[t1:Transaction]->(m:Entity)-[t2:Transaction]->(e2:Entity)
        WHERE e2.entity_id = e.entity_id AND m.entity_id <> e.entity_id
        RETURN e.entity_id, COUNT(DISTINCT t1) AS round_trip_count
    """)
    round_trip_rows = []
    for _ in range(result.get_num_tuples()):
        round_trip_rows.append(result.get_next())
    round_trip_df = pd.DataFrame(round_trip_rows, columns=['entity_id', 'round_trip_count'])
    round_trip_numeric = pd.to_numeric(round_trip_df['round_trip_count'], errors='coerce')
    round_trip_df['round_trip_count'] = round_trip_numeric.fillna(0)
except Exception as exc:
    # Best-effort fallback: zero round trips for every known entity id.
    print(f"Error computing round-trip count: {exc}")
    round_trip_df = pd.DataFrame({'entity_id': entities['entity_id'], 'round_trip_count': 0})

Degree query returned 3534 rows
Small transaction query returned 337 rows
Clustering coefficient query returned 3 rows
Transaction frequency query returned 3534 rows


In [None]:
# Cell 6: Training Prep

from sklearn.metrics import roc_curve
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Merge features
# `degree_df` is the left-most frame, so only entity ids present in it survive
# the chain of left joins.
data_df = degree_df
for feature_frame in (small_tx_df, var_skew_df, cluster_df, tx_freq_df, time_agg_df, features_df, round_trip_df):
    data_df = data_df.merge(feature_frame, on='entity_id', how='left')

# Entities missing from a feature frame get 0 for that frame's listed columns.
zero_filled_columns = [
    'small_tx_count', 'avg_amount', 'amount_variance', 'amount_skewness',
    'clustering_coeff', 'tx_freq_variance', 'tx_velocity', 'burstiness',
    'temporal_concentration', 'kyc_risk_score', 'dormancy_period',
    'cross_border_amount', 'in_degree', 'tx_frequency', 'directionality_ratio',
    'round_trip_count',
]
data_df = data_df.fillna(dict.fromkeys(zero_filled_columns, 0))

# Diagnostics: preview, remaining NaNs, and dtypes of the merged table.
print("Merged data_df:")
print(data_df.head())
print("NaN counts:")
print(data_df.isna().sum())
print("Data types:")
print(data_df.dtypes)

# Compute ml_flag and labels
# ml_flag_score is the share of an entity's outgoing transactions with ml_flag = 1.
try:
    result = conn.execute("""
        MATCH (e:Entity)-[t:Transaction]->()
        RETURN e.entity_id, AVG(t.ml_flag) AS ml_flag_score
    """)
    ml_flag_df = pd.DataFrame([result.get_next() for _ in range(result.get_num_tuples())],
                             columns=['entity_id', 'ml_flag_score'])
    ml_flag_df['ml_flag_score'] = pd.to_numeric(ml_flag_df['ml_flag_score'], errors='coerce').fillna(0)
    data_df = data_df.merge(ml_flag_df, on='entity_id', how='left').fillna({'ml_flag_score': 0})
    # Set initial is_anomaly based on non-zero ml_flag_score
    data_df['is_anomaly'] = (data_df['ml_flag_score'] > 0).astype(int)
    # Use ROC curve to optimize threshold if there are enough positive samples
    if data_df['is_anomaly'].sum() > 1:  # Ensure enough positives for ROC
        fpr, tpr, thresholds = roc_curve(data_df['is_anomaly'], data_df['ml_flag_score'])
        anomaly_threshold = thresholds[np.argmax(tpr - fpr)]  # Youden's J
        # roc_curve treats a sample as positive when score >= threshold, so the
        # cut must be inclusive. A strict `>` would relabel the entities sitting
        # exactly on the chosen threshold as normal.
        data_df['is_anomaly'] = (data_df['ml_flag_score'] >= anomaly_threshold).astype(int)
    else:
        print("Warning: Too few positive samples for ROC; using 95th percentile threshold")
        anomaly_threshold = np.percentile(data_df['ml_flag_score'], 95)
        # Strict comparison on purpose: a percentile of 0 must not mark zero-score rows.
        data_df['is_anomaly'] = (data_df['ml_flag_score'] > anomaly_threshold).astype(int)
except Exception as e:
    # Best-effort fallback: no scores and no anomalies.
    print(f"Error computing ml_flag_score: {e}")
    data_df['ml_flag_score'] = np.zeros(len(data_df), dtype=float)
    data_df['is_anomaly'] = np.zeros(len(data_df), dtype=int)

# Feature selection
feature_columns = ['degree', 'tx_velocity', 'kyc_risk_score', 'cross_border_amount', 'in_degree', 'directionality_ratio', 'round_trip_count']
X = data_df[feature_columns].values

# Verify X
if not np.all(np.isfinite(X)):
    print("Warning: X contains non-finite values")
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

# Split for held-out test set
# Indices are drawn before scaling so the scaler can be fit on training rows
# only. The seed is reset immediately before the draw, so the split itself
# does not depend on where the scaling happens.
np.random.seed(42)
test_size = 0.2
test_idx = np.random.choice(len(X), size=int(test_size * len(X)), replace=False)
train_idx = np.setdiff1d(np.arange(len(X)), test_idx)

# Standard scaling
# Fit on the training rows only, then transform every row with those
# statistics, so test-set means and variances do not leak into preprocessing.
scaler = StandardScaler()
scaler.fit(X[train_idx])
X = scaler.transform(X)
print("Any NaN in X:", np.any(np.isnan(X)))
print("Any inf in X:", np.any(np.isinf(X)))

X_train, X_test = X[train_idx], X[test_idx]
y_test = data_df['is_anomaly'].iloc[test_idx]


In [None]:
# Cell 7: Train

from sklearn.metrics import precision_recall_curve, auc, roc_curve
from sklearn.model_selection import StratifiedKFold
import numpy as np
import torch

# Autoencoder training on normal data
# Positions are relative to the training split, so they index X_train directly.
train_labels = data_df.iloc[train_idx]['is_anomaly']
normal_positions = np.where(train_labels == 0)[0]
# Train on a random half of the normal training rows.
normal_idx = np.random.choice(normal_positions, size=int(0.5 * len(normal_positions)), replace=False)
X_normal = X_train[normal_idx]
X_normal_tensor = torch.FloatTensor(X_normal)
input_dim = X.shape[1]
# Seed torch as well: NumPy's seed does not cover weight initialisation or
# dropout masks, so results would otherwise change on every run.
torch.manual_seed(42)
autoencoder = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.0005, weight_decay=1e-4)

# Training with early stopping
# The validation rows come from normal training rows that were NOT selected
# for fitting (`normal_idx`). Otherwise the early-stopping signal is measured
# partly on data the model has already seen.
train_normal_positions = np.where(data_df.iloc[train_idx]['is_anomaly'] == 0)[0]
val_pool = np.setdiff1d(train_normal_positions, normal_idx)
# Cap the sample at the pool size so sampling without replacement cannot fail.
val_size = min(int(0.1 * len(X_train)), len(val_pool))
X_val = X_train[np.random.choice(val_pool, size=val_size, replace=False)]
X_val_tensor = torch.FloatTensor(X_val)
best_loss = float('inf')
patience, max_patience = 0, 10
for epoch in range(300):  # Increased epochs
    # One full-batch gradient step per epoch.
    autoencoder.train()
    optimizer.zero_grad()
    outputs = autoencoder(X_normal_tensor)
    loss = criterion(outputs, X_normal_tensor)
    loss.backward()
    optimizer.step()
    # Validation loss in eval mode (dropout disabled).
    autoencoder.eval()
    with torch.no_grad():
        val_outputs = autoencoder(X_val_tensor)
        val_loss = criterion(val_outputs, X_val_tensor)
    if val_loss < best_loss:
        best_loss = val_loss
        patience = 0
    else:
        patience += 1
        # Stop after `max_patience` consecutive epochs without improvement.
        if patience >= max_patience:
            print(f"Early stopping at epoch {epoch}")
            break
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

# Compute reconstruction errors
autoencoder.eval()
with torch.no_grad():
    X_tensor = torch.FloatTensor(X)
    reconstructed = autoencoder(X_tensor)
    # Per-row mean squared reconstruction error.
    reconstruction_errors = torch.mean((reconstructed - X_tensor) ** 2, dim=1).numpy()
# Min-max rescale to [0, 2000]; this is monotone, so rankings are unchanged.
anomaly_scores = 2000 * (reconstruction_errors - reconstruction_errors.min()) / (reconstruction_errors.max() - reconstruction_errors.min() + 1e-7)
data_df['anomaly_score'] = anomaly_scores

# Pick the decision threshold on the TRAINING split only. Tuning it on the
# test labels and then scoring the same test rows would report optimistic
# precision/recall/F1.
y_train_labels = data_df['is_anomaly'].iloc[train_idx]
if y_train_labels.nunique() > 1:
    fpr, tpr, thresholds = roc_curve(y_train_labels, anomaly_scores[train_idx])
    optimal_threshold = thresholds[np.argmax(tpr - fpr)]  # Youden's J
else:
    # ROC is undefined with a single class; fall back to the 95th percentile.
    print("Warning: Training split has a single class; using 95th percentile threshold")
    optimal_threshold = np.percentile(anomaly_scores[train_idx], 95)
# roc_curve counts score >= threshold as positive, so the cut is inclusive.
data_df['predicted_anomaly'] = (anomaly_scores >= optimal_threshold).astype(int)

# Evaluate on test set
# Thresholded metrics use `predicted_anomaly`; AUPRC uses the raw anomaly scores.
if 'is_anomaly' in data_df.columns:
    test_predictions = data_df.iloc[test_idx]['predicted_anomaly']
    test_scores = anomaly_scores[test_idx]
    precision = precision_score(y_test, test_predictions, zero_division=0)
    recall = recall_score(y_test, test_predictions, zero_division=0)
    f1 = f1_score(y_test, test_predictions, zero_division=0)
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, test_scores)
    auprc = auc(recall_curve, precision_curve)
    test_metrics = (
        ("Precision", precision),
        ("Recall", recall),
        ("F1 Score", f1),
        ("AUPRC", auprc),
    )
    for metric_name, metric_value in test_metrics:
        print(f"Test {metric_name}: {metric_value:.4f}")

# K-fold cross-validation
# A fresh autoencoder is trained on the normal rows of each training fold and
# scored on the held-out fold. The threshold is tuned on the same fold that is
# scored, so the thresholded metrics (precision/recall/F1) are optimistic;
# AUPRC does not depend on the threshold.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Reduced to 5 folds
fold = 1
precisions, recalls, f1_scores, auprcs = [], [], [], []
mean_anomaly_scores = []

for train_idx_fold, val_idx in kf.split(X_train, data_df.iloc[train_idx]['is_anomaly']):
    print(f"\nFold {fold}")
    X_train_fold, X_val = X_train[train_idx_fold], X_train[val_idx]
    # Fold indices are positions within X_train; map them back through
    # train_idx to reach the matching data_df rows.
    y_val = data_df['is_anomaly'].iloc[train_idx[val_idx]]
    train_normal_idx = np.where(data_df.iloc[train_idx[train_idx_fold]]['is_anomaly'] == 0)[0]
    X_train_normal = X_train_fold[train_normal_idx]
    X_train_normal_tensor = torch.FloatTensor(X_train_normal)
    X_val_tensor = torch.FloatTensor(X_val)

    autoencoder = Autoencoder(input_dim)
    optimizer = optim.Adam(autoencoder.parameters(), lr=0.0005, weight_decay=1e-4)
    best_loss = float('inf')
    patience = 0
    for epoch in range(300):
        # One full-batch gradient step per epoch.
        autoencoder.train()
        optimizer.zero_grad()
        outputs = autoencoder(X_train_normal_tensor)
        loss = criterion(outputs, X_train_normal_tensor)
        loss.backward()
        optimizer.step()
        autoencoder.eval()
        with torch.no_grad():
            val_outputs = autoencoder(X_val_tensor)
            val_loss = criterion(val_outputs, X_val_tensor)
        if val_loss < best_loss:
            best_loss = val_loss
            patience = 0
        else:
            patience += 1
            if patience >= max_patience:
                print(f"Early stopping at epoch {epoch}")
                break

    autoencoder.eval()
    with torch.no_grad():
        reconstructed_val = autoencoder(X_val_tensor)
        val_errors = torch.mean((reconstructed_val - X_val_tensor) ** 2, dim=1).numpy()
    # Per-fold min-max rescale to [0, 2000].
    val_scores = 2000 * (val_errors - val_errors.min()) / (val_errors.max() - val_errors.min() + 1e-7)
    if y_val.sum() > 0:  # Check for positive samples
        fpr, tpr, thresholds = roc_curve(y_val, val_scores)
        optimal_threshold = thresholds[np.argmax(tpr - fpr)]  # Youden's J
    else:
        print(f"Warning: No positive samples in Fold {fold}; using 95th percentile threshold")
        optimal_threshold = np.percentile(val_scores, 95)
    # roc_curve counts score >= threshold as positive. A strict `>` would drop
    # the sample sitting exactly on the chosen operating point.
    val_predictions = (val_scores >= optimal_threshold).astype(int)

    precision = precision_score(y_val, val_predictions, zero_division=0)
    recall = recall_score(y_val, val_predictions, zero_division=0)
    f1 = f1_score(y_val, val_predictions, zero_division=0)
    precision_curve, recall_curve, _ = precision_recall_curve(y_val, val_scores)
    auprc = auc(recall_curve, precision_curve)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    auprcs.append(auprc)
    mean_anomaly_scores.append(np.mean(val_scores))

    print(f"Fold {fold} Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUPRC: {auprc:.4f}")
    print(f"Fold {fold} Mean Anomaly Score: {np.mean(val_scores):.4f}")
    print(f"Validation Anomaly Scores (first 10): {val_scores[:10]}")
    fold += 1
    
# Average metrics
# Unweighted mean of each per-fold metric collected during cross-validation.
cv_summary = (
    ("\nAverage Precision", precisions),
    ("Average Recall", recalls),
    ("Average F1 Score", f1_scores),
    ("Average AUPRC", auprcs),
    ("Average Mean Anomaly Score", mean_anomaly_scores),
)
for summary_label, fold_values in cv_summary:
    print(f"{summary_label}: {np.mean(fold_values):.4f}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Plot anomaly score distribution with y-axis clipping
plt.figure(figsize=(8, 5))
scores = data_df['anomaly_score']
# Separate name so the full `anomaly_scores` array from the training cell is not overwritten.
true_anomaly_scores = data_df.loc[data_df['is_anomaly'] == 1, 'anomaly_score']
hist, bins = np.histogram(scores, bins=30)  # Increased bins for finer detail
y_max = np.percentile(hist, 95)  # Clip y-axis at 95th percentile
# Both histograms share the same bin edges, so bar heights are comparable bin by bin.
plt.hist(scores, bins=bins, alpha=0.5, label='All Scores')
plt.hist(true_anomaly_scores, bins=bins, alpha=0.5, label='True Anomalies')
# Skip clipping when the percentile is 0, which would collapse the axis.
if y_max > 0:
    plt.ylim(0, y_max)
plt.title('Anomaly Score Distribution (Y-Axis Clipped at 95th Percentile)')
plt.xlabel('Anomaly Score')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Plot ROC curve
# Computed over every row of data_df, not only the held-out test rows.
fpr, tpr, _ = roc_curve(data_df['is_anomaly'], data_df['anomaly_score'])
roc_auc = auc(fpr, tpr)
roc_fig, roc_ax = plt.subplots(figsize=(8, 5))
roc_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
plt.show()

# Feature correlations
# Correlation of each model input with the anomaly score.
print("Feature correlations with anomaly score:")
score_corr_matrix = data_df[feature_columns + ['anomaly_score']].corr()
correlations = score_corr_matrix['anomaly_score'].drop('anomaly_score')
print(correlations)

# Save results
data_df.to_csv('anomaly_results.csv', index=False)

# Check for data leakage
# Compares per-label means of three columns.
print("Feature means by anomaly label:")
leakage_check_columns = ['cross_border_amount', 'kyc_risk_score', 'dormancy_period']
print(data_df.groupby('is_anomaly')[leakage_check_columns].mean())