In [5]:
# anomaly_config.py
import os

CONFIG = {
    # "DB_CONNECTION_STRING": os.getenv("DB_CONNECTION_STRING", "sqlite:///trades.db"),  # Removed or commented out
    "BATCH_DATE": os.getenv("BATCH_DATE", "2024-12-11"),
    "EIF_PARAMS": {
        "ntrees": 100,
        "sample_size": 256,
        "extension_level": 1,  # Corrected to match 'ExtensionLevel' in eif
        "contamination": 0.01  # 1% anomalies
    },
    "DEEPIF_PARAMS": {
        "latent_dim": 2,
        "epochs": 50,
        "batch_size": 32,
        "learning_rate": 0.001,
        "if_n_estimators": 100,
        "if_contamination": 0.01
    },
    "FEATURE_COLUMNS": ["our_cents", "cp_cents", "notional", "impact_dollars", "product_type", "counterparty"],
    "LOG_LEVEL": "INFO",
    "RESULT_TABLE": "trade_anomaly_detection_results",  # Can be removed if not using
    "SAMPLE_DATA_SIZE": 1000  # Number of samples for synthetic data
}

In [6]:
# data_generation.py
import numpy as np
import pandas as pd

def generate_synthetic_data(size=1000, random_state=42):
    np.random.seed(random_state)
    
    # Numerical features
    our_cents = np.random.normal(loc=100, scale=10, size=size)
    cp_cents = our_cents + np.random.normal(loc=0, scale=5, size=size)  # Slight variations
    
    # Introduce anomalies
    n_anomalies = int(size * 0.01)  # 1% anomalies
    anomaly_indices = np.random.choice(size, n_anomalies, replace=False)
    our_cents[anomaly_indices] += np.random.normal(loc=50, scale=10, size=n_anomalies)  # Large deviation
    
    notional = np.random.uniform(10000, 1000000, size=size)
    impact_dollars = np.abs(our_cents - cp_cents) * notional / 100.0
    
    # Categorical features
    product_types = ['swaption', 'vanilla_swap', 'option', 'forward']
    counterparties = ['CP_A', 'CP_B', 'CP_C', 'CP_D']
    
    product_type = np.random.choice(product_types, size=size)
    counterparty = np.random.choice(counterparties, size=size)
    
    # Create DataFrame
    data = pd.DataFrame({
        'trade_id': range(1, size + 1),
        'our_cents': our_cents,
        'cp_cents': cp_cents,
        'notional': notional,
        'impact_dollars': impact_dollars,
        'product_type': product_type,
        'counterparty': counterparty
    })
    
    return data

In [7]:
# data_preprocessing.py
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def preprocess_data(df, feature_columns):
    # Separate features
    X = df[feature_columns]
    
    # Define categorical and numerical columns
    categorical_cols = ['product_type', 'counterparty']
    numerical_cols = ['our_cents', 'cp_cents', 'notional', 'impact_dollars']
    
    # Preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )
    
    # Fit and transform
    X_processed = preprocessor.fit_transform(X)
    
    # Get feature names after encoding
    ohe = preprocessor.named_transformers_['cat']['onehot']
    cat_features = ohe.get_feature_names_out(categorical_cols)
    all_features = numerical_cols + list(cat_features)
    
    X_processed_df = pd.DataFrame(X_processed, columns=all_features)
    
    return X_processed_df, preprocessor

In [8]:
# eif_model.py
import numpy as np
import pandas as pd
from eif import iForest
import logging

def run_eif(df, feature_columns, eif_params):
    """
    Apply Extended Isolation Forest (EIF) for anomaly detection.
    """
    # Log available columns
    logging.debug(f"Available columns in EIF DataFrame: {df.columns.tolist()}")
    logging.debug(f"Expected feature columns: {feature_columns}")
    
    # Extract features
    X = df[feature_columns].values
    
    # Initialize EIF with corrected parameter names and pass X as the first argument
    try:
        model = iForest(
            X,  # Pass X as the first positional argument
            ntrees=eif_params['ntrees'],
            sample_size=eif_params['sample_size'],
            ExtensionLevel=eif_params['extension_level']  # Corrected parameter name
        )
    except TypeError as te:
        logging.error(f"TypeError during EIF model initialization: {te}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error during EIF model initialization: {e}")
        raise
    
    # Compute anomaly scores (lower scores are more anomalous)
    try:
        scores = model.compute_paths(X_in=X)
    except Exception as e:
        logging.error(f"Error computing anomaly scores: {e}")
        raise
    
    # Determine threshold based on contamination
    try:
        threshold = np.percentile(scores, eif_params['contamination'] * 100)
    except Exception as e:
        logging.error(f"Error calculating threshold: {e}")
        raise
    
    # Label anomalies
    labels = scores < threshold  # True if anomaly
    
    # Add to DataFrame
    df['eif_anomaly_label'] = labels
    df['eif_anomaly_score'] = scores
    
    logging.info(f"EIF: Detected {labels.sum()} anomalies out of {len(labels)} trades.")
    
    return df

ModuleNotFoundError: No module named 'eif'

In [None]:
# deepif_model.py
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import logging
import shap

# Define the Autoencoder in PyTorch
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, latent_dim)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return reconstructed, latent

def train_autoencoder(model, dataloader, epochs, lr, device):
    """
    Train the autoencoder model.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    
    model.to(device)
    model.train()
    
    for epoch in range(epochs):
        epoch_loss = 0.0
        for batch in dataloader:
            data = batch[0].to(device)
            optimizer.zero_grad()
            reconstructed, _ = model(data)
            loss = criterion(reconstructed, data)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        logging.info(f"DeepIF: Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")
    
    return model

def run_deepif(df, feature_columns, deepif_params, preprocessor):
    """
    Apply Deep Isolation Forest (DeepIF) for anomaly detection.
    """
    # Extract features
    X = df[feature_columns].values
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Convert to torch tensor
    X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
    
    # Create DataLoader
    dataset = TensorDataset(X_tensor)
    dataloader = DataLoader(dataset, batch_size=deepif_params['batch_size'], shuffle=True)
    
    # Define autoencoder
    input_dim = X_scaled.shape[1]
    latent_dim = deepif_params['latent_dim']
    autoencoder = Autoencoder(input_dim, latent_dim)
    
    # Train autoencoder
    autoencoder = train_autoencoder(
        model=autoencoder,
        dataloader=dataloader,
        epochs=deepif_params['epochs'],
        lr=deepif_params['learning_rate'],
        device='cpu'  # Change to 'cuda' if GPU is available
    )
    
    # Extract latent embeddings
    autoencoder.eval()
    with torch.no_grad():
        latent_embeddings = autoencoder.encoder(X_tensor)  # Corrected line
    latent_embeddings = latent_embeddings.numpy()
    
    # Train Isolation Forest on latent embeddings
    isolation_forest = IsolationForest(
        n_estimators=deepif_params['if_n_estimators'],
        contamination=deepif_params['if_contamination'],
        random_state=42
    )
    isolation_forest.fit(latent_embeddings)
    
    # Predict anomalies
    anomaly_scores = isolation_forest.decision_function(latent_embeddings)
    anomaly_labels = isolation_forest.predict(latent_embeddings)  # -1 = anomaly, 1 = normal
    
    # Add results to DataFrame
    df['deepif_anomaly_score'] = anomaly_scores
    df['deepif_anomaly_label'] = (anomaly_labels == -1)
    
    logging.info(f"DeepIF: Detected {df['deepif_anomaly_label'].sum()} anomalies out of {len(df)} trades.")
    
    # Optional: Interpretability with SHAP
    # Note: SHAP explanations for autoencoder can be complex. Here's a basic example.
    try:
        explainer = shap.Explainer(autoencoder.encoder, X_scaled)
        shap_values = explainer(X_scaled[:100])  # Explain first 100 instances for speed
        shap.summary_plot(shap_values, X_scaled[:100], feature_names=preprocessor.get_feature_names_out())
    except Exception as e:
        logging.error(f"SHAP explainability failed: {e}")
    
    return df, isolation_forest

In [None]:
# combine_results.py
import pandas as pd
import logging

def combine_results(df_eif, df_deepif, original_df):
    """
    Combine EIF and DeepIF results with the original DataFrame.
    Strategies:
    - Logical AND: Flag as anomaly only if both models agree.
    - Average Scores: Combine anomaly scores for ranking.
    """
    # Ensure the DataFrames have the same index
    if not (df_eif.index.equals(df_deepif.index) and df_eif.index.equals(original_df.index)):
        logging.error("DataFrames do not have matching indices. Cannot combine results.")
        raise ValueError("DataFrames indices mismatch.")
    
    # Merge anomaly labels and scores
    combined_df = original_df.copy()
    combined_df['eif_anomaly_label'] = df_eif['eif_anomaly_label']
    combined_df['eif_anomaly_score'] = df_eif['eif_anomaly_score']
    combined_df['deepif_anomaly_label'] = df_deepif['deepif_anomaly_label']
    combined_df['deepif_anomaly_score'] = df_deepif['deepif_anomaly_score']
    
    # Logical AND
    combined_df['combined_anomaly_label'] = combined_df['eif_anomaly_label'] & combined_df['deepif_anomaly_label']
    
    # Average Scores (assuming lower scores indicate anomalies)
    combined_df['combined_anomaly_score'] = (combined_df['eif_anomaly_score'] + combined_df['deepif_anomaly_score']) / 2.0
    
    logging.info(f"Combined Results: {combined_df['combined_anomaly_label'].sum()} anomalies detected.")
    
    return combined_df

In [None]:
# interpretability.py
import shap
import matplotlib.pyplot as plt

def explain_anomalies(model, df, feature_columns, preprocessor, top_n=10):
    """
    Use SHAP to explain the top N anomalies.
    """
    # Select top N anomalies based on combined score
    anomalies = df[df['combined_anomaly_label']].sort_values('combined_anomaly_score').head(top_n)
    if anomalies.empty:
        print("No anomalies to explain.")
        return
    
    # Preprocess the data as done before
    X = anomalies[feature_columns].values
    X_processed = preprocessor.transform(anomalies[feature_columns])
    
    # Create a DataFrame with processed features
    feature_names = preprocessor.get_feature_names_out()
    X_processed_df = pd.DataFrame(X_processed, columns=feature_names)
    
    # Initialize SHAP
    explainer = shap.Explainer(model.encoder, X_processed_df)
    shap_values = explainer(X_processed_df)
    
    # Plot SHAP summary
    shap.summary_plot(shap_values, X_processed_df, feature_names=feature_names)

In [None]:
# store_results.py
import pandas as pd
import logging

def store_results(df, config):
    """
    Store anomaly detection results by exporting to a CSV file.
    """
    # Define the output CSV file path
    output_file = 'trade_anomaly_detection_results.csv'
    
    try:
        # Export the DataFrame to CSV
        df.to_csv(output_file, index=False)
        logging.info(f"Results successfully exported to '{output_file}'.")
    except Exception as e:
        logging.error(f"Failed to export results to CSV: {e}")
        raise

In [None]:
# # main.py
# import logging
# import pandas as pd
# from anomaly_config import CONFIG  # Updated import
# from data_generation import generate_synthetic_data
# from data_preprocessing import preprocess_data
# from eif_model import run_eif
# from deepif_model import run_deepif
# from combine_results import combine_results
# from store_results import store_results

def main():
    # Setup Logging
    logging.basicConfig(
        level=getattr(logging, CONFIG['LOG_LEVEL']),
        format='%(asctime)s [%(levelname)s] %(message)s'
    )
    
    # Generate Synthetic Data (Replace this with actual data loading if needed)
    logging.info("Generating synthetic data...")
    data = generate_synthetic_data(size=CONFIG['SAMPLE_DATA_SIZE'])
    
    # Display first few rows of the synthetic data
    print(data.head())
    
    # Preprocess Data
    logging.info("Preprocessing data...")
    X_processed_df, preprocessor = preprocess_data(data, CONFIG['FEATURE_COLUMNS'])
    
    # Run Extended Isolation Forest (EIF) on preprocessed features
    logging.info("Running Extended Isolation Forest (EIF)...")
    processed_eif_df = run_eif(
        df=X_processed_df.copy(),
        feature_columns=X_processed_df.columns.tolist(),
        eif_params=CONFIG['EIF_PARAMS']
    )
    
    # Run Deep Isolation Forest (DeepIF) on preprocessed features
    logging.info("Running Deep Isolation Forest (DeepIF)...")
    processed_deepif_df, deepif_model = run_deepif(
        df=X_processed_df.copy(),
        feature_columns=X_processed_df.columns.tolist(),
        deepif_params=CONFIG['DEEPIF_PARAMS'],
        preprocessor=preprocessor
    )
    
    # Combine Results
    logging.info("Combining anomaly detection results...")
    combined_df = combine_results(
        df_eif=processed_eif_df,
        df_deepif=processed_deepif_df,
        original_df=data
    )
    
    # Optional: Explain anomalies (Top N)
    # Uncomment the following lines if you want to generate SHAP explanations
    # from interpretability import explain_anomalies
    # explain_anomalies(deepif_model, combined_df, preprocessor, top_n=10)
    
    # Store Results
    logging.info("Storing results...")
    store_results(combined_df, CONFIG)
    
    logging.info("Anomaly detection pipeline completed successfully.")

if __name__ == "__main__":
    main()

2024-12-12 12:23:17,107 [INFO] Generating synthetic data...
2024-12-12 12:23:17,112 [INFO] Preprocessing data...
2024-12-12 12:23:17,117 [INFO] Running Extended Isolation Forest (EIF)...


   trade_id   our_cents    cp_cents       notional  impact_dollars  \
0         1  104.967142  111.963919   31920.060718     2233.375525   
1         2   98.617357  103.240525  945663.046868    43719.595291   
2         3  106.476885  106.775037  699660.998505     2086.052208   
3         4  115.230299  111.995615  500746.712730    16197.573239   
4         5   97.658466  101.149583  933795.962064    32599.905544   

   product_type counterparty  
0      swaption         CP_B  
1  vanilla_swap         CP_A  
2      swaption         CP_A  
3  vanilla_swap         CP_A  
4      swaption         CP_C  


2024-12-12 12:23:18,008 [INFO] EIF: Detected 10 anomalies out of 1000 trades.
2024-12-12 12:23:18,013 [INFO] Running Deep Isolation Forest (DeepIF)...
2024-12-12 12:23:18,031 [INFO] DeepIF: Epoch 1/50, Loss: 38.1730
2024-12-12 12:23:18,048 [INFO] DeepIF: Epoch 2/50, Loss: 33.2384
2024-12-12 12:23:18,064 [INFO] DeepIF: Epoch 3/50, Loss: 31.6126
2024-12-12 12:23:18,081 [INFO] DeepIF: Epoch 4/50, Loss: 31.0320
2024-12-12 12:23:18,098 [INFO] DeepIF: Epoch 5/50, Loss: 30.7357
2024-12-12 12:23:18,115 [INFO] DeepIF: Epoch 6/50, Loss: 28.0840
2024-12-12 12:23:18,131 [INFO] DeepIF: Epoch 7/50, Loss: 26.9228
2024-12-12 12:23:18,162 [INFO] DeepIF: Epoch 8/50, Loss: 26.4190
2024-12-12 12:23:18,178 [INFO] DeepIF: Epoch 9/50, Loss: 26.0102
2024-12-12 12:23:18,193 [INFO] DeepIF: Epoch 10/50, Loss: 25.8654
2024-12-12 12:23:18,209 [INFO] DeepIF: Epoch 11/50, Loss: 25.5518
2024-12-12 12:23:18,225 [INFO] DeepIF: Epoch 12/50, Loss: 25.3646
2024-12-12 12:23:18,241 [INFO] DeepIF: Epoch 13/50, Loss: 24.9755


KeyError: 'DB_CONNECTION_STRING'