In [1]:
# -*- coding: utf-8 -*-
import os
import warnings
from typing import Dict, List, Tuple, Optional

# warnings.filterwarnings('ignore')
os.environ['KERAS_BACKEND'] = 'tensorflow'

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import keras
import tensorflow as tf

import bayesflow as bf

from hmmlearn import hmm
from hmmlearn.hmm import CategoricalHMM

from sklearn.preprocessing import LabelEncoder

current_backend = tf.keras.backend.backend()
print(f"tf.keras is using the '{current_backend}' backend.")

2025-07-13 12:53:13.976800: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-07-13 12:53:13.976837: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-07-13 12:53:13.976842: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
I0000 00:00:1752403993.976855 6233715 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1752403993.976878 6233715 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
INFO:bayesflow:Using backend 'tensorflow'


tf.keras is using the 'tensorflow' backend.


In [2]:
# HMM PARAMETERS FROM TASK DESCRIPTION

# 20 amino acids in standard order
AMINO_ACIDS = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 
               'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

# Emission probabilities from task tables
# Alpha-helix state (state 0)
EMISSION_ALPHA = [0.12, 0.06, 0.03, 0.05, 0.01, 0.09, 0.05, 0.04, 0.02, 0.07,
                  0.12, 0.06, 0.03, 0.04, 0.02, 0.05, 0.04, 0.01, 0.03, 0.06]

# Other state (state 1) 
EMISSION_OTHER = [0.06, 0.05, 0.05, 0.06, 0.02, 0.05, 0.03, 0.09, 0.03, 0.05,
                  0.08, 0.06, 0.02, 0.04, 0.06, 0.07, 0.06, 0.01, 0.04, 0.07]

# Transition probabilities from task description
# [alpha->alpha, alpha->other]
TRANS_FROM_ALPHA = [0.90, 0.10]
# [other->alpha, other->other]  
TRANS_FROM_OTHER = [0.05, 0.95]

# Initial state probabilities (always starts in "other" state)
INITIAL_PROBS = [0.0, 1.0]  # [alpha-helix, other]

# Validation
print("PARAMETER VALIDATION:")
print(f"Amino acids: {len(AMINO_ACIDS)} types")
print(f"Alpha emission sum: {sum(EMISSION_ALPHA):.3f}")
print(f"Other emission sum: {sum(EMISSION_OTHER):.3f}")
print(f"Alpha transitions sum: {sum(TRANS_FROM_ALPHA):.3f}")
print(f"Other transitions sum: {sum(TRANS_FROM_OTHER):.3f}")
print(f"Initial probs sum: {sum(INITIAL_PROBS):.3f}")
print("\n✓ All probabilities are valid!")

PARAMETER VALIDATION:
Amino acids: 20 types
Alpha emission sum: 1.000
Other emission sum: 1.000
Alpha transitions sum: 1.000
Other transitions sum: 1.000
Initial probs sum: 1.000

✓ All probabilities are valid!


In [3]:
# FIXED HMM MODEL CREATION

def create_fixed_hmm():
    """
    Create HMM with fixed parameters from task description.
    
    States: 0=alpha-helix, 1=other
    Features: 20 amino acids (0-19 indices)
    
    Returns:
        CategoricalHMM with fixed empirical parameters
    """
    # Create model with fixed parameters (no learning)
    model = hmm.CategoricalHMM(
        n_components=2,        # 2 states: alpha-helix, other
        n_features=20,         # 20 amino acids
        params="",             # Don't update any parameters
        init_params="",        # Don't initialize any parameters
        algorithm="viterbi",   # Use Viterbi algorithm for decoding
        verbose=True
    )
    
    # Set fixed parameters from task description
    model.startprob_ = np.array(INITIAL_PROBS)
    model.transmat_ = np.array([TRANS_FROM_ALPHA, TRANS_FROM_OTHER])
    model.emissionprob_ = np.array([EMISSION_ALPHA, EMISSION_OTHER])
    
    return model

# Test HMM creation
print("TESTING HMM CREATION:\n")
hmm_model = create_fixed_hmm()

print(f"States: {hmm_model.n_components}")
print(f"Features: {hmm_model.n_features}")
print(f"Start probabilities: {hmm_model.startprob_}")
print(f"Transition matrix shape: {hmm_model.transmat_.shape}")
print(f"Emission matrix shape: {hmm_model.emissionprob_.shape}")

print("\nTransition probabilities:")
print("From alpha-helix:", hmm_model.transmat_[0])
print("From other:     ", hmm_model.transmat_[1])

print("\nEmission probabilities (first 5 amino acids):")
print("Alpha-helix:", hmm_model.emissionprob_[0][:5])
print("Other:      ", hmm_model.emissionprob_[1][:5])
print("\n✓ HMM model created successfully!")

TESTING HMM CREATION:

States: 2
Features: 20
Start probabilities: [0. 1.]
Transition matrix shape: (2, 2)
Emission matrix shape: (2, 20)

Transition probabilities:
From alpha-helix: [0.9 0.1]
From other:      [0.05 0.95]

Emission probabilities (first 5 amino acids):
Alpha-helix: [0.12 0.06 0.03 0.05 0.01]
Other:       [0.06 0.05 0.05 0.06 0.02]

✓ HMM model created successfully!


In [4]:
# HMM DATA GENERATION AND SIMULATOR FUNCTIONS

def generate_amino_acid_sequence(n_samples=50, random_state=None):
    """
    Generate amino acid sequences from the fixed HMM.
    
    Args:
        n_samples: Number of amino acids to generate
        random_state: Random state for reproducibility
        
    Returns:
        dict with 'amino_acids', 'true_states', and 'state_probs'
    """
    # Create the fixed HMM model
    model = create_fixed_hmm()
    
    # Generate sequence from HMM
    X, Z = model.sample(n_samples, random_state=random_state)
    
    # X is shape (n_samples, 1) - amino acid indices
    # Z is shape (n_samples,) - true hidden states
    amino_acids = X.flatten()  # Convert to 1D array of amino acid indices
    
    # Get state membership probabilities using Forward-Backward algorithm
    # Need to reshape X for predict_proba (expects (n_samples, 1))
    state_probs = model.predict_proba(X)  # Shape: (n_samples, n_states)
    
    return {
        'amino_acids': amino_acids,       # Shape: (n_samples,) - amino acid indices (0-19)
        'true_states': Z,                 # Shape: (n_samples,) - true hidden states (0=alpha, 1=other) 
        'state_probs': state_probs        # Shape: (n_samples, 2) - state membership probabilities
    }

# Test the data generation
print("TESTING HMM DATA GENERATION:\n")
test_data = generate_amino_acid_sequence(n_samples=20, random_state=42)

print(f"Amino acids shape: {test_data['amino_acids'].shape}")
print(f"True states shape: {test_data['true_states'].shape}")
print(f"State probabilities shape: {test_data['state_probs'].shape}")

print(f"\nFirst 10 amino acids (indices): {test_data['amino_acids'][:10]}")
print(f"First 10 true states: {test_data['true_states'][:10]}")
print(f"First 5 state probabilities:\n{test_data['state_probs'][:5]}")

# Verify state probabilities sum to 1
print(f"\nState probabilities sum check: {np.allclose(test_data['state_probs'].sum(axis=1), 1.0)}")

# Convert amino acid indices to actual amino acid letters for readability
amino_acid_letters = [AMINO_ACIDS[idx] for idx in test_data['amino_acids'][:10]]
print(f"First 10 amino acids (letters): {amino_acid_letters}")
print("\n✓ HMM data generation working correctly!")

TESTING HMM DATA GENERATION:

Amino acids shape: (20,)
True states shape: (20,)
State probabilities shape: (20, 2)

First 10 amino acids (indices): [19 11  2 16 14 19  3  2  9  5]
First 10 true states: [1 1 1 1 1 0 0 0 0 0]
First 5 state probabilities:
[[0.         1.        ]
 [0.01768884 0.98231116]
 [0.0253218  0.9746782 ]
 [0.03656372 0.96343628]
 [0.05153765 0.94846235]]

State probabilities sum check: True
First 10 amino acids (letters): ['V', 'K', 'N', 'T', 'P', 'V', 'D', 'N', 'I', 'E']

✓ HMM data generation working correctly!


In [5]:
# BAYESFLOW SIMULATOR IMPLEMENTATION

def hmm_simulator_function(batch_shape, sequence_length=50, **kwargs):
    """
    Simulator function for BayesFlow that generates HMM data.
    
    This function will be wrapped by BayesFlow's LambdaSimulator.
    
    Args:
        batch_shape: Shape of the batch to generate (from BayesFlow)
        sequence_length: Length of amino acid sequences to generate
        **kwargs: Additional keyword arguments
        
    Returns:
        dict: Dictionary with simulation outputs for BayesFlow
    """
    # Handle both int and tuple batch_shape
    if isinstance(batch_shape, int):
        batch_size = batch_shape
    else:
        batch_size = batch_shape[0] if len(batch_shape) > 0 else 1
    
    # Generate multiple sequences
    amino_acids_batch = []
    true_states_batch = []
    state_probs_batch = []
    
    for i in range(batch_size):
        # Generate one sequence with different random state for each
        data = generate_amino_acid_sequence(
            n_samples=sequence_length, 
            random_state=np.random.randint(0, 10000)
        )
        
        amino_acids_batch.append(data['amino_acids'])
        true_states_batch.append(data['true_states'])
        state_probs_batch.append(data['state_probs'])
    
    # Stack into batch format
    return {
        'amino_acids': np.array(amino_acids_batch),      # Shape: (batch_size, sequence_length)
        'true_states': np.array(true_states_batch),      # Shape: (batch_size, sequence_length)
        'state_probs': np.array(state_probs_batch),      # Shape: (batch_size, sequence_length, 2)
    }

# Create BayesFlow simulator
print("CREATING BAYESFLOW SIMULATOR:\n")
hmm_simulator = bf.simulators.LambdaSimulator(
    sample_fn=hmm_simulator_function,
    is_batched=True  # Our function handles batching internally
)

print("✓ BayesFlow LambdaSimulator created successfully!")

# Test the BayesFlow simulator
print("\nTESTING BAYESFLOW SIMULATOR:")
batch_size = 3
sequence_length = 15

# Sample from the simulator
simulation_data = hmm_simulator.sample(
    batch_shape=(batch_size,), 
    sequence_length=sequence_length
)

print(f"Simulation data keys: {list(simulation_data.keys())}")
print(f"Amino acids batch shape: {simulation_data['amino_acids'].shape}")
print(f"True states batch shape: {simulation_data['true_states'].shape}")
print(f"State probabilities batch shape: {simulation_data['state_probs'].shape}")

# Show multiple sequences
num_seq = 2
print(f"\nFirst {num_seq} sequences:")
for i in range(num_seq):
    amino_acids = simulation_data['amino_acids'][i]
    true_states = simulation_data['true_states'][i]
    state_probs = simulation_data['state_probs'][i]
    
    print(f"\nSequence {i}:")
    print(f"Amino acids: {amino_acids}")
    print(f"True states: {true_states}")
    print(f"State probabilities shape: {state_probs.shape}")
    print(f"State probabilities sum check: {np.allclose(state_probs.sum(axis=1), 1.0)}")
    print(f"Sequnce length: {len(amino_acids)}")

# Convert first sequence to amino acid letters
example_letters = [AMINO_ACIDS[idx] for idx in simulation_data['amino_acids'][0]]
print(f"Amino acid letters: {example_letters}")

print("\n✓ BayesFlow simulator working correctly!")

CREATING BAYESFLOW SIMULATOR:

✓ BayesFlow LambdaSimulator created successfully!

TESTING BAYESFLOW SIMULATOR:
Simulation data keys: ['amino_acids', 'true_states', 'state_probs']
Amino acids batch shape: (3, 15)
True states batch shape: (3, 15)
State probabilities batch shape: (3, 15, 2)

First 2 sequences:

Sequence 0:
Amino acids: [19  1  3  0 16 12 12 14 11  8 15  3 15  7 17]
True states: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
State probabilities shape: (15, 2)
State probabilities sum check: True
Sequnce length: 15

Sequence 1:
Amino acids: [ 7  0  4  0 15 14  9  2  5 15 12  6  4  2  1]
True states: [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
State probabilities shape: (15, 2)
State probabilities sum check: True
Sequnce length: 15
Amino acid letters: ['V', 'R', 'D', 'A', 'T', 'M', 'M', 'P', 'K', 'H', 'S', 'D', 'S', 'G', 'W']

✓ BayesFlow simulator working correctly!


In [None]:
# CUSTOM PROTEIN SUMMARY NETWORK

class ProteinSummaryNetwork(bf.networks.SummaryNetwork):
    """
    Custom summary network for protein amino acid sequences.
    
    This network is specifically designed for the protein secondary structure task:
    - Embeds amino acid indices into dense representations
    - Uses bidirectional LSTM to capture sequential dependencies
    - Applies attention mechanism to focus on important positions
    - Outputs summary statistics for the entire sequence
    """
    
    def __init__(self, 
                 vocab_size=20,              # Number of amino acids
                 embedding_dim=32,           # Amino acid embedding dimension
                 lstm_units=64,              # LSTM hidden units
                 attention_dim=32,           # Attention mechanism dimension
                 summary_dim=64,             # Output summary dimension
                 dropout_rate=0.1,           # Dropout rate
                 **kwargs):
        super().__init__(**kwargs)
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.attention_dim = attention_dim
        self.summary_dim = summary_dim
        self.dropout_rate = dropout_rate
        
        # Amino acid embedding layer
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            mask_zero=False,  # Don't mask zero values as amino acid 'A' has index 0
            name='amino_acid_embedding'
        )
        
        # Bidirectional LSTM for sequence processing
        self.lstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                lstm_units,
                return_sequences=True,  # Return full sequence for attention
                dropout=dropout_rate,
                recurrent_dropout=dropout_rate,
                name='sequence_lstm'
            ),
            name='bidirectional_lstm'
        )
        
        # Attention mechanism layers
        self.attention_dense = tf.keras.layers.Dense(
            attention_dim, 
            activation='tanh',
            name='attention_dense'
        )
        self.attention_weights = tf.keras.layers.Dense(
            1, 
            activation=None,  # Don't use softmax here, apply it later
            name='attention_weights'
        )
        
        # Final summary layers
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.summary_dense1 = tf.keras.layers.Dense(
            summary_dim * 2,
            activation='silu',
            name='summary_dense1'
        )
        self.summary_dense2 = tf.keras.layers.Dense(
            summary_dim,
            activation='silu', 
            name='summary_dense2'
        )
        
    def call(self, x, training=False, **kwargs):
        """
        Forward pass of the protein summary network.
        
        Args:
            x: Input tensor of shape (batch_size, sequence_length, 1) containing amino acid indices
            training: Whether in training mode
            
        Returns:
            Summary tensor of shape (batch_size, summary_dim)
        """
        # Remove the last dimension if present: (batch_size, seq_len, 1) -> (batch_size, seq_len)
        if x.shape[-1] == 1:
            x = tf.squeeze(x, axis=-1)
            
        # Convert to integer indices for embedding
        x = tf.cast(x, tf.int32)
        
        # Embed amino acid indices: (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        embedded = self.embedding(x)
        
        # Process with bidirectional LSTM: (batch_size, seq_len, embedding_dim) -> (batch_size, seq_len, 2*lstm_units)
        lstm_output = self.lstm(embedded, training=training)
        
        # Apply attention mechanism
        # Compute attention scores: (batch_size, seq_len, 2*lstm_units) -> (batch_size, seq_len, attention_dim)
        attention_scores = self.attention_dense(lstm_output)
        
        # Compute attention weights: (batch_size, seq_len, attention_dim) -> (batch_size, seq_len, 1)
        attention_logits = self.attention_weights(attention_scores)
        
        # Apply softmax along the sequence dimension to get proper attention weights
        attention_weights = tf.nn.softmax(attention_logits, axis=1)  # Softmax over sequence dimension
        
        # Apply attention: weighted sum of LSTM outputs
        # (batch_size, seq_len, 2*lstm_units) * (batch_size, seq_len, 1) -> (batch_size, 2*lstm_units)
        attended_output = tf.reduce_sum(lstm_output * attention_weights, axis=1)
        
        # Apply dropout
        attended_output = self.dropout(attended_output, training=training)
        
        # Generate final summary through dense layers
        summary = self.summary_dense1(attended_output)
        summary = self.dropout(summary, training=training)
        summary = self.summary_dense2(summary)
        
        return summary
    
    def get_config(self):
        """Return the configuration of the layer."""
        config = super().get_config()
        config.update({
            'vocab_size': self.vocab_size,
            'embedding_dim': self.embedding_dim,
            'lstm_units': self.lstm_units,
            'attention_dim': self.attention_dim,
            'summary_dim': self.summary_dim,
            'dropout_rate': self.dropout_rate,
        })
        return config
    
    @classmethod
    def from_config(cls, config):
        """Create layer from configuration."""
        return cls(**config)

print("✓ Custom ProteinSummaryNetwork class defined")

✓ Custom ProteinSummaryNetwork class defined


In [7]:
# TRAINING FUNCTION FOR CUSTOM PROTEIN WORKFLOW

def train_protein_workflow(
    workflow,
    batch_size=16,
    epochs=50,
    print_every=10,
    save_path=None
):
    """
    Train the protein BayesFlow workflow with our custom summary network.
    
    Args:
        workflow: The BayesFlow workflow to train
        batch_size: Batch size for training
        epochs: Number of training epochs
        print_every: Print progress every N epochs
        save_path: Path to save the trained model (optional)
    
    Returns:
        training_history: Dictionary with training metrics
    """
    
    print(f"Starting training for {epochs} epochs with batch size {batch_size}")
    print("=" * 60)
    
    training_history = {
        'epoch': [],
        'loss': [],
        'validation_loss': []
    }
    
    try:
        # Configure the workflow for training
        config = {
            'epochs': epochs,
            'batch_size': batch_size,
            'validation_sims': 1000,  # Generate validation data
            'checkpoint_interval': max(1, epochs // 10),  # Save checkpoints
        }
        
        print("Training configuration:")
        for key, value in config.items():
            print(f"  {key}: {value}")
        print()
        
        # Start online training
        print("🚀 Starting online training...")
        training_info = workflow.fit_online(
            epochs=config['epochs'],
            batch_size=config['batch_size'],
            print_every=print_every
        )
        
        print("✅ Training completed successfully!")
        
        # Extract training history if available
        if hasattr(training_info, 'history') and training_info.history:
            history = training_info.history
            training_history['loss'] = history.get('loss', [])
            training_history['validation_loss'] = history.get('val_loss', [])
            training_history['epoch'] = list(range(1, len(training_history['loss']) + 1))
        
        # Save the model if path provided
        if save_path:
            print(f"💾 Saving model to {save_path}")
            workflow.save_model(save_path)
            
        return training_history
        
    except Exception as e:
        print(f"❌ Training failed with error: {e}")
        import traceback
        traceback.print_exc()
        return training_history

print("✓ Training function defined")

✓ Training function defined


In [13]:
# CORRECTED PROTEIN BAYESFLOW WORKFLOW CREATION

# First, let's create a custom flattening transform
class FlattenTransform(bf.adapters.transforms.Transform):
    """Custom transform to flatten inference variables from (batch, seq_len, 2) to (batch, seq_len*2)"""
    
    def __init__(self):
        super().__init__()
    
    def forward(self, x, **kwargs):
        # Flatten the last two dimensions: (batch, seq_len, 2) -> (batch, seq_len*2)
        return x.reshape(x.shape[0], -1).astype(np.float32)
    
    def inverse(self, x, **kwargs):
        # For inverse, we would need to know the original shape
        # This is not needed for our use case but required by the interface
        raise NotImplementedError("Inverse transform not implemented for FlattenTransform")

def create_corrected_protein_workflow():
    """
    Create BayesFlow workflow with custom protein summary network and proper adapter.
    """
    print("Creating corrected protein BayesFlow workflow...")
    
    # 1. USE EXISTING SIMULATOR
    simulator = hmm_simulator
    print("✓ Using existing HMM simulator")
    
    # 2. CUSTOM SUMMARY NETWORK
    protein_summary_net = ProteinSummaryNetwork(
        vocab_size=20,
        embedding_dim=32,
        lstm_units=64,
        attention_dim=32,
        summary_dim=64,
        name='ProteinSummaryNetwork'
    )
    print("✓ Custom summary network created")
    
    # 3. INFERENCE NETWORK
    inference_net = bf.networks.CouplingFlow(
        num_params=100,  # Flattened state probabilities: 50 positions * 2 states = 100
        num_coupling_layers=8,
        coupling_settings={'units': [128, 128], 'activation': 'silu'},
        name='ProteinInferenceNetwork'
    )
    print("✓ Inference network created")
    
    # 4. ADAPTER WITH CORRECT TRANSFORMS AND FLATTENING
    adapter_transforms = [
        # Rename variables to BayesFlow conventions
        bf.adapters.transforms.Rename(from_key='amino_acids', to_key='summary_variables'),
        bf.adapters.transforms.Rename(from_key='state_probs', to_key='inference_variables'),
        
        # Drop unused variables
        bf.adapters.transforms.Drop(keys=['true_states']),
        
        # Convert data types
        bf.adapters.transforms.MapTransform({
            'summary_variables': bf.adapters.transforms.ConvertDType(
                from_dtype='int64', to_dtype='float32'
            ),
            'inference_variables': bf.adapters.transforms.ConvertDType(
                from_dtype='float64', to_dtype='float32'
            ),
        }),
        
        # Flatten inference variables using our custom transform
        bf.adapters.transforms.MapTransform({
            'inference_variables': FlattenTransform(),
        }),
    ]
    
    adapter = bf.Adapter(transforms=adapter_transforms)
    print("✓ Adapter with transforms created")
    
    # 5. CREATE WORKFLOW
    workflow = bf.BasicWorkflow(
        simulator=simulator,
        adapter=adapter,
        inference_network=inference_net,
        summary_network=protein_summary_net
    )
    print("✓ BayesFlow workflow created")
    
    return workflow

# Create the corrected workflow
print("=" * 60)
corrected_protein_workflow = create_corrected_protein_workflow()
print("=" * 60)
print("🎉 Corrected protein BayesFlow workflow created successfully!")

Creating corrected protein BayesFlow workflow...
✓ Using existing HMM simulator
✓ Custom summary network created
✓ Inference network created
✓ Adapter with transforms created
✓ BayesFlow workflow created
🎉 Corrected protein BayesFlow workflow created successfully!


In [14]:
# TEST TRAINING WITH CORRECTED WORKFLOW

print("🧪 Testing training with corrected workflow...")
print("=" * 50)

# Test simulation first
print("1. Testing simulation...")
test_sim_data = corrected_protein_workflow.simulate(2)
print("✓ Simulation successful")
print("   Data keys:", list(test_sim_data.keys()))
for key, value in test_sim_data.items():
    print(f"   {key}: shape {value.shape}, dtype {value.dtype}")

# Test adaptation
print("\n2. Testing data adaptation...")
test_adapted = corrected_protein_workflow.adapter(test_sim_data)
print("✓ Adaptation successful")
print("   Adapted keys:", list(test_adapted.keys()))
for key, value in test_adapted.items():
    print(f"   {key}: shape {value.shape}, dtype {value.dtype}")

# Now test a few training epochs
print("\n3. Testing training...")
try:
    test_history = train_protein_workflow(
        workflow=corrected_protein_workflow,
        batch_size=4,  # Small batch for testing
        epochs=3,      # Just a few epochs for testing
        print_every=1
    )
    print("✅ Training test successful!")
    
except Exception as e:
    print(f"❌ Training test failed: {e}")
    import traceback
    traceback.print_exc()

INFO:bayesflow:Fitting on dataset instance of OnlineDataset.
INFO:bayesflow:Building on a test batch.
INFO:bayesflow:Building on a test batch.


🧪 Testing training with corrected workflow...
1. Testing simulation...
✓ Simulation successful
   Data keys: ['amino_acids', 'true_states', 'state_probs']
   amino_acids: shape (2, 50), dtype int64
   true_states: shape (2, 50), dtype int64
   state_probs: shape (2, 50, 2), dtype float64

2. Testing data adaptation...
✓ Adaptation successful
   Adapted keys: ['summary_variables', 'inference_variables']
   summary_variables: shape (2, 50), dtype float32
   inference_variables: shape (2, 100), dtype float32

3. Testing training...
Starting training for 3 epochs with batch size 4
Training configuration:
  epochs: 3
  batch_size: 4
  validation_sims: 1000
  checkpoint_interval: 1

🚀 Starting online training...
Epoch 1/3
Epoch 1/3


2025-07-13 12:56:53.871214: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m  7/100[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m15:56[0m 10s/step - loss: 149.5542

KeyboardInterrupt: 

In [None]:
# FULL TRAINING SESSION

print("🚀 Starting full training session...")
print("=" * 60)

# Train for more epochs to see meaningful learning
full_training_history = train_protein_workflow(
    workflow=corrected_protein_workflow,
    batch_size=16,     # Reasonable batch size
    epochs=20,         # More epochs for better learning
    print_every=5      # Progress updates every 5 epochs
)

print("\n" + "=" * 60)
print("🎉 TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 60)

# Display training summary
if full_training_history and full_training_history['loss']:
    print(f"📊 Training Summary:")
    print(f"   Total epochs: {len(full_training_history['loss'])}")
    print(f"   Final loss: {full_training_history['loss'][-1]:.4f}")
    print(f"   Loss reduction: {full_training_history['loss'][0]:.4f} → {full_training_history['loss'][-1]:.4f}")
    
    # Simple plot of training loss
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 6))
    plt.plot(full_training_history['epoch'], full_training_history['loss'], 'b-', linewidth=2, label='Training Loss')
    plt.title('BayesFlow Training Progress', fontsize=14, fontweight='bold')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()
else:
    print("⚠️ No training history available to display")

print("\n🧬 The protein secondary structure BayesFlow model is now trained!")
print("📈 Ready for posterior inference on new amino acid sequences!")

In [15]:
# INVESTIGATING BAYESFLOW COUPLINGFLOW PARAMETERS

print("🔍 INVESTIGATING COUPLINGFLOW IMPLEMENTATION")
print("=" * 60)

# Check the CouplingFlow class signature and documentation
print("1. CouplingFlow class signature:")
import inspect
print(inspect.signature(bf.networks.CouplingFlow.__init__))

print("\n2. CouplingFlow class docstring:")
print(bf.networks.CouplingFlow.__doc__)

print("\n3. CouplingFlow available attributes/methods:")
coupling_flow_attrs = [attr for attr in dir(bf.networks.CouplingFlow) if not attr.startswith('_')]
print(coupling_flow_attrs)

print("\n4. Let's check what networks are available in BayesFlow:")
available_networks = [attr for attr in dir(bf.networks) if not attr.startswith('_')]
print("Available networks:", available_networks)

print("\n5. Checking if there are other coupling-related networks:")
coupling_networks = [attr for attr in available_networks if 'coup' in attr.lower() or 'flow' in attr.lower()]
print("Coupling/Flow networks:", coupling_networks)

🔍 INVESTIGATING COUPLINGFLOW IMPLEMENTATION
1. CouplingFlow class signature:
(self, subnet: str | type = 'mlp', depth: int = 6, transform: str = 'affine', permutation: str | None = 'random', use_actnorm: bool = True, base_distribution: str = 'normal', subnet_kwargs: dict[str, any] = None, transform_kwargs: dict[str, any] = None, **kwargs)

2. CouplingFlow class docstring:
(IN) Implements a coupling flow as a sequence of dual couplings with permutations and activation
    normalization. Incorporates ideas from [1-5].

    [1] Kingma, D. P., & Dhariwal, P. (2018).
    Glow: Generative flow with invertible 1x1 convolutions.
    Advances in Neural Information Processing Systems, 31.

    [2] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G. (2019).
    Neural spline flows. Advances in Neural Information Processing Systems, 32.

    [3] Ardizzone, L., Kruse, J., Lüth, C., Bracher, N., Rother, C., & Köthe, U. (2020).
    Conditional invertible neural networks for diverse image-to-image

In [16]:
# UNDERSTANDING HOW BAYESFLOW HANDLES PARAMETER DIMENSIONS

print("🔍 UNDERSTANDING BAYESFLOW PARAMETER HANDLING")
print("=" * 60)

# Let's check what a BasicWorkflow expects for inference networks
print("1. BasicWorkflow signature:")
print(inspect.signature(bf.BasicWorkflow.__init__))

print("\n2. Let's look at how CouplingFlow works with our current data:")
print("First, let's see our current data dimensions again:")

# Test our simulator and adapter
test_sim = corrected_protein_workflow.simulate(2)
test_adapted = corrected_protein_workflow.adapter(test_sim)

print("Simulated data:")
for key, value in test_sim.items():
    print(f"  {key}: {value.shape}")

print("\nAdapted data:")
for key, value in test_adapted.items():
    print(f"  {key}: {value.shape}")

print("\n3. The key insight: BayesFlow workflows automatically determine dimensions!")
print("   - BasicWorkflow builds the network based on actual data shapes")
print("   - The inference network gets built when it sees the inference_variables")
print("   - No need to specify num_params explicitly!")

print("\n4. Let's create a properly configured CouplingFlow:")

# Create a new CouplingFlow with correct parameters
correct_coupling_flow = bf.networks.CouplingFlow(
    subnet='mlp',           # Use MLP subnets
    depth=8,               # 8 coupling layers (equivalent to num_coupling_layers)
    transform='affine',    # Affine coupling transforms
    permutation='random',  # Random permutations between layers
    subnet_kwargs={        # Arguments for the MLP subnets
        'units': [128, 128],  # Hidden units in MLP
        'activation': 'silu'  # Activation function
    }
)

print("✓ Correct CouplingFlow created with proper parameters:")
print(f"  - Subnet type: mlp")
print(f"  - Depth (coupling layers): 8")
print(f"  - Transform: affine")
print(f"  - Subnet units: [128, 128]")
print(f"  - Activation: silu")

🔍 UNDERSTANDING BAYESFLOW PARAMETER HANDLING
1. BasicWorkflow signature:
(self, simulator: bayesflow.simulators.simulator.Simulator = None, adapter: bayesflow.adapters.adapter.Adapter = None, inference_network: bayesflow.networks.inference_network.InferenceNetwork | str = 'coupling_flow', summary_network: bayesflow.networks.summary_network.SummaryNetwork | str = None, initial_learning_rate: float = 0.0005, optimizer: keras.src.optimizers.optimizer.Optimizer | type = None, checkpoint_filepath: str = None, checkpoint_name: str = 'model', save_weights_only: bool = False, save_best_only: bool = False, inference_variables: collections.abc.Sequence[str] | str = None, inference_conditions: collections.abc.Sequence[str] | str = None, summary_variables: collections.abc.Sequence[str] | str = None, standardize: collections.abc.Sequence[str] | str | None = 'inference_variables', **kwargs)

2. Let's look at how CouplingFlow works with our current data:
First, let's see our current data dimensions a

In [17]:
# CORRECTED WORKFLOW WITH PROPER COUPLINGFLOW PARAMETERS

def create_properly_configured_workflow():
    """
    Create BayesFlow workflow with correctly configured CouplingFlow.
    """
    print("Creating properly configured BayesFlow workflow...")
    
    # 1. USE EXISTING SIMULATOR
    simulator = hmm_simulator
    print("✓ Using existing HMM simulator")
    
    # 2. CUSTOM SUMMARY NETWORK
    protein_summary_net = ProteinSummaryNetwork(
        vocab_size=20,
        embedding_dim=32,
        lstm_units=64,
        attention_dim=32,
        summary_dim=64,
        name='ProteinSummaryNetwork'
    )
    print("✓ Custom summary network created")
    
    # 3. PROPERLY CONFIGURED INFERENCE NETWORK
    inference_net = bf.networks.CouplingFlow(
        subnet='mlp',           # Use MLP subnets
        depth=8,               # Number of coupling layers
        transform='affine',    # Affine coupling transforms  
        permutation='random',  # Random permutations between layers
        use_actnorm=True,      # Use activation normalization
        base_distribution='normal',  # Normal base distribution
        subnet_kwargs={        # Configuration for MLP subnets
            'units': [128, 128],     # Hidden layer sizes
            'activation': 'silu',    # Activation function
            'dropout': 0.1           # Dropout rate
        },
        name='ProteinInferenceNetwork'
    )
    print("✓ Properly configured CouplingFlow created")
    print(f"  - Depth: 8 coupling layers")
    print(f"  - Subnet: MLP with units [128, 128]")
    print(f"  - Transform: affine")
    print(f"  - Base distribution: normal")
    
    # 4. ADAPTER (same as before)
    adapter_transforms = [
        bf.adapters.transforms.Rename(from_key='amino_acids', to_key='summary_variables'),
        bf.adapters.transforms.Rename(from_key='state_probs', to_key='inference_variables'),
        bf.adapters.transforms.Drop(keys=['true_states']),
        bf.adapters.transforms.MapTransform({
            'summary_variables': bf.adapters.transforms.ConvertDType(
                from_dtype='int64', to_dtype='float32'
            ),
            'inference_variables': bf.adapters.transforms.ConvertDType(
                from_dtype='float64', to_dtype='float32'
            ),
        }),
        bf.adapters.transforms.MapTransform({
            'inference_variables': FlattenTransform(),
        }),
    ]
    
    adapter = bf.Adapter(transforms=adapter_transforms)
    print("✓ Adapter with transforms created")
    
    # 5. CREATE WORKFLOW WITH PROPER PARAMETERS
    workflow = bf.BasicWorkflow(
        simulator=simulator,
        adapter=adapter,
        inference_network=inference_net,
        summary_network=protein_summary_net,
        initial_learning_rate=0.001,  # Learning rate
        inference_variables=['inference_variables'],  # Specify which variables to infer
        summary_variables=['summary_variables']       # Specify summary variables
    )
    print("✓ BayesFlow workflow created with proper configuration")
    
    return workflow

# Create the properly configured workflow
print("=" * 70)
properly_configured_workflow = create_properly_configured_workflow()
print("=" * 70)
print("🎉 PROPERLY CONFIGURED WORKFLOW CREATED!")
print("\n📋 Key corrections made:")
print("  ✅ Used correct CouplingFlow parameters (depth, subnet_kwargs)")
print("  ✅ Removed non-existent num_params parameter")
print("  ✅ BayesFlow will auto-determine dimensions from data")
print("  ✅ Specified inference_variables and summary_variables explicitly")

Creating properly configured BayesFlow workflow...
✓ Using existing HMM simulator
✓ Custom summary network created
✓ Properly configured CouplingFlow created
  - Depth: 8 coupling layers
  - Subnet: MLP with units [128, 128]
  - Transform: affine
  - Base distribution: normal
✓ Adapter with transforms created
✓ BayesFlow workflow created with proper configuration
🎉 PROPERLY CONFIGURED WORKFLOW CREATED!

📋 Key corrections made:
  ✅ Used correct CouplingFlow parameters (depth, subnet_kwargs)
  ✅ Removed non-existent num_params parameter
  ✅ BayesFlow will auto-determine dimensions from data
  ✅ Specified inference_variables and summary_variables explicitly


In [18]:
# TESTING THE PROPERLY CONFIGURED WORKFLOW

print("🧪 TESTING PROPERLY CONFIGURED WORKFLOW")
print("=" * 50)

# Test simulation and adaptation
print("1. Testing simulation and adaptation...")
test_sim_data = properly_configured_workflow.simulate(2)
test_adapted_data = properly_configured_workflow.adapter(test_sim_data)

print("✓ Simulation and adaptation successful")
print("Raw simulation data shapes:")
for key, value in test_sim_data.items():
    print(f"  {key}: {value.shape}")

print("\nAdapted data shapes:")
for key, value in test_adapted_data.items():
    print(f"  {key}: {value.shape}")

# Quick training test
print("\n2. Testing training with properly configured workflow...")
try:
    quick_test_history = train_protein_workflow(
        workflow=properly_configured_workflow,
        batch_size=4,
        epochs=2,  # Just 2 epochs for quick test
        print_every=1
    )
    print("✅ TRAINING TEST SUCCESSFUL!")
    print("🎯 The workflow is now properly configured and ready for full training!")
    
except Exception as e:
    print(f"❌ Training test failed: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "=" * 50)
print("📊 SUMMARY OF CORRECTIONS:")
print("=" * 50)
print("❌ BEFORE (incorrect parameters):")
print("  - num_params=100")
print("  - num_coupling_layers=8") 
print("  - coupling_settings={'units': [128, 128], 'activation': 'silu'}")
print()
print("✅ AFTER (correct parameters):")
print("  - depth=8  # Number of coupling layers")
print("  - subnet='mlp'  # Type of subnet")
print("  - subnet_kwargs={'units': [128, 128], 'activation': 'silu'}")
print("  - transform='affine'  # Coupling transform type")
print("  - BayesFlow auto-determines parameter dimensions from data!")
print()
print("🔑 KEY INSIGHT: BayesFlow CouplingFlow doesn't need explicit parameter")
print("   dimensions - it builds them automatically based on the actual data shapes!")

INFO:bayesflow:Fitting on dataset instance of OnlineDataset.
INFO:bayesflow:Building on a test batch.
INFO:bayesflow:Building on a test batch.


🧪 TESTING PROPERLY CONFIGURED WORKFLOW
1. Testing simulation and adaptation...
✓ Simulation and adaptation successful
Raw simulation data shapes:
  amino_acids: (2, 50)
  true_states: (2, 50)
  state_probs: (2, 50, 2)

Adapted data shapes:
  summary_variables: (2, 50)
  inference_variables: (2, 100)

2. Testing training with properly configured workflow...
Starting training for 2 epochs with batch size 4
Training configuration:
  epochs: 2
  batch_size: 4
  validation_sims: 1000
  checkpoint_interval: 1

🚀 Starting online training...
Epoch 1/2
Epoch 1/2
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1042s[0m 10s/step - loss: 41.6534
Epoch 2/2
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1042s[0m 10s/step - loss: 41.6534
Epoch 2/2
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1009s[0m 10s/step - loss: -40.8455
✅ Training completed successfully!
✅ TRAINING TEST SUCCESSFUL!
🎯 The workflow is now properly configured and ready for full training!

📊 