This notebook continues from `transformer_architecture.ipynb`.


In [2]:
# Bring in model definitions and dependencies from the architecture notebook
%run ./transformer_architecture.ipynb

  validate(nb)


FFN(
  (fc1): Linear(in_features=512, out_features=2048, bias=True)
  (fc2): Linear(in_features=2048, out_features=512, bias=True)
)
TransformerEncoder(
  (att): TransformerAttention(
    (q_proj): Linear(in_features=512, out_features=512, bias=True)
    (k_proj): Linear(in_features=512, out_features=512, bias=True)
    (v_proj): Linear(in_features=512, out_features=512, bias=True)
    (output_proj): Linear(in_features=512, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ffn): FFN(
    (fc1): Linear(in_features=512, out_features=2048, bias=True)
    (fc2): Linear(in_features=2048, out_features=512, bias=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (LayerNorm_att): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (LayerNorm_ffn): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
TransformerDecoder(
  (att): TransformerAttention(
    (q_proj): Linear(in_features=512, out_features=512, bias=True)
    (k_proj): Linear(in_features=512,

## Testing Section

In [3]:
## testing on the embedding implemntation
## Tokenlize model input: from batched sentences to batched sequence of code
from transformers import AutoTokenizer
from transformers import pipeline

import torch

# layer config 
d_model = 768
d_embed = 1024  # Larger embedding dimension
vocab_size=30522

# loading sample data
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True, use_multiprocessing=False)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
max_position_embeddings = 512
model_inputs = tokenizer(sequences, truncation=True,  padding="longest")

# Check vocabulary size from the tokenizer
# Happen to be the same as the default setting for distilbert -- of course!
vocab_size = tokenizer.vocab_size
print(f"Tokenizer vocabulary size: {vocab_size}")


input = torch.tensor(model_inputs['input_ids'])
embedder = EmbeddingWithProjection(vocab_size=vocab_size, d_embed=d_embed, d_model=d_model)
output = embedder(input)

print(f"Input shape: {input.shape}")
print(f"Embedded shape after projection: {output.shape}")

Tokenizer vocabulary size: 30522
Input shape: torch.Size([2, 16])
Embedded shape after projection: torch.Size([2, 16, 768])


In [4]:
def test_transformer_encoder():
    # Set random seed for reproducibility
    torch.manual_seed(42)
    
    # Test parameters
    batch_size = 32
    seq_length = 20
    d_model = 512
    d_ff = 2048
    num_heads = 8
    
    # Initialize the transformer encoder
    encoder = TransformerEncoder(
        d_model=d_model,
        d_ff=d_ff,
        num_head=num_heads,
        dropout=0.1
    )
    
    # Set to evaluation mode to disable dropout
    encoder.eval()
    
    # Create input sequence - using ones instead of random values
    # for easier interpretation of attention patterns
    input_sequence = torch.ones(batch_size, seq_length, d_model)
    cross_sequence = torch.ones(batch_size, seq_length, d_model)*0.5
    
    # Create attention mask
    attention_mask = torch.ones(batch_size, seq_length)
    attention_mask[:, 15:] = 0  # Mask last 5 positions
    attention_mask =attention_mask.unsqueeze(1).unsqueeze(3)
    
    # Store attention patterns
    attention_patterns = []
    
    # Define hook to capture attention scores
    def attention_hook(module, input, output):
        # We want to capture the attention scores before they're processed further
        # This assumes your attention module returns the attention scores
        attention_patterns.append(output)
    
    # Register the hook on the attention computation
    encoder.att.register_forward_hook(attention_hook)
    
    # Perform forward pass
    with torch.no_grad():
        output = encoder(input_sequence, attention_mask)
    
    # Basic shape tests
    expected_shape = (batch_size, seq_length, d_model)
    assert output.shape == expected_shape, f"Expected shape {expected_shape}, got {output.shape}"
    
    # Print output statistics
    print("\nOutput Statistics:")
    print(f"Mean: {output.mean():.4f}")
    print(f"Std: {output.std():.4f}")
    print(f"Min: {output.min():.4f}")
    print(f"Max: {output.max():.4f}")
    
    # Analyze attention patterns
    if attention_patterns:
        attention_output = attention_patterns[0]
        # Look at the attention patterns for unmasked vs masked positions
        unmasked_attention = output[:, :15, :].abs().mean()
        masked_attention = output[:, 15:, :].abs().mean()
        
        print("\nAttention Analysis:")
        print(f"Unmasked positions mean: {unmasked_attention:.4f}")
        print(f"Masked positions mean: {masked_attention:.4f}")
        
        # Note: We expect masked positions to still have values due to residual connections,
        # but their patterns should be different from unmasked positions
        print("\nIs the masking working?", "Yes" if unmasked_attention != masked_attention else "No")
    
    # Check for any NaN or infinite values
    assert torch.isfinite(output).all(), "Output contains NaN or infinite values"
    
    print("\nAll tests passed successfully!")
    return output, attention_patterns

# Run the test
output, attention_patterns = test_transformer_encoder()


Output Statistics:
Mean: 0.0000
Std: 1.0000
Min: -2.7968
Max: 2.8519

Attention Analysis:
Unmasked positions mean: 0.8078
Masked positions mean: 0.8078

Is the masking working? Yes

All tests passed successfully!


In [5]:
def test_transformer_decoder():
    torch.manual_seed(42)
    
    # Test parameters
    batch_size = 32
    seq_length = 20
    encoder_seq_length = 22
    d_model = 512
    d_ff = 2048
    num_heads = 8
    
    decoder = TransformerDecoder(
        d_model=d_model,
        d_ff=d_ff,
        num_head=num_heads,
        dropout=0.1
    )
    decoder.eval()
    
    # Create input sequences
    decoder_input = torch.randn(batch_size, seq_length, d_model)
    encoder_output = torch.randn(batch_size, encoder_seq_length, d_model)
    
    # Create padding mask for encoder outputs
    padding_mask = torch.ones(batch_size, seq_length, encoder_seq_length)
    padding_mask[:, :, 18:] = 0  # Mask last 4 positions of encoder output
    padding_mask = padding_mask.unsqueeze(1)  # Add head dimension
    
    # Store attention scores
    attention_scores = []
    
    # Define hook to capture attention scores before softmax
    def attention_hook(module, input, output):
        if not attention_scores:  # Only store first layer's patterns
            # Assuming attention scores are computed before this hook
            attention_scores.append(module.att_matrix.detach())  # You might need to modify this based on your attention implementation
    
    # Register hook on the attention layer
    decoder.att.register_forward_hook(attention_hook)
    
    # Perform forward pass
    with torch.no_grad():
        output = decoder(decoder_input, encoder_output, padding_mask)
    
    # Basic shape tests
    expected_shape = (batch_size, seq_length, d_model)
    assert output.shape == expected_shape, f"Expected shape {expected_shape}, got {output.shape}"
    
    # Print output statistics
    print("\nOutput Statistics:")
    print(f"Mean: {output.mean():.4f}")
    print(f"Std: {output.std():.4f}")
    print(f"Min: {output.min():.4f}")
    print(f"Max: {output.max():.4f}")
    
    # Test shape preservation
    print("\nShape Analysis:")
    print(f"Input shape: {decoder_input.shape}")
    print(f"Output shape: {output.shape}")
    print(f"Expected shape matches: {'Yes' if decoder_input.shape == output.shape else 'No'}")
    
    # Check for any NaN or infinite values
    assert torch.isfinite(output).all(), "Output contains NaN or infinite values"
    
    print("\nAll tests passed successfully!")
    return output, attention_scores

# Run the test
output, attention_scores = test_transformer_decoder()


Output Statistics:
Mean: 0.0000
Std: 1.0000
Min: -4.3617
Max: 4.5787

Shape Analysis:
Input shape: torch.Size([32, 20, 512])
Output shape: torch.Size([32, 20, 512])
Expected shape matches: Yes

All tests passed successfully!


In [6]:
def test_transformer_encoder_decoder_stack():
    torch.manual_seed(42)
    
    # Test parameters
    batch_size = 8
    seq_length = 10
    d_model = 512
    d_ff = 2048
    num_heads = 8
    num_layers = 6
    
    # Initialize the transformer encoder-decoder stack
    transformer = TransformerEncoderDecoder(
        num_layer=num_layers,
        d_model=d_model,
        d_ff=d_ff,
        num_head=num_heads,
        dropout=0.1
    )
    
    # Set to evaluation mode to disable dropout
    transformer.eval()
    
    # Create input sequences
    encoder_input = torch.randn(batch_size, seq_length, d_model)
    decoder_input = torch.randn(batch_size, seq_length, d_model)
    
    # Create padding mask
    padding_mask = torch.ones(batch_size, seq_length)
    padding_mask[:, -2:] = 0  # Mask last 2 positions
    padding_mask = padding_mask.unsqueeze(1).unsqueeze(2)  # [batch, 1, 1, seq_len]
    
    # Store intermediate outputs
    intermediate_outputs = []
    
    def hook_fn(module, input, output):
        intermediate_outputs.append(output.detach())
    
    # Register hooks to capture outputs from each encoder and decoder layer
    for i, (encoder, decoder) in enumerate(zip(transformer.encoder_stack, transformer.decoder_stack)):
        encoder.register_forward_hook(lambda m, i, o, layer=i: print(f"\nEncoder Layer {layer} shape:", o.shape))
        decoder.register_forward_hook(lambda m, i, o, layer=i: print(f"Decoder Layer {layer} shape:", o.shape))
    
    # Perform forward pass
    with torch.no_grad():
        output = transformer(encoder_input, decoder_input, padding_mask)
    
    # Basic shape tests
    expected_shape = (batch_size, seq_length, d_model)
    assert output.shape == expected_shape, f"Expected shape {expected_shape}, got {output.shape}"
    
    # Print output statistics
    print("\nFinal Output Statistics:")
    print(f"Mean: {output.mean():.4f}")
    print(f"Std: {output.std():.4f}")
    print(f"Min: {output.min():.4f}")
    print(f"Max: {output.max():.4f}")
    
    # Verify shape preservation through layers
    print("\nShape Preservation Check:")
    print(f"Input shapes - Encoder: {encoder_input.shape}, Decoder: {decoder_input.shape}")
    print(f"Output shape: {output.shape}")
    
    # Check for any NaN or infinite values
    assert torch.isfinite(output).all(), "Output contains NaN or infinite values"
    
    # Verify that output is different from input (transformation happened)
    input_output_diff = (output - decoder_input).abs().mean()
    print(f"\nMean absolute difference between input and output: {input_output_diff:.4f}")
    print("Transformation occurred:", "Yes" if input_output_diff > 1e-3 else "No")
    
    # Check if model parameters were used
    total_params = sum(p.numel() for p in transformer.parameters())
    print(f"\nTotal number of parameters: {total_params:,}")
    
    print("\nAll tests passed successfully!")
    return output

# Run the test
output = test_transformer_encoder_decoder_stack()


Encoder Layer 0 shape: torch.Size([8, 10, 512])

Encoder Layer 1 shape: torch.Size([8, 10, 512])

Encoder Layer 2 shape: torch.Size([8, 10, 512])

Encoder Layer 3 shape: torch.Size([8, 10, 512])

Encoder Layer 4 shape: torch.Size([8, 10, 512])

Encoder Layer 5 shape: torch.Size([8, 10, 512])
Decoder Layer 0 shape: torch.Size([8, 10, 512])
Decoder Layer 1 shape: torch.Size([8, 10, 512])
Decoder Layer 2 shape: torch.Size([8, 10, 512])
Decoder Layer 3 shape: torch.Size([8, 10, 512])
Decoder Layer 4 shape: torch.Size([8, 10, 512])
Decoder Layer 5 shape: torch.Size([8, 10, 512])

Final Output Statistics:
Mean: 0.0000
Std: 1.0000
Min: -3.7172
Max: 4.1310

Shape Preservation Check:
Input shapes - Encoder: torch.Size([8, 10, 512]), Decoder: torch.Size([8, 10, 512])
Output shape: torch.Size([8, 10, 512])

Mean absolute difference between input and output: 0.9379
Transformation occurred: Yes

Total number of parameters: 37,834,752

All tests passed successfully!


In [7]:
def test_complete_transformer():
    # Configuration
    d_model = 768
    d_embed = 1024
    d_ff = 2048
    num_heads = 8
    num_layers = 6
    max_position_embeddings = 512
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", 
                                            use_fast=True, 
                                            use_multiprocessing=False)
    vocab_size = tokenizer.vocab_size
    
    # Create sample source and target sequences
    src_sequences = [
        "I've been waiting for a HuggingFace course my whole life.",
        "So have I!"
    ]
    # Pretend these are translations
    tgt_sequences = [
        "J'ai attendu un cours HuggingFace toute ma vie.",
        "Moi aussi!"
    ]
    
    # Tokenize source and target sequences
    src_inputs = tokenizer(src_sequences, truncation=True, padding="longest", return_tensors="pt")
    tgt_inputs = tokenizer(tgt_sequences, truncation=True, padding="longest", return_tensors="pt")
    
    # Create transformer model
    transformer = Transformer(
        num_layer=num_layers,
        d_model=d_model,
        d_embed=d_embed,
        d_ff=d_ff,
        num_head=num_heads,
        src_vocab_size=vocab_size,
        tgt_vocab_size=vocab_size,
        max_position_embeddings=max_position_embeddings
    )
    
    # Set to eval mode
    transformer.eval()
    
    # Create padding mask from attention mask
    padding_mask = src_inputs['attention_mask'].unsqueeze(1).unsqueeze(2)
    
    print("\nInput Shapes:")
    print(f"Source tokens: {src_inputs['input_ids'].shape}")
    print(f"Target tokens: {tgt_inputs['input_ids'].shape}")
    
    # Forward pass
    with torch.no_grad():
        output = transformer(
            src_tokens=src_inputs['input_ids'],
            tgt_tokens=tgt_inputs['input_ids'],
            padding_mask=padding_mask
        )
    
    print("\nOutput Analysis:")
    print(f"Output shape: {output.shape}")  # Should be [batch_size, tgt_len, vocab_size]
    
    # Verify output is proper probability distribution
    print("\nProbability Distribution Check:")
    print(f"Sum to 1: {torch.allclose(output.exp().sum(dim=-1), torch.ones_like(output.exp().sum(dim=-1)))}")
    print(f"Max probability: {output.exp().max().item():.4f}")
    print(f"Min probability: {output.exp().min().item():.4f}")
    
    # Check if we can get predictions
    predictions = output.argmax(dim=-1)
    print("\nSample Predictions:")
    print("Original target:")
    print(tgt_sequences[0])
    print("\nModel output (decoded):")
    print(tokenizer.decode(predictions[0]))
    
    # Test backward pass
    transformer.train()
    output = transformer(
        src_tokens=src_inputs['input_ids'],
        tgt_tokens=tgt_inputs['input_ids'],
        padding_mask=padding_mask
    )
    
    # Calculate loss (cross entropy)
    loss = F.nll_loss(
        output.view(-1, vocab_size),
        tgt_inputs['input_ids'].view(-1)
    )
    
    # Test backward pass
    loss.backward()
    
    # Verify gradients
    has_gradients = all(p.grad is not None for p in transformer.parameters())
    print("\nTraining Check:")
    print(f"Loss value: {loss.item():.4f}")
    print(f"Has gradients: {has_gradients}")
    
    return output, predictions

# Run test
output, predictions = test_complete_transformer()


Input Shapes:
Source tokens: torch.Size([2, 16])
Target tokens: torch.Size([2, 17])

Output Analysis:
Output shape: torch.Size([2, 17, 30522])

Probability Distribution Check:
Sum to 1: True
Max probability: 0.0005
Min probability: 0.0000

Sample Predictions:
Original target:
J'ai attendu un cours HuggingFace toute ma vie.

Model output (decoded):
##aco bearer barriedate gate spoil lowlands tam navigation growls 1971 painfully demand negativelyzam [unused158] lowlands

Training Check:
Loss value: 10.7329
Has gradients: True


---

Continue with visual exploration -> `transformer_visualization.ipynb`
