# Comparing architectures in HuggingFace (BART, BERT, Pre/Post Norm. Layers, etc.)

In [1]:
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    BartConfig,
    BartModel,
    BertModel,
    BertConfig,
    GenerationConfig,
    EncoderDecoderModel,
    EncoderDecoderConfig,
    RobertaPreLayerNormConfig,
)

import torch
import torch.nn as nn



  from .autonotebook import tqdm as notebook_tqdm


## BART

In [2]:
bart_config = BartConfig(
    vocab_size=50265,
    max_position_embeddings=1024,
    encoder_layers=12,
    encoder_ffn_dim=4096,
    encoder_attention_heads=16,
    decoder_layers=12,
    decoder_ffn_dim=4096,
    decoder_attention_heads=16,
    encoder_layerdrop=0.0,
    decoder_layerdrop=0.0,
    activation_function="gelu",
    d_model=1024,
    dropout=0.1,
    attention_dropout=0.0,
    activation_dropout=0.0,
    init_std=0.02,
    classifier_dropout=0.0,
    scale_embedding=False,
    use_cache=True,
    num_labels=3,
    pad_token_id=1,
    bos_token_id=0,
    eos_token_id=2,
    is_encoder_decoder=False,
    decoder_start_token_id=2,
    forced_eos_token_id=2,
)

In [3]:
model = BartModel(bart_config)
print(model)

## BERT

In [None]:
bert_config = BertConfig(
    vocab_size=50265,
    max_position_embeddings=1024,
    encoder_layers=12,
    encoder_ffn_dim=4096,
    encoder_attention_heads=16,
    decoder_layers=12,
    decoder_ffn_dim=4096,
    decoder_attention_heads=16,
    encoder_layerdrop=0.0,
    decoder_layerdrop=0.0,
    activation_function="gelu",
    d_model=1024,
    dropout=0.1,
    attention_dropout=0.0,
    activation_dropout=0.0,
    init_std=0.02,
    classifier_dropout=0.0,
    scale_embedding=False,
    use_cache=True,
    num_labels=3,
    pad_token_id=1,
    bos_token_id=0,
    eos_token_id=2,
    is_encoder_decoder=True,
    decoder_start_token_id=2,
    forced_eos_token_id=2,
)

In [None]:
model = BertModel(bert_config)
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(1024, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

## Conclusions:
- BART model uses Post-normalization layers as in the original "Attention is All You Need"
- BERT model uses Pre-normalization layers thorugh Embeddding Normalization

## Custom PositionalEncoding

In [None]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=256):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1).squeeze(1)
        print(f'PE: {pe.shape}')
        self.register_buffer('pe', pe)

    def forward(self, x):
        print(f'X shape in forward: {x.shape}')
        print(f'PE shape in forward: {self.pe.shape}')
        x = x + self.pe[:x.size(1)]
        return self.dropout(x)

In [None]:
pe_encoder, pe_decoder = PositionalEncoding(256), PositionalEncoding(256)

model_config = BartConfig(
    vocab_size=100,
    d_model=256,
    encoder_layers=4,
    decoder_layers=4,
    encoder_attention_heads=4,
    decoder_attention_heads=4,
    encoder_ffn_dim=1024,
    decoder_ffn_dim=1024,
    max_position_embeddings=512, # Will be deleted
    activation_function='relu',
    pad_token_id=2,
    force_bos_token_to_be_generated=True,
    use_cache=False,
)

model = BartForConditionalGeneration(model_config)
print(model)

# Change positional encoding to the transformer
model.model.encoder.embed_positions = pe_encoder
model.model.decoder.embed_positions = pe_decoder

PE: torch.Size([256, 256])
PE: torch.Size([256, 256])
BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(100, 256, padding_idx=2)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(100, 256, padding_idx=2)
      (embed_positions): BartLearnedPositionalEmbedding(514, 256)
      (layers): ModuleList(
        (0-3): 4 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=256, out_features=256, bias=True)
            (v_proj): Linear(in_features=256, out_features=256, bias=True)
            (q_proj): Linear(in_features=256, out_features=256, bias=True)
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=256, out_features=1024, bias=True)
          (fc2): Linear(in_features=1024, out_features=256, bias=T

In [None]:
print(model)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(100, 256, padding_idx=2)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(100, 256, padding_idx=2)
      (embed_positions): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layers): ModuleList(
        (0-3): 4 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=256, out_features=256, bias=True)
            (v_proj): Linear(in_features=256, out_features=256, bias=True)
            (q_proj): Linear(in_features=256, out_features=256, bias=True)
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=256, out_features=1024, bias=True)
          (fc2): Linear(in_features=1024, out_features=256, bias=True)
          (fi

In [None]:
print(model.model.encoder)

BartEncoder(
  (embed_tokens): BartScaledWordEmbedding(100, 256, padding_idx=2)
  (embed_positions): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layers): ModuleList(
    (0-3): 4 x BartEncoderLayer(
      (self_attn): BartSdpaAttention(
        (k_proj): Linear(in_features=256, out_features=256, bias=True)
        (v_proj): Linear(in_features=256, out_features=256, bias=True)
        (q_proj): Linear(in_features=256, out_features=256, bias=True)
        (out_proj): Linear(in_features=256, out_features=256, bias=True)
      )
      (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (activation_fn): ReLU()
      (fc1): Linear(in_features=256, out_features=1024, bias=True)
      (fc2): Linear(in_features=1024, out_features=256, bias=True)
      (final_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layernorm_embedding): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)


In [None]:
print(model.model.encoder.embed_positions)

PositionalEncoding(
  (dropout): Dropout(p=0.1, inplace=False)
)


In [None]:
rand_batch = torch.rand((16, 128, 1, 256)) # Sequence of 128 columns with 256 channels

pos_enc_batch = model.model.encoder.embed_positions(rand_batch)
print(f'Pos enc batch: {pos_enc_batch.shape}')
enc_images = model.model.encoder(inputs_embeds=rand_batch)

X shape in forward: torch.Size([16, 128, 1, 256])
PE shape in forward: torch.Size([256, 256])
Pos enc batch: torch.Size([16, 128, 128, 256])
X shape in forward: torch.Size([16, 128, 256])
PE shape in forward: torch.Size([256, 256])


RuntimeError: The size of tensor a (128) must match the size of tensor b (16) at non-singleton dimension 1

In [None]:
import matplotlib.pyplot as plt
P = model.model.encoder.embed_positions.pe.squeeze(1)
cax = plt.matshow(P)
plt.gcf().colorbar(cax)