In [1]:
# imports
from pathlib import Path
import sys  

# Get my_package directory path from Notebook
parent_dir = str(Path().resolve().parents[1])

# Add to sys.path
sys.path.insert(0, parent_dir)

In [2]:
import torch 
import math
from torch import nn
import torch.nn.functional as F
from torch import optim
from src.transformers.models.functionals import create_cross_attention_mask

In [3]:
embed_dim = 128
num_heads = 8
hidden_dim = 200
max_len = 5000
batch_size = 32
d_ff = hidden_dim
num_head = 8


In [31]:
def get_inputs_tensors():

    src = torch.tensor(
        [[ 1,3,4,2,0,0,0],
        [ 1,3,4,4,2,0,0],
        [ 1,2,0,0,0,0,0]]
    ) # [ 3, 7]
        
    # en_tensor = torch.randint(1, 10**4,[batch_size, seq_len], dtype=int)
    tgt = torch.tensor(
        [[ 1,3,2,0,0,0,0,0],
        [ 1,3,3,3,2,0,0,0],
        [ 1,3,2,0,0,0,0,0]]
    ) # [ 3, 8]

    print(src.shape, tgt.shape)

    embedding = nn.Embedding(10**4, embedding_dim=embed_dim)
    src_embedding = embedding(src)
    tgt_embedding = embedding(tgt)

    print(src_embedding.shape, tgt_embedding.shape)
    cross_padding_mask = create_cross_attention_mask(tgt, src)
    
    return tgt, src, src_embedding, tgt_embedding, cross_padding_mask


tgt, src, src_embedding, tgt_embedding, cross_padding_mask = get_inputs_tensors()

torch.Size([3, 7]) torch.Size([3, 8])
torch.Size([3, 7, 128]) torch.Size([3, 8, 128])


In [73]:
def show_model_parameters(model: nn.Module):
    print(model)
    for p in model.parameters():
        if p.requires_grad:
            print(p.name, p.data)
        else:
            print(p.name)
            print(p)

def check_gradient_explosion(
    model: nn.Module,
    target: torch.Tensor, 
    forward_args: list,
    forward_kwargs:dict
):
    torch.cuda.empty_cache()
    optimizer = optim.SGD(model.parameters(), lr=0.5)
    criterion = nn.MSELoss()
    
    optimizer.zero_grad()
    outputs = model(*forward_args, **forward_kwargs)
    loss_value = criterion(outputs, target)
    loss_value.backward()
    
    print(nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0))
    optimizer.step()
    
    return outputs



# **Layers**

In [82]:
from src.transformers.models.layers import EncoderLayer, DecoderLayer

criterion = nn.MSELoss()

encoder_layer = EncoderLayer(
    embed_dim=embed_dim,
    num_heads=num_head,
)    

decoder_layer = DecoderLayer(
    embed_dim=embed_dim,
    num_heads=num_head,
)  

show_model_parameters(encoder_layer)
show_model_parameters(decoder_layer)

EncoderLayer(
  (self_attention): MultiHeadAttention(
    (w_q): Linear(in_features=128, out_features=128, bias=True)
    (w_k): Linear(in_features=128, out_features=128, bias=True)
    (w_v): Linear(in_features=128, out_features=128, bias=True)
    (w_o): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (feed_forward): FeedForward(
    (linear1): Linear(in_features=128, out_features=128, bias=True)
    (linear2): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (relu): ReLU()
    (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
None tensor([[ 0.0283,  0.0774, -0.0439,  ..., -0.0065,  0.0478,  0.0393],
        [ 0.0179,  0.0412, -0.0012,  ...,  0.0274, -0.0614,  0.0683],
        [-0.0677, -0

In [80]:
encoder_layer = EncoderLayer(
    embed_dim=embed_dim,
    num_heads=num_head,
)   

tgt, src, src_embedding, tgt_embedding, cross_padding_mask = get_inputs_tensors()
memory = check_gradient_explosion( 
                                  encoder_layer,
                                  src_embedding, 
                                  [src_embedding],
                                  {'self_is_causal': True}
)


torch.Size([3, 7]) torch.Size([3, 8])
torch.Size([3, 7, 128]) torch.Size([3, 8, 128])
tensor(0.7772)


In [105]:
decoder_layer = DecoderLayer(
    embed_dim=embed_dim,
    num_heads=num_head,
)   
tgt, src, src_embedding, tgt_embedding, cross_padding_mask = get_inputs_tensors()
logits = check_gradient_explosion( 
                                  decoder_layer,
                                  tgt_embedding, 
                                  [tgt_embedding, src_embedding],
                                  {'self_is_causal': True,
                                   'cross_padding_mask': None}
)

torch.Size([3, 7]) torch.Size([3, 8])
torch.Size([3, 7, 128]) torch.Size([3, 8, 128])
tensor(1.0517)


In [8]:
logits = decoder_layer(
    tgt_embedding,
    memory,
    self_is_causal = True,
    cross_padding_mask = cross_padding_mask
)

print(
    f"\n{src_embedding.shape=}",
    f"\n{tgt_embedding.shape=}"
    f"\n{memory.shape=}",
    f"\n{logits.shape=}"
)



src_embedding.shape=torch.Size([3, 7, 128]) 
tgt_embedding.shape=torch.Size([3, 8, 128])
memory.shape=torch.Size([3, 7, 128]) 
logits.shape=torch.Size([3, 8, 128])


# **Encoder**

In [9]:
from src.transformers.models.encoders import Encoder

encoder = Encoder(
    vocab_size = 10**4,
    embed_dim=embed_dim,
    num_heads=num_head,
    n_layers=2,
    max_len=5000,
)

print(encoder)

  from .autonotebook import tqdm as notebook_tqdm


Encoder(
  (embedding): Embedding(10000, 128)
  (positional_encoding): PositionalEncoding()
  (layers): ModuleList(
    (0-1): 2 x EncoderLayer(
      (self_attention): MultiHeadAttention(
        (w_q): Linear(in_features=128, out_features=128, bias=True)
        (w_k): Linear(in_features=128, out_features=128, bias=True)
        (w_v): Linear(in_features=128, out_features=128, bias=True)
        (w_o): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): FeedForward(
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (relu): ReLU()
        (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout)

In [10]:
memory = encoder(
    src,
    src_padding_mask = None,
    self_is_causal = True
)

print(
    f"\n{memory.shape=}",
)



memory.shape=torch.Size([3, 7, 128])


# **Decoders**

In [11]:
from src.transformers.models.decoders import Decoder

decoder = Decoder(
    vocab_size = 10**4,
    embed_dim=embed_dim,
    num_heads=num_head,
    n_layers=2,
    max_len=5000,
)

print(decoder)

Decoder(
  (embedding): Embedding(10000, 128)
  (positional_encoding): PositionalEncoding()
  (layers): ModuleList(
    (0-1): 2 x DecoderLayer(
      (self_attention): MultiHeadAttention(
        (w_q): Linear(in_features=128, out_features=128, bias=True)
        (w_k): Linear(in_features=128, out_features=128, bias=True)
        (w_v): Linear(in_features=128, out_features=128, bias=True)
        (w_o): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (cross_attention): MultiHeadAttention(
        (w_q): Linear(in_features=128, out_features=128, bias=True)
        (w_k): Linear(in_features=128, out_features=128, bias=True)
        (w_v): Linear(in_features=128, out_features=128, bias=True)
        (w_o): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): FeedForward(
        (linear1): Linear(in_features=128, out_features=128, bias=True)
 

In [12]:
logits = decoder(
    tgt,
    memory,
    self_is_causal=True,
    cross_padding_mask=cross_padding_mask
)

print(
    f"\n{logits.shape=}"
)



logits.shape=torch.Size([3, 8, 128])


# **Model**

In [13]:
from src.transformers.models.optimus_model import OptimusTransformer

config = {
    'src_vocab_size': 10**4,
    'tgt_vocab_size': 11**4,
    'embed_dim': 128,
    'num_heads': 16,
    'n_layers': 1,
    'hidden_dim': 256,
    'max_seq_length': 100,
    'dropout': 0.1,
    'batch_size': 32,
    'num_epochs': 10,
    'warmup_steps': 4000,
    'label_smoothing': 0.1
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = OptimusTransformer(
src_vocab_size=config['src_vocab_size'],
tgt_vocab_size=config['tgt_vocab_size'],
n_layers=config['n_layers'],
embed_dim=config['embed_dim'],
num_heads=config['num_heads'],
max_len=config['max_seq_length'],
dropout=config['dropout'],
).to(device)

print(model)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)


src_vocab_size=10000, tgt_vocab_size=14641
OptimusTransformer(
  (encoder): Encoder(
    (embedding): Embedding(10000, 128)
    (positional_encoding): PositionalEncoding()
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attention): MultiHeadAttention(
          (w_q): Linear(in_features=128, out_features=128, bias=True)
          (w_k): Linear(in_features=128, out_features=128, bias=True)
          (w_v): Linear(in_features=128, out_features=128, bias=True)
          (w_o): Linear(in_features=128, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): FeedForward(
          (linear1): Linear(in_features=128, out_features=128, bias=True)
          (linear2): Linear(in_features=128, out_features=128, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (relu): ReLU()
          (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        )
        (norm1): LayerNorm((128,), eps=1e-05, e

In [14]:
with torch.no_grad():
    output = model(
        src,
        tgt,
        cross_padding_mask = cross_padding_mask,
        tgt_is_causal = True,
        memory_is_causal = True
    )

print(output.shape)

torch.Size([3, 8, 14641])


In [15]:
start_seq = torch.tensor([[1],[1],[1]])
cross_padding_mask = create_cross_attention_mask(start_seq, src)
print(cross_padding_mask)
with torch.no_grad():
    output = model(
        src,
        start_seq,
        cross_padding_mask = cross_padding_mask,
        tgt_is_causal = True,
        memory_is_causal = True
    )

print(torch.argmax(output))


tensor([[[[False, False, False, False,  True,  True,  True]]],


        [[[False, False, False, False, False,  True,  True]]],


        [[[False, False,  True,  True,  True,  True,  True]]]])
tensor(3362)


In [16]:
print("=== Test Forward Pass ===")
with torch.no_grad():
    output = model(
        src,
        tgt,
        cross_padding_mask = cross_padding_mask,
        tgt_is_causal = True,
        memory_is_causal = True
    )
    print(f"Input source shape: {src.shape}")
    print(f"Input target shape: {tgt.shape}")
    print(f"Output shape: {output.shape}")  # [batch_size, tgt_len, vocab_size]

# Test generazione
print("\n=== Test Generazione ===")
with torch.no_grad():
    generated = model.generate(src, max_len=20, start_token=1, end_token=2)
    print(f"Generated sequence shape: {generated.shape}")
    print(f"First generated sequence: {generated[0].tolist()}")


=== Test Forward Pass ===
Input source shape: torch.Size([3, 7])
Input target shape: torch.Size([3, 8])
Output shape: torch.Size([3, 8, 14641])

=== Test Generazione ===
Generated sequence shape: torch.Size([3, 21])
First generated sequence: [1, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127, 11127]
