In [1]:
import numpy as np
import torch
import torch.nn as nn

In [463]:
en_sentence = "this is a fat cat"      
fr_sentence = "c'est un chat gros"

In [464]:
# Special tokens
SOS = "<sos>" # start of sentence
EOS = "<eos>"

Step 1: tokenisation
 - we create a dictionary 

In [465]:
# Tokenize sentences
en_tokens = en_sentence.split() 
fr_tokens = [SOS] + fr_sentence.split() + [EOS]  # adding SOS and EOS

In [466]:
vocab_en = {word: idx for idx, word in enumerate((en_tokens))}

# for repeated words, use set instead
# {word: idx for idx, word in enumerate(set(en_tokens))}


In [467]:
vocab_en

{'this': 0, 'is': 1, 'a': 2, 'fat': 3, 'cat': 4}

In [468]:
vocab_fr = {word: idx for idx, word in enumerate((fr_tokens))}


In [469]:
vocab_fr


{'<sos>': 0, "c'est": 1, 'un': 2, 'chat': 3, 'gros': 4, '<eos>': 5}

Step 2: converting each token into an embedding vector
 - let's consider d_model = 6 for simplicity

In [470]:
d_model = 6

In [471]:
en_embed = torch.randn(len(vocab_en), d_model)

en_embed.size() # should be 5 x 6

torch.Size([5, 6])

In [472]:
fr_embed = torch.randn(len(vocab_fr), d_model)

fr_embed.size() # should be 6 x 6

torch.Size([6, 6])

Step 3: adding positional encoding

In [473]:
seq_len = len(vocab_en)   # length of input seq of en = 5

In [474]:
# to create positional encoding (sin/cos formula)
def pos_encoding(seq_len, d_model, embed):

    pos_encod = np.zeros((seq_len, d_model)) # np.zeros expects a tuple for shape

    for pos in range(seq_len):
        for i in range(d_model):
            if i % 2 == 0:
                pos_encod[pos, i] = np.sin(pos / (10000 ** (i / d_model)))
            else:
                pos_encod[pos, i] = np.cos(pos / (10000 ** ((i - 1) / d_model)))
    
    return pos_encod


In [475]:
fr_pos_encod = pos_encoding(6, 6, fr_embed)
fr_pos_encod 

array([[ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [ 0.84147098,  0.54030231,  0.04639922,  0.99892298,  0.00215443,
         0.99999768],
       [ 0.90929743, -0.41614684,  0.0926985 ,  0.99569422,  0.00430886,
         0.99999072],
       [ 0.14112001, -0.9899925 ,  0.1387981 ,  0.9903207 ,  0.00646326,
         0.99997911],
       [-0.7568025 , -0.65364362,  0.18459872,  0.98281398,  0.00861763,
         0.99996287],
       [-0.95892427,  0.28366219,  0.23000171,  0.97319022,  0.01077197,
         0.99994198]])

In [476]:
# adding positional encoding to french embeddings
fr_embeddings_with_position = fr_embed + fr_pos_encod
fr_embeddings_with_position = torch.tensor(fr_embeddings_with_position, dtype=torch.float32)

print("target (french) embeddings with positional encoding:\n", fr_embeddings_with_position)


target (french) embeddings with positional encoding:
 tensor([[-1.8414, -0.1653, -1.6628, -0.9757,  0.5443,  2.3233],
        [ 0.7097,  0.8202,  0.1538,  1.2056, -1.7091,  1.3063],
        [ 0.9607, -0.7560,  1.3283,  0.2207, -0.3546,  0.3512],
        [-1.1170, -2.5425,  0.0085,  0.8106,  0.2896, -0.6482],
        [-1.7519, -2.2727, -0.0269,  2.0114, -0.7436,  1.1899],
        [-2.2542,  0.8670,  0.3996, -0.7127, -0.0201, -0.3976]])


  fr_embeddings_with_position = torch.tensor(fr_embeddings_with_position, dtype=torch.float32)


In [477]:
en_pos_encod = pos_encoding(5, 6, en_embed)
en_pos_encod 

array([[ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [ 0.84147098,  0.54030231,  0.04639922,  0.99892298,  0.00215443,
         0.99999768],
       [ 0.90929743, -0.41614684,  0.0926985 ,  0.99569422,  0.00430886,
         0.99999072],
       [ 0.14112001, -0.9899925 ,  0.1387981 ,  0.9903207 ,  0.00646326,
         0.99997911],
       [-0.7568025 , -0.65364362,  0.18459872,  0.98281398,  0.00861763,
         0.99996287]])

In [478]:
# adding positional encoding to english embeddings
en_embeddings_with_position = en_embed + en_pos_encod
en_embeddings_with_position = torch.tensor(en_embeddings_with_position, dtype = torch.float32)

print("input (en) embeddings with positional encoding:\n", en_embeddings_with_position)


input (en) embeddings with positional encoding:
 tensor([[-0.0771,  1.1392, -0.2497, -0.5263, -0.7974,  0.4814],
        [ 0.1057,  0.4611, -1.2704,  0.6989,  1.0642,  0.7467],
        [ 2.3867, -0.5689,  0.2371,  1.6630, -1.0370,  2.0919],
        [ 0.2475, -2.0977, -0.2304,  1.3468, -0.8500,  2.1827],
        [-1.8206, -2.3345,  0.9096,  0.3176, -1.5079,  1.5184]])


  en_embeddings_with_position = torch.tensor(en_embeddings_with_position, dtype = torch.float32)


In [479]:
fr_embed_tensor = torch.tensor(fr_embeddings_with_position, dtype = torch.float32)


  fr_embed_tensor = torch.tensor(fr_embeddings_with_position, dtype = torch.float32)


Step 4: Masked self-attention for the decoder 

In [480]:
fr_seq_len = len(vocab_fr)

In [481]:
# creates a triangular matrix wwith lower half as 1 rest 0 (a mask for future words)
mask = np.tril(np.ones((fr_seq_len, fr_seq_len))) 


In [482]:
mask

array([[1., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0.],
       [1., 1., 1., 1., 0., 0.],
       [1., 1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1., 1.]])

In [483]:
mask[mask == 0] = -1e9 #-np.infty
mask[mask == 1] = 0
mask   # -inf = no context whatsoever
# tho infinity shouldn't be used, we use a very small number (1e-9) for a working transformer

array([[ 0.e+00, -1.e+09, -1.e+09, -1.e+09, -1.e+09, -1.e+09],
       [ 0.e+00,  0.e+00, -1.e+09, -1.e+09, -1.e+09, -1.e+09],
       [ 0.e+00,  0.e+00,  0.e+00, -1.e+09, -1.e+09, -1.e+09],
       [ 0.e+00,  0.e+00,  0.e+00,  0.e+00, -1.e+09, -1.e+09],
       [ 0.e+00,  0.e+00,  0.e+00,  0.e+00,  0.e+00, -1.e+09],
       [ 0.e+00,  0.e+00,  0.e+00,  0.e+00,  0.e+00,  0.e+00]])

In [484]:
scaled = torch.tensor(mask, dtype=torch.float32) + fr_embed_tensor
scaled

tensor([[-1.8414e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09],
        [ 7.0966e-01,  8.2017e-01, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09],
        [ 9.6071e-01, -7.5597e-01,  1.3283e+00, -1.0000e+09, -1.0000e+09,
         -1.0000e+09],
        [-1.1170e+00, -2.5425e+00,  8.4663e-03,  8.1057e-01, -1.0000e+09,
         -1.0000e+09],
        [-1.7519e+00, -2.2727e+00, -2.6877e-02,  2.0114e+00, -7.4357e-01,
         -1.0000e+09],
        [-2.2542e+00,  8.6703e-01,  3.9961e-01, -7.1266e-01, -2.0102e-02,
         -3.9760e-01]])

Step 5: Applying softmax to get attention weights

In [485]:
# softmax (vector to prob)

def softmax(x):
    exp_x = torch.exp(x - torch.max(x))  # Subtract max for numerical stability
    return exp_x / exp_x.sum(dim=-1, keepdim=True)

In [486]:
attention = softmax(scaled)
torch.tensor(attention, dtype=torch.float32)

attention

  torch.tensor(attention, dtype=torch.float32)


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4724, 0.5276, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3811, 0.0685, 0.5504, 0.0000, 0.0000, 0.0000],
        [0.0893, 0.0215, 0.2753, 0.6139, 0.0000, 0.0000],
        [0.0189, 0.0112, 0.1058, 0.8124, 0.0517, 0.0000],
        [0.0172, 0.3890, 0.2437, 0.0801, 0.1602, 0.1098]])

Step 6: Computing weighted sum of embeddings using attention weights

In [487]:
out = torch.matmul(attention, scaled)
torch.tensor(out, dtype=torch.float32)

out

  torch.tensor(out, dtype=torch.float32)


tensor([[-1.8414e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09],
        [-4.9545e-01, -4.7240e+08, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09],
        [-1.2436e-01, -3.8110e+08, -4.4957e+08, -1.0000e+09, -1.0000e+09,
         -1.0000e+09],
        [-5.7052e-01, -8.9330e+07, -1.1080e+08, -3.8608e+08, -1.0000e+09,
         -1.0000e+09],
        [-9.2311e-01, -1.8853e+07, -3.0054e+07, -1.3588e+08, -9.4832e+08,
         -1.0000e+09],
        [-1.3912e-01, -1.7154e+07, -4.0612e+08, -6.4985e+08, -7.2999e+08,
         -8.9018e+08]])

Step 7: Add and Norm

- Adding residual connection --> to ensure that there is a stronger information signal that flows through the network

- Normalizing --> needed to prevent vanishing gradient during backprop

In [488]:
# remember fr_embed_tensor = torch.tensor((fr_embed + fr_pos_encod), dtype = torch.float32)

out_with_residual = out + fr_embed_tensor
torch.tensor(out_with_residual, dtype = torch.float32)

out_with_residual

  torch.tensor(out_with_residual, dtype = torch.float32)


tensor([[-3.6827e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09],
        [ 2.1421e-01, -4.7240e+08, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09],
        [ 8.3635e-01, -3.8110e+08, -4.4957e+08, -1.0000e+09, -1.0000e+09,
         -1.0000e+09],
        [-1.6875e+00, -8.9330e+07, -1.1080e+08, -3.8608e+08, -1.0000e+09,
         -1.0000e+09],
        [-2.6751e+00, -1.8853e+07, -3.0054e+07, -1.3588e+08, -9.4832e+08,
         -1.0000e+09],
        [-2.3933e+00, -1.7154e+07, -4.0612e+08, -6.4985e+08, -7.2999e+08,
         -8.9018e+08]])

In [489]:
# norm
layer_norm = torch.nn.LayerNorm(d_model)
normalized_out = layer_norm(out_with_residual)

In [490]:
normalized_out

tensor([[ 2.2361, -0.4472, -0.4472, -0.4472, -0.4472, -0.4472],
        [ 1.9360,  0.7091, -0.6613, -0.6613, -0.6613, -0.6613],
        [ 1.6469,  0.6638,  0.4872, -0.9327, -0.9327, -0.9327],
        [ 1.0281,  0.8151,  0.7638,  0.1072, -1.3571, -1.3571],
        [ 0.8083,  0.7654,  0.7400,  0.4994, -1.3478, -1.4653],
        [ 1.3106,  1.2605,  0.1249, -0.5868, -0.8207, -1.2884]],
       grad_fn=<NativeLayerNormBackward0>)

Step 8: Feed-Forward Network
 - a feed-forward network (FFN) that consists of two linear layers with a ReLU activation in between

In [491]:
ffn = nn.Sequential(
    nn.Linear(d_model, 4 * d_model),  # Expand the model dimension
    nn.ReLU(),
    nn.Linear(4 * d_model, d_model)   # Project back to original size
)

ffn_out = ffn(normalized_out)
ffn_out

tensor([[-0.3263,  0.4261, -0.6664,  0.5714,  1.0014, -0.1260],
        [-0.2389,  0.2112, -0.6452,  0.4037,  0.9688, -0.1275],
        [-0.2358,  0.1036, -0.7250,  0.3573,  0.7150, -0.3568],
        [-0.2452,  0.0290, -0.7106,  0.2980,  0.2706, -0.3207],
        [-0.2525,  0.0448, -0.6882,  0.3072,  0.1719, -0.2632],
        [-0.2428, -0.0014, -0.7032,  0.2823,  0.7057, -0.1947]],
       grad_fn=<AddmmBackward0>)

Step 9: This completes one Transformer Decoder layer.

In [492]:
ffn_out_with_residual = ffn_out + normalized_out  # Skip connection
normalized_ffn_out = layer_norm(ffn_out_with_residual)

normalized_ffn_out

tensor([[ 1.8568, -0.1767, -1.3273, -0.0237,  0.4291, -0.7582],
        [ 1.5805,  0.8139, -1.3832, -0.3483,  0.2093, -0.8723],
        [ 1.6254,  0.8963, -0.2427, -0.6251, -0.2198, -1.4341],
        [ 0.9436,  1.0079,  0.1752,  0.5458, -1.0250, -1.6476],
        [ 0.6741,  0.9304,  0.1664,  0.9268, -1.0705, -1.6271],
        [ 1.1568,  1.3591, -0.5846, -0.2949, -0.0945, -1.5418]],
       grad_fn=<NativeLayerNormBackward0>)

Step 10: Multi-Head Self-Attention for Encoder

In [493]:
multihead_attn = nn.MultiheadAttention(embed_dim = d_model, num_heads = 2, batch_first = True)
encoder_output, _ = multihead_attn(en_embeddings_with_position, en_embeddings_with_position, en_embeddings_with_position)

In [494]:
encoder_output = layer_norm(encoder_output + en_embeddings_with_position)

In [495]:
encoder_ffn_out = ffn(encoder_output)
encoder_ffn_out = layer_norm(encoder_ffn_out + encoder_output)


In [496]:
encoder_ffn_out

tensor([[-0.4807,  1.3482, -0.6100, -0.5612, -1.1166,  1.4204],
        [-0.6194, -0.0743, -1.7432,  0.1996,  1.2402,  0.9972],
        [ 0.7156, -0.9385, -0.9282,  1.2744, -1.0897,  0.9663],
        [-0.3832, -1.3100, -0.6968,  1.3004, -0.2818,  1.3714],
        [-1.3030, -0.9353,  0.5255,  1.0738, -0.6276,  1.2666]],
       grad_fn=<NativeLayerNormBackward0>)

Step 11: Cross-Attention in Decoder
 - The decoder attends to both its own tokens (masked self-attention) and the encoder's output (cross-attention)

In [497]:
cross_attn = nn.MultiheadAttention(embed_dim=d_model, num_heads=2, batch_first=True)

cross_attn_output, _ = cross_attn(normalized_ffn_out, encoder_ffn_out, encoder_ffn_out)
cross_attn_output = layer_norm(cross_attn_output + normalized_ffn_out)  # Residual connection



In [498]:
cross_ffn_out = ffn(cross_attn_output)
cross_ffn_out = layer_norm(cross_ffn_out + cross_attn_output)  # Add & Norm



In [499]:
cross_ffn_out

tensor([[ 1.1518, -0.1076, -1.6454,  0.4414,  1.0145, -0.8547],
        [ 1.0513,  0.6750, -1.6408,  0.1474,  0.8050, -1.0379],
        [ 1.3299,  0.7086, -0.6739, -0.0926,  0.4724, -1.7445],
        [ 0.7247,  0.8482, -0.1542,  1.0551, -0.7086, -1.7653],
        [ 0.4279,  0.7598, -0.1508,  1.3898, -0.8005, -1.6262],
        [ 0.8792,  1.0881, -0.9179,  0.1834,  0.4807, -1.7134]],
       grad_fn=<NativeLayerNormBackward0>)

In [500]:
cross_ffn_out = ffn(cross_attn_output)
cross_ffn_out = layer_norm(cross_ffn_out + cross_attn_output)  # Add & Norm


In [501]:
final_projection = nn.Linear(d_model, len(vocab_fr))  # Map d_model → vocab_size
logits = final_projection(cross_ffn_out)


In [502]:
loss_fn = nn.CrossEntropyLoss()

# Convert target sentence to tensor (ignoring SOS/EOS)
target_tensor = torch.tensor([vocab_fr[word] for word in fr_tokens if word in vocab_fr], dtype=torch.long)

# Ensure correct shape for loss calculation
loss = loss_fn(logits.view(-1, len(vocab_fr)), target_tensor.view(-1))  # Flatten
loss.backward()


In [503]:
optimizer = torch.optim.Adam(list(ffn.parameters()) + list(cross_attn.parameters()) + list(final_projection.parameters()), lr=0.001)

optimizer.step()
optimizer.zero_grad()  # Clear gradients


In [504]:
# Get predicted word indices
predicted_indices = torch.argmax(logits, dim=-1)  # Choose highest probability index

# Reverse mapping from index to word
idx_to_word_fr = {idx: word for word, idx in vocab_fr.items()}

# Check shape before proceeding
print("Predicted indices shape:", predicted_indices.shape)  

# Ensure predicted_indices is a list of indices, not a single integer
if predicted_indices.dim() == 1:  # If it's a single sequence
    predicted_indices_list = predicted_indices.tolist()
elif predicted_indices.dim() == 2:  # If batch dimension exists
    predicted_indices_list = predicted_indices[0].tolist()
else:
    raise ValueError("Unexpected shape for predicted_indices:", predicted_indices.shape)

# Convert indices to words
predicted_sentence = [idx_to_word_fr[idx] for idx in predicted_indices_list]

# Remove special tokens
translated_sentence = " ".join([word for word in predicted_sentence if word not in [SOS, EOS]])

print("Predicted French Translation:", translated_sentence)




Predicted indices shape: torch.Size([6])
Predicted French Translation: un gros gros c'est gros
