<a href="https://colab.research.google.com/github/alaaosama72/Transformer-model-/blob/main/Untitled87.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn.functional as F
from torch.nn import Linear, Softmax

# Hyperparameters for the attention demo
d_model = 512    # width of each token embedding
n_heads = 8      # how many attention heads share d_model
seq_length = 10  # tokens in the input sequence

# Uniform-random input of shape (batch_size=1, seq_length, d_model)
x = torch.rand(1, seq_length, d_model)

# Carve the model dimension into per-head slices
def split_heads(x, n_heads):
    """Split the last dimension of ``x`` into ``n_heads`` equal slices.

    Args:
        x: 3-D tensor of shape ``(batch_size, seq_length, d_model)``.
        n_heads: Number of heads. ``d_model`` has to be a multiple of it,
            otherwise the underlying ``view`` raises ``RuntimeError``.

    Returns:
        A view of ``x`` with shape
        ``(batch_size, n_heads, seq_length, d_model // n_heads)``.
    """
    n_batch, n_tokens, width = x.shape
    per_head = width // n_heads
    heads_last = x.view(n_batch, n_tokens, n_heads, per_head)
    # Swap the token and head axes so every head owns a (seq, depth) matrix.
    return heads_last.transpose(1, 2)

# Scaled dot-product attention
def scaled_dot_product_attention(q, k, v):
    """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    ``d_k`` is the size of the last dimension of ``q``. The softmax is taken
    over the last dimension of the scaled logits, i.e. over key positions.

    Args:
        q: Query tensor; its last two dimensions are ``(seq_q, d_k)``.
        k: Key tensor; its last two dimensions are ``(seq_k, d_k)``.
        v: Value tensor; its last two dimensions are ``(seq_k, d_v)``.

    Returns:
        A tuple ``(output, attention_weights)`` where ``attention_weights``
        is the softmax of the scaled logits and ``output`` is
        ``attention_weights @ v``.
    """
    d_k = q.size(-1)
    matmul_qk = torch.matmul(q, k.transpose(-2, -1))
    # Scale by a plain Python float rather than a float32 tensor: the
    # division then happens at the precision of the logits, so float64
    # inputs are not divided by a float32-rounded square root.
    scaled_attention_logits = matmul_qk / (d_k ** 0.5)
    # Functional softmax avoids building a throwaway nn.Softmax module on
    # every call.
    attention_weights = F.softmax(scaled_attention_logits, dim=-1)
    output = torch.matmul(attention_weights, v)
    return output, attention_weights

# Multi-head attention
class MultiHeadAttention(torch.nn.Module):
    """Multi-head self-attention applied to a single input tensor.

    Queries, keys and values are all linear projections of the same input.
    Each projection is split into ``n_heads`` slices, attention is computed
    per head, and the concatenated heads go through a final linear layer.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        n_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly.

    Raises:
        ValueError: If ``n_heads`` is not positive or ``d_model`` is not a
            multiple of ``n_heads``.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        # Fail at construction time: an uneven split can never be reshaped
        # into heads and would otherwise only surface as an opaque shape
        # error inside forward().
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"n_heads ({n_heads})"
            )
        self.d_model = d_model
        self.n_heads = n_heads

        # Projections for queries, keys and values, plus the output layer.
        self.wq = Linear(d_model, d_model)
        self.wk = Linear(d_model, d_model)
        self.wv = Linear(d_model, d_model)
        self.dense = Linear(d_model, d_model)

    def forward(self, x):
        """Run self-attention over ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, d_model)``.

        Returns:
            A tuple ``(output, attention_weights)``: ``output`` has shape
            ``(batch_size, seq_length, d_model)`` and ``attention_weights``
            is whatever ``scaled_dot_product_attention`` returns for the
            per-head queries, keys and values.
        """
        batch_size = x.size(0)

        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)

        # (batch, seq, d_model) -> (batch, heads, seq, depth)
        q = split_heads(q, self.n_heads)
        k = split_heads(k, self.n_heads)
        v = split_heads(v, self.n_heads)

        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v)
        # Move heads back next to depth; contiguous() is required before
        # view() can merge the two trailing axes into d_model.
        scaled_attention = scaled_attention.permute(0, 2, 1, 3).contiguous()
        original_size_attention = scaled_attention.view(batch_size, -1, self.d_model)

        output = self.dense(original_size_attention)
        return output, attention_weights

# Build the attention layer and run one forward pass on the random input
mha = MultiHeadAttention(d_model=d_model, n_heads=n_heads)
(output, attention_scores) = mha(x)

# Display the attention weights returned alongside the output
print("Attention Scores:", attention_scores)

Attention Scores: tensor([[[[0.0959, 0.0968, 0.0929, 0.1039, 0.1030, 0.1051, 0.0976, 0.0962,
           0.0996, 0.1089],
          [0.0948, 0.0953, 0.0955, 0.1038, 0.0980, 0.1042, 0.0953, 0.0968,
           0.1062, 0.1102],
          [0.0943, 0.0975, 0.0977, 0.1016, 0.1024, 0.1025, 0.0984, 0.0987,
           0.0984, 0.1086],
          [0.1018, 0.0930, 0.0993, 0.1042, 0.1021, 0.1033, 0.0974, 0.0973,
           0.0968, 0.1047],
          [0.0989, 0.0939, 0.0952, 0.1061, 0.1035, 0.1041, 0.0940, 0.0964,
           0.0951, 0.1129],
          [0.0996, 0.0962, 0.0954, 0.1051, 0.1030, 0.1049, 0.0986, 0.0984,
           0.0924, 0.1065],
          [0.0937, 0.1003, 0.0998, 0.1005, 0.0996, 0.1011, 0.0976, 0.0957,
           0.1018, 0.1099],
          [0.0978, 0.1001, 0.0908, 0.1010, 0.0994, 0.1055, 0.1013, 0.0981,
           0.1028, 0.1032],
          [0.0932, 0.0955, 0.0970, 0.1044, 0.1002, 0.1024, 0.0943, 0.1014,
           0.0999, 0.1118],
          [0.0979, 0.0977, 0.0913, 0.1053, 0.0984, 0.10