In [1]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is computed once in ``__init__`` with shape
    ``(1, max_len, d_model)``: even feature columns hold ``sin`` and odd
    feature columns hold ``cos`` of ``position * div_term``.

    Args:
        d_model: Size of the feature (last) dimension of the inputs.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim the cosine table to fit (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]

        # Register as a buffer so Module.to()/.cuda()/.half() move the table
        # together with the parameters. persistent=False keeps it out of
        # state_dict, so the checkpoint layout is the same as when `pe` was a
        # plain attribute.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, seq_len, d_model)``; the table is sliced
        along dimension 1 and broadcast over the batch dimension.
        """
        # The .to() is a no-op when the module already lives on x's device.
        return x + self.pe[:, :x.size(1), :].to(x.device)

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across ``num_heads`` heads.

    A single ``qkv_proj`` layer holds the query, key and value projection
    weights stacked along its output dimension; ``fc_out`` mixes the
    concatenated heads back to ``d_model`` features.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.head_dim = d_model // num_heads

        assert self.head_dim * num_heads == d_model, "d_model must be divisible by num_heads"

        self.qkv_proj = nn.Linear(d_model, d_model * 3)
        self.fc_out = nn.Linear(d_model, d_model)
        # A plain Python float works on any device; a tensor created on
        # "cuda if available" would mismatch a model that stays on the CPU.
        self.scale = math.sqrt(self.head_dim)

    def _split_heads(self, t):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, head_dim)``."""
        batch_size, seq_length, _ = t.shape
        return t.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, queries, keys, values, mask=None):
        """Attend from ``queries`` to ``keys``/``values``.

        Args:
            queries: Tensor of shape ``(batch, q_len, d_model)``.
            keys: Tensor of shape ``(batch, k_len, d_model)``.
            values: Tensor of shape ``(batch, k_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, num_heads, q_len, k_len)``; positions equal to 0 are
                excluded from attention. A row with every key masked yields
                NaN, because softmax is taken over all ``-inf`` scores.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size, q_len, _ = queries.shape

        # Each input has d_model features, so it must go through its own
        # d_model-row slice of the stacked projection. Concatenating the three
        # inputs would feed 3 * d_model features into a d_model-input layer,
        # and would also force the key length to equal the query length.
        w_q, w_k, w_v = self.qkv_proj.weight.chunk(3, dim=0)
        b_q, b_k, b_v = self.qkv_proj.bias.chunk(3, dim=0)
        q = self._split_heads(nn.functional.linear(queries, w_q, b_q))
        k = self._split_heads(nn.functional.linear(keys, w_k, b_k))
        v = self._split_heads(nn.functional.linear(values, w_v, b_v))

        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-inf"))
        attention = torch.softmax(scores, dim=-1)

        # Merge heads: (batch, heads, q_len, head_dim) -> (batch, q_len, d_model).
        out = torch.matmul(attention, v).transpose(1, 2).contiguous().view(batch_size, q_len, self.d_model)
        return self.fc_out(out)

class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: Linear -> ReLU -> Linear.

    Args:
        d_model: Input and output feature size.
        hidden_dim: Feature size of the intermediate layer.
    """

    def __init__(self, d_model, hidden_dim):
        super().__init__()
        # Expand to the hidden width, then project back to d_model.
        self.fc1 = nn.Linear(d_model, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``."""
        expanded = self.fc1(x)
        activated = self.relu(expanded)
        return self.fc2(activated)

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each followed by
    dropout, a residual add, and LayerNorm (post-norm ordering).

    Args:
        d_model: Feature size of the inputs.
        num_heads: Head count passed to ``MultiHeadAttention``.
        hidden_dim: Hidden width passed to ``FeedForward``.
        dropout: Dropout probability shared by both sub-layers.
    """

    def __init__(self, d_model, num_heads, hidden_dim, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, hidden_dim)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run ``x`` through the attention and feed-forward sub-layers.

        ``mask`` is forwarded unchanged to the attention module.
        """
        # Sub-layer 1: self-attention (x serves as queries, keys and values).
        attended = self.dropout(self.attn(x, x, x, mask))
        normed = self.norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward.
        transformed = self.dropout(self.ff(normed))
        return self.norm2(normed + transformed)

class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, attention over the encoder output,
    then feed-forward. Each sub-layer is followed by dropout, a residual add,
    and its own LayerNorm.

    Args:
        d_model: Feature size of the inputs.
        num_heads: Head count passed to both attention modules.
        hidden_dim: Hidden width passed to ``FeedForward``.
        dropout: Dropout probability shared by all three sub-layers.
    """

    def __init__(self, d_model, num_heads, hidden_dim, dropout=0.1):
        super().__init__()
        self.attn1 = MultiHeadAttention(d_model, num_heads)
        self.attn2 = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        """Run ``x`` through the three sub-layers.

        ``tgt_mask`` is passed to the self-attention call and ``src_mask`` to
        the attention call whose keys and values are ``enc_out``.
        """
        # Sub-layer 1: self-attention over the decoder input.
        self_attended = self.dropout(self.attn1(x, x, x, tgt_mask))
        state = self.norm1(x + self_attended)

        # Sub-layer 2: queries from the decoder, keys/values from the encoder.
        cross_attended = self.dropout(self.attn2(state, enc_out, enc_out, src_mask))
        state = self.norm2(state + cross_attended)

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.dropout(self.ff(state))
        return self.norm3(state + transformed)

class Transformer(nn.Module):
    """Encoder-decoder model assembled from ``EncoderBlock``/``DecoderBlock``.

    Args:
        input_dim: Number of rows in the encoder embedding table.
        output_dim: Number of rows in the decoder embedding table and the
            width of the final linear layer's output.
        d_model: Embedding / feature size used throughout.
        num_heads: Attention head count for every block.
        num_layers: Number of encoder blocks and of decoder blocks.
        hidden_dim: Hidden width of each block's feed-forward network.
        max_len: Number of positions precomputed by the positional encoding.
    """

    def __init__(self, input_dim, output_dim, d_model, num_heads, num_layers, hidden_dim, max_len):
        super().__init__()
        self.encoder_embedding = nn.Embedding(input_dim, d_model)
        self.decoder_embedding = nn.Embedding(output_dim, d_model)
        # One positional-encoding module is shared by both embeddings.
        self.pos_encoding = PositionalEncoding(d_model, max_len)

        encoder_stack = []
        for _ in range(num_layers):
            encoder_stack.append(EncoderBlock(d_model, num_heads, hidden_dim))
        self.encoder_layers = nn.ModuleList(encoder_stack)

        decoder_stack = []
        for _ in range(num_layers):
            decoder_stack.append(DecoderBlock(d_model, num_heads, hidden_dim))
        self.decoder_layers = nn.ModuleList(decoder_stack)

        self.fc_out = nn.Linear(d_model, output_dim)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Embed ``src`` and ``tgt``, run the encoder and decoder stacks, and
        return the output of the final linear layer applied to the decoder
        states.
        """
        memory = self.pos_encoding(self.encoder_embedding(src))
        decoded = self.pos_encoding(self.decoder_embedding(tgt))

        for encoder_block in self.encoder_layers:
            memory = encoder_block(memory, src_mask)

        # Every decoder block reads the final encoder output.
        for decoder_block in self.decoder_layers:
            decoded = decoder_block(decoded, memory, src_mask, tgt_mask)

        return self.fc_out(decoded)

# Example usage: construct a freshly initialised Transformer from the
# hyperparameters below and print its module tree. No forward pass is run here.
input_dim = 10000  # Hindi vocabulary size (rows of the encoder embedding)
output_dim = 10000  # English vocabulary size (rows of the decoder embedding, width of the output layer)
d_model = 512  # Embedding dimension; must be divisible by num_heads
num_heads = 8  # Number of attention heads
num_layers = 6  # Number of encoder blocks and of decoder blocks
hidden_dim = 2048  # Feedforward network hidden dimension
max_len = 100  # Maximum sequence length covered by the positional encoding

model = Transformer(input_dim, output_dim, d_model, num_heads, num_layers, hidden_dim, max_len)
print(model)


Transformer(
  (encoder_embedding): Embedding(10000, 512)
  (decoder_embedding): Embedding(10000, 512)
  (pos_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderBlock(
      (attn): MultiHeadAttention(
        (qkv_proj): Linear(in_features=512, out_features=1536, bias=True)
        (fc_out): Linear(in_features=512, out_features=512, bias=True)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (ff): FeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x DecoderBlock(
      (attn1): MultiHeadAttention(
        (qkv_proj): Linear(in_features=512, out_features=1536, bias=True)
        (fc_out): Linear(in_features=512, out_featu

In [2]:
from transformers import MarianMTModel, MarianTokenizer

# Load the pretrained Hindi -> English MarianMT checkpoint and its matching tokenizer.
# NOTE: this rebinds the module-level name `model`, which the previous cell bound
# to the hand-written Transformer instance.
model_name = "Helsinki-NLP/opus-mt-hi-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_text(sentence):
    """Translate ``sentence`` using the module-level ``tokenizer`` and ``model``.

    The input is tokenized into PyTorch tensors (with padding and truncation
    enabled), passed to ``model.generate``, and decoded with special tokens
    stripped. Only the first decoded string is returned.
    """
    encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    generated_ids = model.generate(**encoded)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

# Example translation: run one Hindi sentence through translate_text and
# print the English result.
hindi_text = "मुझे फुटबॉल खेलना पसंद है।"
translated_text = translate_text(hindi_text)
print("Translated:", translated_text)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translated: I like playing football.
