In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [42]:
#to create custom dataset pytorch inbuilt class used

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, english_sentences, lang_sentences):
        self.english_sentences = english_sentences
        self.lang_sentences = lang_sentences
    #rturn number of eng/hind or sentences in a list
    def __len__(self):
        return len(self.english_sentences)
    #return 1:1 mapping of one lang to other
    def __getitem__(self, idx):
        return self.english_sentences[idx], self.lang_sentences[idx]



In [2]:
class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = RoPEEmbedding(d_model)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token=True, end_token=True):
      def tokenize(sentence, start_token=True, end_token=True):
          # Convert sentence to list of word indices
          sentence_word_indices = [self.language_to_index[token] for token in list(sentence)]

          # Add start and end tokens if needed
          if start_token:
              sentence_word_indices.insert(0, self.language_to_index[self.START_TOKEN])
          if end_token:
              sentence_word_indices.append(self.language_to_index[self.END_TOKEN])

          # Pad the sentence to max_sequence_length
          while len(sentence_word_indices) < self.max_sequence_length:
              sentence_word_indices.append(self.language_to_index[self.PADDING_TOKEN])

          # Ensure the sequence length doesn't exceed max_sequence_length
          sentence_word_indices = sentence_word_indices[:self.max_sequence_length]

          return torch.tensor(sentence_word_indices)

      tokenized = []
      for sentence_num in range(len(batch)):
          tokenized.append(tokenize(batch[sentence_num], start_token, end_token))

      # All sentences are now padded to the same length, so stack them
      tokenized = torch.stack(tokenized)  # All tensors will have the same size now
      return tokenized


    def forward(self, x,start_token, end_token=True): # sentence
        x = self.batch_tokenize(x ,start_token,end_token)
        x = self.embedding(x)
        pos = self.position_encoder(x)
        x = self.dropout(x + pos)
        return x


In [3]:
class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        # create embedding of dmodel from input tokens
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = RoPEEmbedding(d_model)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self,batch,start,end):
      def tokenize(sentence):
        words = sentence.split()  # ✅ Now splits by words
        # for below The .get() method of a dictionary tries to find the token in language_to_index.If token exists in the dictionary, it returns the corresponding index.If token does not exist in the dictionary, it returns language_to_index[PADDING_TOKEN],
        sentence_word_indices = [self.language_to_index.get(token, self.language_to_index[self.PADDING_TOKEN]) for token in words]
        # ✅ Apply truncation BEFORE adding special tokens
        sentence_word_indices = sentence_word_indices[:self.max_sequence_length - (1 if start else 0) - (1 if end else 0)]

        # ✅ Add special tokens
        if start:
            sentence_word_indices.insert(0, self.language_to_index[self.START_TOKEN])

        if end:
            sentence_word_indices.append(self.language_to_index[self.END_TOKEN])


        # Apply padding if needed (padding should be added after truncation in case after trunction needs padding)
        if len(sentence_word_indices) < self.max_sequence_length:
            padding_length = self.max_sequence_length - len(sentence_word_indices)
            sentence_word_indices.extend([self.language_to_index[self.PADDING_TOKEN]] * padding_length)

        return torch.tensor(sentence_word_indices)

      return torch.stack([tokenize(sentence) for sentence in batch])


    def forward(self, x,start,end): # sentence
        x = self.batch_tokenize(x,start,end)
        print('input tokenization of x')
        print(x.shape)
        x = self.embedding(x)
        pos = self.position_encoder(x)
        #print('---positional embedding---')
        #print(pos.shape)
        x = self.dropout(x + pos)
        return x


#rope embedding
class RoPEEmbedding(torch.nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        assert embedding_dim % 2 == 0, "Embedding dimension must be even for RoPE"
        self.embedding_dim = embedding_dim

    def forward(self, x):
        """
        Forward pass for Rotary Position Embedding.

        Args:
        - x: Tensor of shape (batch_size, seq_len, embedding_dim)

        Returns:
        - Tensor with RoPE applied to the last two dimensions.
        """
        seq_len = x.shape[1]

        # Generate position indices
        position_ids = torch.arange(seq_len, dtype=torch.float32, device=x.device)

        # Compute the rotary angles
        freqs = 1.0 / (10000 ** (torch.arange(0, self.embedding_dim, 2, dtype=torch.float32, device=x.device) / self.embedding_dim))
        angles = torch.einsum('i,j->ij', position_ids, freqs)

        # Create the rotation matrix for sin and cos embeddings
        sin = torch.sin(angles).repeat_interleave(2, dim=-1)
        cos = torch.cos(angles).repeat_interleave(2, dim=-1)

        # Apply rotation using cos and sin embeddings
        x1 = x * cos + self.rotate_half(x) * sin
        return x1


    def rotate_half(self,x):
        """
        Rotate the last dimension of the input tensor by swapping odd and even elements and negating one.

        Args:
        - x: Tensor of shape (..., embedding_dim)

        Returns:
        - Rotated tensor of the same shape.
        """
        x1, x2 = x[..., ::2], x[..., 1::2]  # Split into even and odd dimensions
        return torch.cat((-x2, x1), dim=-1)

#sine embedding
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self, x):
        batch_size, seq_len, _ = x.size()

        # Generate the positional encoding
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1)
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)

        # Ensure the PE matches the batch size and sequence length
        PE = PE.unsqueeze(0).expand(batch_size, seq_len, self.d_model)

        return PE + x




In [4]:

#feedforward network
class feedforward(nn.Module):
  def __init__(self,d_model,hidlayer,dropout):
    super().__init__()
    self.d_model=d_model
    self.hidlayer=hidlayer
    self.linearlayer1=nn.Linear(self.d_model,self.hidlayer)
    self.linearlayer2=nn.Linear(self.hidlayer,self.d_model)
    self.dropout=nn.Dropout(dropout)
    self.activation=nn.ReLU()

  def forward(self,x):
    l1=self.linearlayer1(x)
    #print(f"x after first linear layer: {x.size()}")
    l1=self.activation(l1)
    #print(f"x after activation: {l1.size()}")
    l1=self.dropout(l1)
    #print(f"x after dropout 1: {l1.size()}")
    out=self.linearlayer2(l1)
    #print(f"x after 2nd linear layer: {out.size()}")
    #drop out gen not aplpied after 1st layhers
    out=self.dropout(out)
    #print(f"x after dropout 2: {out.size()}")
    return out

#multhead attention
class multihead_attention(nn.Module):
    def __init__(self,dmodel,heads=1,masking=False):
        super().__init__()
        self.heads = heads
        self.masking = masking
        self.dmodel=dmodel
        assert self.dmodel % self.heads == 0, "Embedding dimension must be divisible by num_heads"

        self.dmodel = dmodel
        self.head_dim = self.dmodel // self.heads
        self.wq = nn.Linear(self.dmodel, self.dmodel)
        self.wk = nn.Linear(self.dmodel, self.dmodel)
        self.wv = nn.Linear(self.dmodel, self.dmodel)
        self.linearlayer=nn.Linear(self.dmodel,self.dmodel)

        #print('heads =', self.heads)

    def scaled_dot_product_attention(self, q, k, v):
        dk = torch.tensor(q.shape[-1], dtype=torch.float32)
        scaled = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(dk)

        if self.masking:
            mask = torch.ones(q.shape[2], q.shape[2], device=q.device)
            mask = torch.tril(mask)
            mask[mask == 0] = -torch.inf
            mask[mask == 1] = 0
            scaled = scaled + mask

        attention = torch.softmax(scaled, dim=-1)
        scores = torch.matmul(attention, v)
        return attention, scores

    def forward(self, x):
        batch_size, sequence_length, input_dim = x.size()
        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)
        q = q.view(batch_size, sequence_length, self.heads, self.head_dim)
        k = k.view(batch_size, sequence_length, self.heads, self.head_dim)
        v = v.view(batch_size, sequence_length, self.heads, self.head_dim)
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)

        attention, scores = self.scaled_dot_product_attention(q, k, v)
        #print('scores init',scores.shape)
        scores = scores.reshape(batch_size, sequence_length, self.heads *self.head_dim)# we can use self.dmodel as well as last arg
        #print('scores shape',scores.shape)
        out=self.linearlayer(scores)
        #print('out',out.shape)
        #print('projected shape',projected.shape)
        #print()
        return out



#layer norm
class CustomLayerNorm(nn.Module):
    def __init__(self, normalized_shape, epsilon=1e-5):
        super(CustomLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(normalized_shape))
        self.beta = nn.Parameter(torch.zeros(normalized_shape))
        self.epsilon = epsilon

    def forward(self, x):
        # Calculate mean and std across the last dimension (features) for each sequence in the batch
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        x_normalized = (x - mean) / (std + self.epsilon)

        # Apply gamma and beta, which are learned parameters for normalization
        # The shape of gamma and beta should match the feature size
        return self.gamma.unsqueeze(0).unsqueeze(0) * x_normalized + self.beta.unsqueeze(0).unsqueeze(0)



class multihead_cross_attention(nn.Module):
    def __init__(self, dmodel, masking=None, heads=1):
        super().__init__()
        self.heads = heads
        self.masking = masking
        assert dmodel % heads == 0, "Embedding dimension must be divisible by num_heads"

        self.dmodel = dmodel
        self.head_dim = self.dmodel // self.heads
        self.wqc = nn.Linear(self.dmodel, self.dmodel)
        self.wkc = nn.Linear(self.dmodel, self.dmodel)
        self.wvc = nn.Linear(self.dmodel, self.dmodel)
        self.linearlayer=nn.Linear(self.dmodel,self.dmodel)
        #print('heads =', self.heads)

    def scaled_dot_product_attention(self, q, k, v):
        dk = torch.tensor(q.shape[-1], dtype=torch.float32)
        scaled = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(dk)

        if self.masking is not None:
            mask = torch.ones(q.shape[2], q.shape[2], device=q.device)
            mask = torch.tril(mask)
            mask[mask == 0] = -torch.inf
            mask[mask == 1] = 0
            scaled = scaled + mask

        attention = torch.softmax(scaled, dim=-1)
        scores = torch.matmul(attention, v)
        return attention, scores

    def forward(self, x,y):
        batch_size, sequence_length, input_dim = x.size()
        # if y.size(0) != x.size(0):
        #      pad_size = x.size(0) - y.size(0)
        #      padding = torch.zeros(pad_size, y.size(1), y.size(2), device=y.device)
        #      y = torch.cat([y, padding], dim=0)

        #print('x and y shape cross attention',x.shape,y.shape)

        # q and k from decoder
        q = self.wqc(x)
        k = self.wkc(y)
        # v from decoder
        v = self.wvc(y)
        q = q.view(batch_size, sequence_length, self.heads, self.head_dim)
        k = k.view(batch_size, sequence_length, self.heads, self.head_dim)
        v = v.view(batch_size, sequence_length, self.heads, self.head_dim)
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)

        attention, scores = self.scaled_dot_product_attention(q, k, v)
        #print('scores init',scores.shape)
        scores = scores.reshape(batch_size, sequence_length, self.heads *self.head_dim)# we can use self.dmodel as well as last arg
        #print('scores shape',scores.shape)
        out=self.linearlayer(scores)
        #print('out',out.shape)

        #print('projected shape',projected.shape)
        #print(type(out))
        return out




In [45]:
# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"
eng_to_index = {
    "Browse": 1, "the": 2, "various": 3, "methods": 4, "of": 5, "current": 6, "accessible": 7,
    "Hide": 8, "private": 9, "attributes": 10, "Method": 11,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}
hindi_to_index = {
    "इस": 1, "समय": 2, "जिसे": 3, "प्राप्त": 4, "किया": 5, "गया": 6, "हो": 7, "विभिन्न": 8,
    "विधियों": 9, "में": 10, "विचरण": 11, "करें": 12, "निजी": 13, "गुणों": 14, "को": 15,
    "छिपाएं": 16, "विधि": 17,
    START_TOKEN: 18, END_TOKEN: 19, PADDING_TOKEN: 0  # Special tokens
}

# === Example Bilingual Sentences (English ↔ Hindi) ===
bilingual_batch = [
    ("Browse the various methods of the current accessible", "इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें"),
    ("Hide private attributes", "निजी गुणों को छिपाएं"),
    ("Method", "विधि")
]

# === Parameters ===
max_seq_length = 5  # ✅ Limit to 5 words per sentence

# === Separate English and Hindi Sentences ===
english_sentences = [pair[0] for pair in bilingual_batch]
hindi_sentences = [pair[1] for pair in bilingual_batch]

d_model = 2
batch_size = 3
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 1
max_sequence_length = 5
hn_vocab_size = len(hindi_to_index)
eng_vocab_size = len(eng_to_index)
#torch.manual_seed(2)
tokenization=SentenceEmbedding(max_sequence_length, d_model, eng_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
engtoken=tokenization(english_sentences,start=True,end=True)
print('engtoken',engtoken)
hintoken=tokenization(['को'],start=True,end=True)
print('hintoken',hintoken)

input tokenization of x
torch.Size([3, 5])
engtoken tensor([[[ 3.2072, -4.1821],
         [-1.6482, -0.8999],
         [ 1.4268,  0.6317],
         [-0.0947, -0.0738],
         [ 1.0261,  0.4570]],

        [[ 3.2072, -4.1821],
         [ 0.7681, -1.5612],
         [-0.9090,  1.2549],
         [ 0.1845, -0.2415],
         [ 1.0261,  0.4570]],

        [[ 3.2072, -4.1821],
         [ 1.5595, -0.7858],
         [-1.2191,  0.8004],
         [-0.0000,  0.0267],
         [ 1.2430,  0.5002]]], grad_fn=<MulBackward0>)
input tokenization of x
torch.Size([1, 5])
hintoken tensor([[[ 3.2072, -4.1821],
         [-1.2375,  0.0000],
         [-1.2191,  0.8004],
         [-0.2262,  0.0267],
         [ 1.2430,  0.5002]]], grad_fn=<MulBackward0>)


In [46]:
#encoder


#encoder layer

class encoderlayer(nn.Module):
  def __init__(self,max_sequence_length,d_model,hidlayer,dropout,num_heads,masking):
    super().__init__()
    self.d_model,self.hidlayer,self.dropout,self.num_heads,self.masking=d_model,hidlayer,dropout,num_heads,masking
    self.multihead_attention=multihead_attention(d_model,num_heads,masking,)
    self.feedforward=feedforward(self.d_model,self.hidlayer,self.dropout)
    self.layernorm1=CustomLayerNorm(self.d_model)
    self.layernorm2=CustomLayerNorm(self.d_model)
    self.max_sequence_length=max_sequence_length

  def forward(self,x):
    #mulihead
    #print('--mulihead attention--')
    mha=self.multihead_attention(x)
    #print(mha.shape)
    #layernorm
    #print('--layer normalisation--')
    ln1=self.layernorm1(mha+x)
    #print(ln1.shape)
    #feedforward
    #print('--feedforward network--')
    ff=self.feedforward(ln1)
    #print(ff.shape)
    #layernorm
    #print('--layer normalisation--')
    out=self.layernorm2(ff+ln1)
    #print(out.shape)
    return out



class encoder(nn.Module):
  #creating n layer of layers
  def __init__(self,max_sequence_length,d_model,hidlayer,dropout,num_heads,nlayers,language_to_index,START_TOKEN,END_TOKEN,PADDING_TOKEN,
               masking=False):
    super().__init__()
    #sequentially stack encoders
    self.encoderembeddings=SentenceEmbedding(max_sequence_length,d_model,language_to_index,START_TOKEN,END_TOKEN,PADDING_TOKEN)
    self.layers=nn.Sequential(*[encoderlayer(max_sequence_length,d_model,hidlayer,dropout,num_heads,masking) for _ in range(nlayers)])

  def forward(self, x,start,end):
    x=self.encoderembeddings(x,start,end)
    for i, layer in enumerate(self.layers):
      #print(f'\n------layer {i+1}----- ')

      x = layer(x)
      #print(f"--Output after layer {i+1}--: {x.size()}")  # Printing the size after each layer
    return x




In [47]:
##encoder testing

d_model = 4
batch_size = 3
ffn_hidden = 64
num_heads = 2
drop_prob = 0.1
num_layers = 2
max_sequence_length = 5
hn_vocab_size = len(hindi_to_index)
eng_vocab_size = len(eng_to_index)
#torch.manual_seed(2)
tokenization=SentenceEmbedding(max_sequence_length, d_model, eng_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)

Encoder =encoder(
    max_sequence_length=max_sequence_length,d_model=d_model,hidlayer=ffn_hidden
    ,dropout=drop_prob,num_heads=num_heads,nlayers=num_layers,language_to_index=eng_to_index,
    START_TOKEN=START_TOKEN,END_TOKEN=END_TOKEN,PADDING_TOKEN=PADDING_TOKEN,masking=False
)
Encoderout=Encoder(english_sentences,start=True,end=True)

print('Encoder shape', Encoderout.shape)
print("Encoder output:", Encoderout)


#perfect

input tokenization of x
torch.Size([3, 5])
Encoder shape torch.Size([3, 5, 4])
Encoder output: tensor([[[ 0.1579, -0.1973, -1.1918,  1.2311],
         [ 0.4406, -0.3767,  1.1217, -1.1856],
         [ 0.7753, -0.2086,  0.7642, -1.3309],
         [ 0.0760, -0.9095,  1.3706, -0.5371],
         [ 0.3911,  0.9277, -1.4066,  0.0878]],

        [[ 0.2206, -0.1991, -1.2173,  1.1957],
         [ 0.5783, -1.0724,  1.0817, -0.5876],
         [ 1.0823,  0.0043,  0.2435, -1.3301],
         [-1.2650, -0.1862,  0.3305,  1.1207],
         [ 0.6804,  0.7541, -1.4026, -0.0320]],

        [[ 0.2339, -0.2646, -1.1836,  1.2142],
         [-1.4265,  0.8345,  0.5120,  0.0801],
         [ 0.3899,  0.6289, -1.4926,  0.4739],
         [ 0.2198,  0.0759,  1.0567, -1.3525],
         [ 0.3320, -0.0632,  1.0593, -1.3280]]], grad_fn=<AddBackward0>)


In [48]:
#decoder
class decoderlayer(nn.Module):
  def __init__(self,max_sequence_length,d_model,hidlayer,dropout,num_heads,masking):
    super().__init__()
    self.d_model,self.hidlayer,self.dropout,self.num_heads,self.masking=d_model,hidlayer,dropout,num_heads,masking
    #self.PositionalEncoding=PositionalEncoding(self.d_model,self.input_dim) #sinencoding
    self.multihead_attention=multihead_attention(self.d_model,self.num_heads,self.masking)
    self.layernorm1=CustomLayerNorm(self.d_model)
    self.cross_attention=multihead_cross_attention(d_model,num_heads)
    self.layernorm2=CustomLayerNorm(self.d_model)
    self.feedforward=feedforward(self.d_model,self.hidlayer,self.dropout)
    self.layernorm3=CustomLayerNorm(self.d_model)
    self.max_sequence_length=max_sequence_length


  def forward(self,x,y):
    if y.size(0) != x.size(0):
             pad_size = x.size(0) - y.size(0)
             padding = torch.zeros(pad_size, y.size(1), y.size(2), device=y.device)
             y = torch.cat([y, padding], dim=0)
    #rope
    #print('---positional encoding--')
    #mulihead
    #print('--mulihead attention--')
    mha=self.multihead_attention(y)
    #print(mha.shape)
    #layernorm1
    #print('--layer normalisation 1--')
    ln1=self.layernorm1(mha+y)
    #print(ln1.shape)
    #cross attention
    #print('--cross mulihead attention--')
    cmha=self.cross_attention(x,ln1)
    #print(cmha.shape)
     #layernorm2
    #print('--layer normalisation 2--')
    ln2=self.layernorm2(cmha+ln1)
    #print(ln2.shape)

    #feedforward
    #print('--feedforward network--')
    ff=self.feedforward(ln1)
    #print(ff.shape)
    #layernorm3
    #print('--layer normalisation--')
    out=self.layernorm3(ff+ln2)
    #print(out.shape)
    return out

class decoder(nn.Module):
  #creating n layer of layers
  def __init__(self,max_sequence_length,d_model,hidlayer,dropout,num_heads,nlayers,language_to_index,START_TOKEN,END_TOKEN,PADDING_TOKEN,
               masking=True):
    super().__init__()
    #sequentially stack encoders
    self.decoderembeddings=SentenceEmbedding(max_sequence_length,d_model,language_to_index,START_TOKEN,END_TOKEN,PADDING_TOKEN)
    self.layers=nn.Sequential(*[decoderlayer(max_sequence_length,d_model,hidlayer,dropout,num_heads,masking) for _ in range(nlayers)])

  def forward(self, x,y,start,end):
    y = self.decoderembeddings(y, start,end)
    for i, layer in enumerate(self.layers):
      #print(f'\n------layer {i+1}----- ')
      l = layer(x,y)
      #print(f"--Output after layer {i+1}--: {l.size()}")  # Printing the size after each layer
    return l




In [49]:
#testing decoder class

In [50]:
d_model = 4
batch_size = 3
ffn_hidden = 64
num_heads = 2
drop_prob = 0.1
num_layers = 2
max_sequence_length = 5
hn_vocab_size = len(hindi_to_index)
eng_vocab_size = len(eng_to_index)
#torch.manual_seed(2)

tokenization=SentenceEmbedding(max_sequence_length, d_model, eng_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)

Decoder = decoder(
    max_sequence_length=max_sequence_length,d_model=d_model,hidlayer=ffn_hidden
    ,dropout=drop_prob,num_heads=num_heads,nlayers=num_layers,language_to_index=hindi_to_index,
    START_TOKEN=START_TOKEN,END_TOKEN=END_TOKEN,PADDING_TOKEN=PADDING_TOKEN,
               masking=True
)
Decoderout=Decoder(Encoderout,['को','विधि'],start=True,end=True)

print('Decoder shape', Decoderout.shape)
print("Decoder output:", Decoderout)


#perfect

input tokenization of x
torch.Size([2, 5])
Decoder shape torch.Size([3, 5, 4])
Decoder output: tensor([[[-0.4844,  1.4728, -0.7262, -0.2622],
         [-0.5261, -0.3494,  1.4908, -0.6153],
         [-1.3840,  0.0191,  0.4062,  0.9587],
         [ 0.4869,  0.6061,  0.4017, -1.4947],
         [ 0.9403,  0.6813, -1.2211, -0.4005]],

        [[-0.4589,  1.4854, -0.6808, -0.3457],
         [ 0.4829,  1.0541, -0.2814, -1.2556],
         [-1.4016,  0.0429,  0.4442,  0.9145],
         [ 0.8510,  0.8225, -1.1522, -0.5213],
         [ 0.8380,  0.7663, -1.2624, -0.3419]],

        [[-0.0169,  1.2998, -0.1478, -1.1351],
         [ 0.0278,  1.3573, -0.3782, -1.0069],
         [ 0.7810,  0.9422, -0.7682, -0.9550],
         [ 1.4119, -0.2435, -0.2204, -0.9480],
         [ 1.4385, -0.1304, -0.4837, -0.8244]]], grad_fn=<AddBackward0>)


In [51]:
#testing the combined class

# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"
eng_to_index = {
    "Browse": 2, "the": 1, "various": 4, "methods": 3, "of": 5, "current": 7, "accessible": 6,
    "Hide": 11, "private": 10, "attributes": 12, "Method": 9,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}
hindi_to_index = {
    "इस": 1, "समय": 2, "जिसे": 3, "प्राप्त": 4, "किया": 5, "गया": 6, "हो": 7, "विभिन्न": 8,
    "विधियों": 9, "में": 10, "विचरण": 11, "करें": 12, "निजी": 13, "गुणों": 14, "को": 15,
    "छिपाएं": 16, "विधि": 17,
    START_TOKEN: 18, END_TOKEN: 19, PADDING_TOKEN: 0  # Special tokens
}

# === Example Bilingual Sentences (English ↔ Hindi) ===
bilingual_batch = [
    ("Browse the various methods of the current accessible", "इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें"),
    ("Hide private attributes", "निजी गुणों को छिपाएं"),
    ("Method", "विधि")
]

# === Separate English and Hindi Sentences ===
english_sentences = [pair[0] for pair in bilingual_batch]
hindi_sentences = [pair[1] for pair in bilingual_batch]

index_to_hindi = {v: k for k, v in hindi_to_index.items()}

# === Parameters ===
max_seq_length = 5  # ✅ Limit to 5 words per sentence

# === Separate English and Hindi Sentences ===
english_sentences = [pair[0] for pair in bilingual_batch]
hindi_sentences = [pair[1] for pair in bilingual_batch]

d_model = 4
batch_size = 3
ffn_hidden = 2048
num_heads = 2
drop_prob = 0.1
num_layers = 1
max_sequence_length = 5
hn_vocab_size = len(hindi_to_index)
eng_vocab_size = len(eng_to_index)
#torch.manual_seed(2)
tokenization=SentenceEmbedding(max_sequence_length, d_model, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
#engtoken=tokenization(english_sentences,start=True,end=True)
#print('engtoken',engtoken)
print('encoder shape is ',Encoderout.shape)
sample=['को','विधि']#'विधि','इस समय जिसे प्राप्त किया गया हो']
print('sample',sample)
hintoken=tokenization(sample,start=True,end=True)
print('hintoken',hintoken.shape)
mha = multihead_attention(d_model, num_heads, masking=True)
out = mha(hintoken)
cat = multihead_cross_attention(d_model, heads=num_heads, masking=True)
out = cat(Encoderout, y=out)  # Now batch size should match this swill show error eince matching shape is done at decoder so no issue
print(f'multihead_cross_attention output shape is {out.shape} and output is {out}')
ln1=CustomLayerNorm(d_model)
print('ln1',ln1(out).shape)


encoder shape is  torch.Size([3, 5, 4])
sample ['को', 'विधि']
input tokenization of x
torch.Size([2, 5])
hintoken torch.Size([2, 5, 4])


RuntimeError: shape '[3, 5, 2, 2]' is invalid for input of size 40

In [52]:
#now the transformer class

In [53]:
#transformer class
class mytransformer(nn.Module):
  def __init__(self,
               d_model,
               hidlayer,
               dropout,
               num_heads,
               nlayers,
               lang_vocab_size,
               english_to_index,
               lang_to_index,
               max_seq_len,
               START_TOKEN,
               END_TOKEN,
               PADDING_TOKEN):

    super().__init__()
    self.d_model=d_model
    self.hidlayer=hidlayer
    self.dropout=dropout
    self.num_heads=num_heads
    self.nlayers=nlayers
    self.vocab_size=lang_vocab_size
    self.english_to_index = english_to_index
    self.lang_to_index = lang_to_index
    self.max_sequence_length = max_seq_len
    self.START_TOKEN = START_TOKEN
    self.END_TOKEN = END_TOKEN
    self.PADDING_TOKEN = PADDING_TOKEN


    self.Encoder=encoder(self.max_sequence_length,self.d_model,self.hidlayer,self.dropout,self.num_heads,self.nlayers,
                         self.english_to_index,self.START_TOKEN,self.END_TOKEN,self.PADDING_TOKEN,
               )
    self.Decoder=decoder(self.max_sequence_length,self.d_model,self.hidlayer,self.dropout,self.num_heads,self.nlayers,
                         self.lang_to_index,self.START_TOKEN,self.END_TOKEN,self.PADDING_TOKEN,
               masking=True)
    self.linearlayer=nn.Linear(self.d_model,self.vocab_size)

  def forward(self,english_sentences,hindi_sentences):
    x=self.Encoder(english_sentences,start=False,end=False)
    y=self.Decoder(x,hindi_sentences,start=True,end=True)
    y=self.linearlayer(y)
    print('output of linear layer',y.shape)
    softmaxoutput=torch.softmax(y,dim=-1)
    print('output of softmax layer',softmaxoutput.shape)
    #return softmaxoutput
    #predict next tokens
    #predicted_tokens = torch.argmax(softmaxoutput, dim=-1)  # Shape: (3, 5)
    # predict next token
    predicted_next_token = torch.argmax(softmaxoutput[-1, -1, :])  # Shape: ()
    return predicted_next_token



In [54]:
#next word prediction


# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"
eng_to_index = {
    "Browse": 2, "the": 1, "various": 4, "methods": 3, "of": 5, "current": 7, "accessible": 6,
    "Hide": 11, "private": 10, "attributes": 12, "Method": 9,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}
hindi_to_index = {
    "इस": 1, "समय": 2, "जिसे": 3, "प्राप्त": 4, "किया": 5, "गया": 6, "हो": 7, "विभिन्न": 8,
    "विधियों": 9, "में": 10, "विचरण": 11, "करें": 12, "निजी": 13, "गुणों": 14, "को": 15,
    "छिपाएं": 16, "विधि": 17,
    START_TOKEN: 18, END_TOKEN: 19, PADDING_TOKEN: 0  # Special tokens
}

# === Example Bilingual Sentences (English ↔ Hindi) ===
bilingual_batch = [
    ("Browse the various methods of the current accessible", "इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें"),
    ("Hide private attributes", "निजी गुणों को छिपाएं"),
    ("Method", "विधि")
]

# === Separate English and Hindi Sentences ===
english_sentences = [pair[0] for pair in bilingual_batch]
hindi_sentences = [pair[1] for pair in bilingual_batch]

index_to_hindi = {v: k for k, v in hindi_to_index.items()}


d_model = 64
batch_size = 3
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 1
max_sequence_length = 5
hn_vocab_size = len(hindi_to_index)
eng_vocab_size = len(eng_to_index)
transformer=mytransformer(d_model,ffn_hidden,drop_prob,num_heads,num_layers,
                          hn_vocab_size,eng_to_index,hindi_to_index,max_sequence_length,START_TOKEN,END_TOKEN,PADDING_TOKEN)

transformerout=transformer(english_sentences,hindi_sentences)
print('transformer shape', transformerout.shape)
print("transformer output/predicted tokens:", transformerout)
predicted_word = index_to_hindi.get(int(transformerout), "<PAD>")
print("next Predicted word:", predicted_word)


#perfect

input tokenization of x
torch.Size([3, 5])
input tokenization of x
torch.Size([3, 5])
output of linear layer torch.Size([3, 5, 20])
output of softmax layer torch.Size([3, 5, 20])
transformer shape torch.Size([])
transformer output/predicted tokens: tensor(4)
next Predicted word: प्राप्त


In [56]:
transformer

mytransformer(
  (Encoder): encoder(
    (encoderembeddings): SentenceEmbedding(
      (embedding): Embedding(14, 64)
      (position_encoder): RoPEEmbedding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): Sequential(
      (0): encoderlayer(
        (multihead_attention): multihead_attention(
          (wq): Linear(in_features=64, out_features=64, bias=True)
          (wk): Linear(in_features=64, out_features=64, bias=True)
          (wv): Linear(in_features=64, out_features=64, bias=True)
          (linearlayer): Linear(in_features=64, out_features=64, bias=True)
        )
        (feedforward): feedforward(
          (linearlayer1): Linear(in_features=64, out_features=2048, bias=True)
          (linearlayer2): Linear(in_features=2048, out_features=64, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation): ReLU()
        )
        (layernorm1): CustomLayerNorm()
        (layernorm2): CustomLayerNorm()
      )
    )
  )
  (Decoder): 

In [14]:
sample='को'
sample.split()

['को']

In [15]:
#index_to_hindi.get(int(transformer(english_sentences,['इस', 'समय','समय समय'])), "<PAD>").split()
#index_to_hindi.get(int(transformer(english_sentences,['समय'])), "<PAD>").split()
index_to_hindi.get(int(transformer(english_sentences,['समय','इस'])), "<PAD>").split()


input tokenization of x
torch.Size([3, 5])
input tokenization of x
torch.Size([2, 5])
output of linear layer torch.Size([3, 5, 20])
output of softmax layer torch.Size([3, 5, 20])


['गया']

In [16]:
numnextwords=2
predicted_words=""
transformerout=transformer(english_sentences,hindi_sentences)
predicted_word = index_to_hindi.get(int(transformerout), "<PAD>")
predicted_words=predicted_word.split()
for i in range(numnextwords):
  transformerout=transformer(english_sentences,predicted_words[-1].split())
  predicted_word = index_to_hindi.get(int(transformerout), "<PAD>")
  predicted_words.append(predicted_word)

print('predicted_words',' '.join(predicted_words))





input tokenization of x
torch.Size([3, 5])
input tokenization of x
torch.Size([3, 5])
output of linear layer torch.Size([3, 5, 20])
output of softmax layer torch.Size([3, 5, 20])
input tokenization of x
torch.Size([3, 5])
input tokenization of x
torch.Size([1, 5])
output of linear layer torch.Size([3, 5, 20])
output of softmax layer torch.Size([3, 5, 20])
input tokenization of x
torch.Size([3, 5])
input tokenization of x
torch.Size([1, 5])
output of linear layer torch.Size([3, 5, 20])
output of softmax layer torch.Size([3, 5, 20])
predicted_words <END> गया गया


In [17]:
# testing of characterwise/alphabet wise tokenization in batches

In [18]:
import torch
import torch.nn as nn

class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = RoPEEmbedding(d_model)  # Ensure RoPEEmbedding is defined
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token=True, end_token=True):
        def tokenize(sentence):
            # Convert each character/token to its index
            sentence_word_indices = [self.language_to_index.get(token, self.language_to_index[self.PADDING_TOKEN]) for token in list(sentence)]

            if start_token:
                sentence_word_indices.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indices.append(self.language_to_index[self.END_TOKEN])

            # Ensure all sequences are exactly `max_sequence_length` long
            sentence_word_indices = sentence_word_indices[:self.max_sequence_length]  # Truncate if too long
            sentence_word_indices += [self.language_to_index[self.PADDING_TOKEN]] * (self.max_sequence_length - len(sentence_word_indices))  # Pad if too short

            return torch.tensor(sentence_word_indices)

        tokenized = [tokenize(sentence) for sentence in batch]
        tokenized = torch.stack(tokenized)  # Convert list of tensors to a single tensor

        return tokenized

    def forward(self, x, start_token=True, end_token=True):
        x = self.batch_tokenize(x, start_token, end_token)  # Ensure consistent function calls
        x = self.embedding(x)  # Convert token indices to embeddings
        pos = self.position_encoder(x)  # Apply positional encoding
        x = self.dropout(x + pos)  # Apply dropout
        return x


In [19]:

START_TOKEN = '<start>'
PADDING_TOKEN = '<padding>'
END_TOKEN = '<end>'

english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@',
                      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                      'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                      'Y', 'Z', '[', '\\', ']', '^', '_', '`',
                      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                      'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                      'y', 'z', '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]



hindi_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ँ', 'ं', 'ः',
                    'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ', 'ए', 'ऐ', 'ओ', 'औ',
                    'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण',
                    'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह',
                    '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्', 'ॐ',
                    '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', PADDING_TOKEN, END_TOKEN]

In [20]:


index_to_hindi = {k:v for k,v in enumerate(hindi_vocabulary)}
hindi_to_index = {v:k for k,v in enumerate(hindi_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

In [21]:

# === Example Bilingual Sentences (English ↔ Hindi) ===
bilingual_batch = [
    ("Browse the various methods of the current accessible", "इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें"),
    ("Hide private attributes", "निजी गुणों को छिपाएं"),
    ("Method", "विधि")
]

# === Separate English and Hindi Sentences ===
english_sentences = [pair[0] for pair in bilingual_batch]
hindi_sentences = [pair[1] for pair in bilingual_batch]



In [22]:
hindi_sentences

['इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें',
 'निजी गुणों को छिपाएं',
 'विधि']

In [23]:

max_sequence_length = 100
# to check if a token or character/alphabet ins engsen or hindi is present in about hindi/eng vocab pf charceter
def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

#to check if engsend or hindisen each sent has max 200 charcers
def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_sentence_indicies = []
for index in range(len(hindi_sentences)):
    hindi_sentence, english_sentence = hindi_sentences[index], english_sentences[index]
    if is_valid_length(hindi_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(hindi_sentence, hindi_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(hindi_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 3
Number of valid sentences: 3


In [24]:
valid_sentence_indicies

[0, 1, 2]

In [25]:
#testing the code

d_model = 4
batch_size = 3
ffn_hidden = 2048
num_heads = 2
drop_prob = 0.1
num_layers = 1
max_sequence_length = 5
#torch.manual_seed(2)
#engtokenization=SentenceEmbedding(max_sequence_length, d_model, english_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
#engtoken=tokenization(english_sentences,start=True,end=True)
#print('engtoken',engtoken)
#print('encoder shape is ',Encoderout.shape)
sample=['को','विधि']#'विधि','इस समय जिसे प्राप्त किया गया हो']
print('sample',sample)
hintokenization=SentenceEmbedding(max_sequence_length, d_model, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)

hintoken=hintokenization(sample,start_token=True,end_token=True)
print('hintoken',hintoken.shape)


sample ['को', 'विधि']
hintoken torch.Size([2, 5, 4])


In [26]:
english_sentences

['Browse the various methods of the current accessible',
 'Hide private attributes',
 'Method']

In [27]:
hindi_sentences

['इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें',
 'निजी गुणों को छिपाएं',
 'विधि']

In [28]:
index_to_hindi.get(int(transformer(english_sentences,sample)), "<PAD>").split()

input tokenization of x
torch.Size([3, 5])
input tokenization of x
torch.Size([2, 5])
output of linear layer torch.Size([3, 5, 20])
output of softmax layer torch.Size([3, 5, 20])


['%']

In [29]:
numnextwords=2
predicted_words=""
transformerout=transformer(english_sentences,hindi_sentences)
predicted_word = index_to_hindi.get(int(transformerout), "<PAD>")
predicted_words=predicted_word.split()
for i in range(numnextwords):
  transformerout=transformer(english_sentences,predicted_words[-1].split())
  predicted_word = index_to_hindi.get(int(transformerout), "<PAD>")
  predicted_words.append(predicted_word)

print('predicted_words',' '.join(predicted_words))





input tokenization of x
torch.Size([3, 5])
input tokenization of x
torch.Size([3, 5])
output of linear layer torch.Size([3, 5, 20])
output of softmax layer torch.Size([3, 5, 20])
input tokenization of x
torch.Size([3, 5])
input tokenization of x
torch.Size([1, 5])
output of linear layer torch.Size([3, 5, 20])
output of softmax layer torch.Size([3, 5, 20])
input tokenization of x
torch.Size([3, 5])
input tokenization of x
torch.Size([1, 5])
output of linear layer torch.Size([3, 5, 20])
output of softmax layer torch.Size([3, 5, 20])
predicted_words 2 % %


In [None]:
#testing with batches
#break into batches


import pandas as pd

# Path to the CSV file
file_path = '/content/drive/MyDrive/ADeepLearning/hindi_english_parallel2.csv'

# Read the CSV file
df = pd.read_csv(file_path)

# Check the data
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [32]:
df.dropna(inplace=True)

In [34]:
sample=df[:100000].copy()
engsen=sample['English'].to_list()
hindisen=sample['Hindi'].to_list()
engsen1=engsen[:10000]
hindisen1=hindisen[:10000]

max_sequence_length = 100
# to check if a token or character/alphabet ins engsen or hindi is present in about hindi/eng vocab pf charceter
def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

#to check if engsend or hindisen each sent has max 200 charcers
def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_sentence_indicies = []
for index in range(len(hindisen1)):
    hindi_sentence, english_sentence = hindisen1[index], engsen1[index]
    if is_valid_length(hindi_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(hindi_sentence, hindi_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(hindisen1)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 10000
Number of valid sentences: 3690


In [35]:
max_sequence_length = 50  # Maximum allowed characters
min_sequence_length = 30  # Minimum required characters

# Function to check if a sentence contains only valid tokens
def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):  # Ensure unique characters are checked
        if token not in vocab:
            return False
    return True

# Function to check if sentence length is within min & max limits
def is_valid_length(sentence, min_length, max_length):
    sentence_length = len(sentence)  # No need to convert to list explicitly
    return min_length <= sentence_length < max_length  # Ensuring it fits the range

valid_sentence_indices = []
for index in range(len(hindisen1)):
    hindi_sentence, english_sentence = hindisen1[index], engsen1[index]

    if (is_valid_length(hindi_sentence, min_sequence_length, max_sequence_length) and
        is_valid_length(english_sentence, min_sequence_length, max_sequence_length) and
        is_valid_tokens(hindi_sentence, hindi_vocabulary)):

        valid_sentence_indices.append(index)

print(f"Total sentences: {len(hindisen1)}")
print(f"Valid sentences: {len(valid_sentence_indices)}")


Total sentences: 10000
Valid sentences: 760


In [36]:
hindisen1 = [hindisen1[i] for i in valid_sentence_indices]
engsen1 = [engsen1[i] for i in valid_sentence_indices]


In [38]:

dataset = TextDataset(engsen1, hindisen1)
#this code will create batches
batch_size = 2
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 1:
        break

[('Are you interested in flowers?', 'Can you teach me how to steal?'), ('तुम्हें फूलों में दिलचस्पी है क्या?', 'क्या तुम मुझे चोरी करना सिखा सकते हो?')]
[('Could I please use your phone?', 'Do you live with your parents?'), ('मैं आपका फ़ोन इस्तेमाल कर सकता हूँ क्या?', 'क्या तुम अपने मम्मी-पापा के साथ रहते हो?')]
[('Do you remember what she said?', 'Excuse me, is this seat taken?'), ('तुम्हें याद है उसने क्या कहा था?', 'माफ़ कीजिएगा, यहाँ कोई बैठा हुआ है क्या?')]


In [39]:

#now combinig with sentence ebedding
batch_size = 2
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)
d_model = 2
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 1
max_sequence_length = 3
torch.manual_seed(2)
engtokenization=SentenceEmbedding(max_sequence_length, d_model, english_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
hintokenization=SentenceEmbedding(max_sequence_length, d_model, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)


for batch_num, batch in enumerate(iterator):
    print('\nbatch_num',batch_num+1)
    print('batch')
    print(batch)
    eng_batch, ln_batch = batch
    engtoken=engtokenization(eng_batch,start_token=True,end_token=True)
    print('engtoken',engtoken.shape)
    hintoken=hintokenization(ln_batch,start_token=True,end_token=True)
    print('hintoken',hintoken.shape)
    if batch_num > 1:
        break


batch_num 1
batch
[('Are you interested in flowers?', 'Can you teach me how to steal?'), ('तुम्हें फूलों में दिलचस्पी है क्या?', 'क्या तुम मुझे चोरी करना सिखा सकते हो?')]
engtoken torch.Size([2, 3, 2])
hintoken torch.Size([2, 3, 2])

batch_num 2
batch
[('Could I please use your phone?', 'Do you live with your parents?'), ('मैं आपका फ़ोन इस्तेमाल कर सकता हूँ क्या?', 'क्या तुम अपने मम्मी-पापा के साथ रहते हो?')]
engtoken torch.Size([2, 3, 2])
hintoken torch.Size([2, 3, 2])

batch_num 3
batch
[('Do you remember what she said?', 'Excuse me, is this seat taken?'), ('तुम्हें याद है उसने क्या कहा था?', 'माफ़ कीजिएगा, यहाँ कोई बैठा हुआ है क्या?')]
engtoken torch.Size([2, 3, 2])
hintoken torch.Size([2, 3, 2])


In [58]:
transformer.train()

mytransformer(
  (Encoder): encoder(
    (encoderembeddings): SentenceEmbedding(
      (embedding): Embedding(14, 64)
      (position_encoder): RoPEEmbedding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): Sequential(
      (0): encoderlayer(
        (multihead_attention): multihead_attention(
          (wq): Linear(in_features=64, out_features=64, bias=True)
          (wk): Linear(in_features=64, out_features=64, bias=True)
          (wv): Linear(in_features=64, out_features=64, bias=True)
          (linearlayer): Linear(in_features=64, out_features=64, bias=True)
        )
        (feedforward): feedforward(
          (linearlayer1): Linear(in_features=64, out_features=2048, bias=True)
          (linearlayer2): Linear(in_features=2048, out_features=64, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation): ReLU()
        )
        (layernorm1): CustomLayerNorm()
        (layernorm2): CustomLayerNorm()
      )
    )
  )
  (Decoder): 