In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset,Dataset

In [None]:
class InputEmbedding(nn.Module):
  """Token-id embedding whose output is scaled by sqrt(d_model).

  Args:
    vocab_size: number of rows in the embedding table.
    d_model: width of each embedding vector.
  """

  def __init__(self,vocab_size,d_model):
    super().__init__()
    self.d_model=d_model
    self.vocab_size=vocab_size
    self.embedding=nn.Embedding(vocab_size,d_model)

  def forward(self,x):
    """Look up the embeddings of the integer ids in `x` and scale them.

    Returns a tensor shaped like `x` with one extra trailing axis of size d_model.
    """
    # A plain Python float is used for the scale: no tensor is allocated per
    # call and no square root is taken of an integer tensor.
    return self.embedding(x)*(self.d_model**0.5)

In [None]:
class PositionalEncoding(nn.Module):
  """Learned absolute position embedding that is added to its input.

  Args:
    d_model: width of the vectors being position-encoded.
    seq_len: the table holds seq_len+1 rows, i.e. positions 0..seq_len.
  """

  def __init__(self,d_model,seq_len=1003):
    super().__init__()
    self.position_embedding = nn.Embedding(seq_len+1, d_model)

  def forward(self,x):
    """Add one position vector to every step of `x`.

    Args:
      x: 3-D tensor unpacked as [batch, seq_len, d_model].

    Returns:
      Tensor of the same shape as `x`.

    Raises:
      ValueError: if the sequence is longer than the position table.
    """
    _, seq_len, _ = x.size()
    max_positions = self.position_embedding.num_embeddings
    # Fail early with a readable message instead of an out-of-range index
    # error raised inside the embedding lookup.
    if seq_len > max_positions:
      raise ValueError(
          f"sequence length {seq_len} exceeds the {max_positions} positions available"
      )
    # Shape [seq_len]; the lookup result broadcasts over the batch axis, so
    # the indices do not need to be repeated per batch element.
    positions = torch.arange(0, seq_len, device=x.device)
    return x + self.position_embedding(positions)

In [None]:
class FeedForward(nn.Module):
  """Two-layer MLP: d_model -> 4*d_model -> d_model with ReLU, then dropout."""

  def __init__(self,d_model,dropout=0.1):
    super().__init__()
    hidden=4*d_model
    stages=[
        nn.Linear(d_model,hidden),  # widen
        nn.ReLU(),
        nn.Linear(hidden,d_model),  # project back to d_model
        nn.Dropout(dropout),
    ]
    self.net=nn.Sequential(*stages)

  def forward(self,x):
    """Apply the stack to `x`; the last axis of `x` must have size d_model."""
    out=self.net(x)
    return out

In [None]:
class ResidualConnection(nn.Module):
  """Add-and-normalise step: LayerNorm applied to the sum of two tensors."""

  def __init__(self,d_model):
    super().__init__()
    self.layernorm=nn.LayerNorm(d_model)

  def forward(self,x,y):
    """Return LayerNorm(x + y); `x` is the skip path, `y` the sub-layer output."""
    summed=x+y
    return self.layernorm(summed)

# Encoder

In [None]:
class EncoderMultiHeadAttention(nn.Module):
  """Multi-head self-attention: queries, keys and values come from the same input.

  Args:
    d_model: width of the input/output vectors; must be a multiple of `heads`.
    heads: number of attention heads, each working on d_model//heads features.
    dropout: drop probability applied to the attention weights and to the
      output projection.

  Raises:
    ValueError: if `d_model` is not divisible by `heads`.
  """

  def __init__(self,d_model,heads,dropout=0.1):
    super().__init__()
    # Otherwise heads*d_k != d_model and the .view() in forward() fails later
    # with an opaque shape error.
    if d_model%heads!=0:
      raise ValueError(f"d_model ({d_model}) must be divisible by heads ({heads})")
    self.heads=heads
    self.d_model=d_model
    self.d_k=d_model//heads

    self.q_w=nn.Linear(d_model,d_model,bias=False)
    self.k_w=nn.Linear(d_model,d_model,bias=False)
    self.v_w=nn.Linear(d_model,d_model,bias=False)
    self.w_o=nn.Linear(d_model,d_model,bias=False)
    self.dropout=nn.Dropout(dropout)

  @staticmethod
  def attention(query,key,value,dropout:nn.Dropout,mask=None):
    """Scaled dot-product attention.

    Args:
      query, key, value: tensors laid out as [batch,heads,seq_len,d_k].
      dropout: module applied to the attention weights, or None to skip it.
      mask: optional; positions where the mask equals 0 are blocked. A 2-D
        mask is treated as [batch,seq_len] over the keys and broadcast over
        heads and query positions.

    Returns:
      (output, attention weights) with shapes [batch,heads,seq_len,d_k] and
      [batch,heads,seq_len,seq_len].
    """
    d_k=query.shape[-1]
    attn_scores=(query@key.transpose(-2,-1))*d_k**-0.5  # [batch,heads,seq_len,seq_len]
    if mask is not None:  # [batch,seq_len]
      if mask.dim()==2:
        mask = mask.unsqueeze(1).unsqueeze(2) # [batch,1,1,seq_len]
      attn_scores=attn_scores.masked_fill(mask==0,float('-inf'))
    attn_scores=F.softmax(attn_scores,dim=-1)
    if dropout is not None:
      attn_scores=dropout(attn_scores)
    out=attn_scores@value # [batch,heads,seq_len,d_k]

    return out,attn_scores

  def _split_heads(self,t):
    # [batch,seq_len,d_model] => [batch,heads,seq_len,d_k]
    return t.view(t.shape[0],t.shape[1],self.heads,self.d_k).transpose(1,2)

  def forward(self,x,mask=None):
    """Self-attend over `x` ([batch,seq_len,d_model]); returns the same shape.

    The attention weights of the latest call are kept in `self.attention_score`.
    """
    query=self._split_heads(self.q_w(x))
    key=self._split_heads(self.k_w(x))
    value=self._split_heads(self.v_w(x))

    x,self.attention_score = EncoderMultiHeadAttention.attention(query, key, value,self.dropout, mask )
    # [batch,heads,seq_len,d_k] --> [batch,seq_len,heads,d_k] --> [batch,seq_len,d_model]
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.heads * self.d_k)
    x=self.dropout(self.w_o(x))

    return x

In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: self-attention then feed-forward, each followed by add-and-norm."""

  def __init__(self,d_model,heads):
    super().__init__()
    self.SA=EncoderMultiHeadAttention(d_model,heads)
    self.FFN=FeedForward(d_model)
    self.ResidualConnection1=ResidualConnection(d_model)  # wraps self-attention
    self.ResidualConnection2=ResidualConnection(d_model)  # wraps feed-forward

  def forward(self,x,mask=None):
    """Run `x` through the layer; `mask` is handed to the self-attention unchanged."""
    attended=self.ResidualConnection1(x,self.SA(x,mask))
    return self.ResidualConnection2(attended,self.FFN(attended))

In [None]:
class Encoder(nn.Module):
  """Stack of `layers` EncoderBlock modules applied one after another."""

  def __init__(self,d_model,heads,layers):
    super().__init__()
    self.layers=layers
    blocks=[EncoderBlock(d_model, heads) for _ in range(layers)]
    self.EncoderBlock=nn.ModuleList(blocks)

  def forward(self,x,mask=None):
    """Feed `x` through every block in order, passing the same `mask` to each."""
    hidden=x
    for block in self.EncoderBlock:
      hidden=block(hidden,mask)
    return hidden

# Decoder

In [None]:
class DecoderMultiHeadAttention(nn.Module):
  """Multi-head self-attention for the decoder, with an optional causal mask.

  Args:
    d_model: width of the input/output vectors; must be a multiple of `heads`.
    heads: number of attention heads, each working on d_model//heads features.
    dropout: drop probability applied to the attention weights and to the
      output projection.

  Raises:
    ValueError: if `d_model` is not divisible by `heads`.
  """

  def __init__(self,d_model,heads,dropout=0.1):
    super().__init__()
    # Otherwise heads*d_k != d_model and the .view() in forward() fails later
    # with an opaque shape error.
    if d_model%heads!=0:
      raise ValueError(f"d_model ({d_model}) must be divisible by heads ({heads})")
    self.heads=heads
    self.d_model=d_model
    self.d_k=d_model//heads

    self.q_w=nn.Linear(d_model,d_model,bias=False)
    self.k_w=nn.Linear(d_model,d_model,bias=False)
    self.v_w=nn.Linear(d_model,d_model,bias=False)
    self.w_o=nn.Linear(d_model,d_model,bias=False)
    self.dropout=nn.Dropout(dropout)

  @staticmethod
  def attention(query,key,value,dropout:nn.Dropout,mask=None,casual=True):
    """Scaled dot-product attention with optional causal and padding masks.

    Args:
      query, key, value: tensors laid out as [batch,heads,seq_len,d_k].
      dropout: module applied to the attention weights, or None to skip it.
      mask: optional; positions where the mask equals 0 are blocked. A 2-D
        mask is treated as [batch,seq_len] over the keys.
      casual: when True, position i may only attend to positions <= i.
        (The parameter name is kept as spelled for existing callers.)

    Returns:
      (output, attention weights).
    """
    d_k=query.shape[-1]
    attn_scores=(query@key.transpose(-2,-1))*d_k**-0.5  # [batch,heads,seq_len,seq_len]

    combined_mask=None
    if casual:
      # Built on the query's own device, so the mask always matches the
      # scores regardless of whether CUDA happens to be available.
      q_len,k_len=query.shape[2],key.shape[2]
      causal_mask=torch.tril(torch.ones(q_len,k_len,device=query.device)).bool()
      combined_mask=causal_mask.unsqueeze(0).unsqueeze(1)  # [1,1,seq_len,seq_len]

    if mask is not None:  # [batch,seq_len]
      if mask.dim()==2:
        mask=mask.unsqueeze(1).unsqueeze(2)  # [batch,1,1,seq_len]
      # Convert to bool first: a bitwise & on raw integer masks would drop
      # any non-zero value whose lowest bit is not set.
      keep=mask!=0
      combined_mask=keep if combined_mask is None else combined_mask&keep

    if combined_mask is not None:
      attn_scores=attn_scores.masked_fill(~combined_mask,float('-inf'))
    attn_scores=F.softmax(attn_scores,dim=-1)
    if dropout is not None:
      attn_scores=dropout(attn_scores)
    out=attn_scores@value # [batch,heads,seq_len,d_k]

    return out,attn_scores

  def _split_heads(self,t):
    # [batch,seq_len,d_model] => [batch,heads,seq_len,d_k]
    return t.view(t.shape[0],t.shape[1],self.heads,self.d_k).transpose(1,2)

  def forward(self,x,mask=None,casual=True):
    """Self-attend over `x` ([batch,seq_len,d_model]); returns the same shape.

    The attention weights of the latest call are kept in `self.attention_score`.
    """
    query=self._split_heads(self.q_w(x))
    key=self._split_heads(self.k_w(x))
    value=self._split_heads(self.v_w(x))

    x,self.attention_score = DecoderMultiHeadAttention.attention(query, key, value,self.dropout, mask,casual)
    # [batch,heads,seq_len,d_k] --> [batch,seq_len,heads,d_k] --> [batch,seq_len,d_model]
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.heads * self.d_k)
    x=self.dropout(self.w_o(x))

    return x

In [None]:
class CrossMultiHeadAttention(nn.Module):
  """Multi-head cross-attention: queries from the decoder, keys/values from the encoder.

  Args:
    d_model: width of the input/output vectors; must be a multiple of `heads`.
    heads: number of attention heads, each working on d_model//heads features.
    dropout: drop probability applied to the attention weights and to the
      output projection.

  Raises:
    ValueError: if `d_model` is not divisible by `heads`.
  """

  def __init__(self,d_model,heads,dropout=0.1):
    super().__init__()
    # Otherwise heads*d_k != d_model and the .view() in forward() fails later
    # with an opaque shape error.
    if d_model%heads!=0:
      raise ValueError(f"d_model ({d_model}) must be divisible by heads ({heads})")
    self.heads=heads
    self.d_model=d_model
    self.d_k=d_model//heads

    self.q_w=nn.Linear(d_model,d_model,bias=False)
    self.k_w=nn.Linear(d_model,d_model,bias=False)
    self.v_w=nn.Linear(d_model,d_model,bias=False)
    self.w_o=nn.Linear(d_model,d_model,bias=False)
    self.dropout=nn.Dropout(dropout)

  @staticmethod
  def attention(query,key,value,dropout:nn.Dropout,mask=None):
    """Scaled dot-product attention of `query` over `key`/`value`.

    Args:
      query: [batch,heads,seq_len_query,d_k].
      key, value: [batch,heads,seq_len_key,d_k].
      dropout: module applied to the attention weights, or None to skip it.
      mask: optional; positions where the mask equals 0 are blocked. A 2-D
        mask is treated as [batch,seq_len_key].

    Returns:
      (output, attention weights) with shapes [batch,heads,seq_len_query,d_k]
      and [batch,heads,seq_len_query,seq_len_key].
    """
    d_k=query.shape[-1]
    attn_scores=(query@key.transpose(-2,-1))*d_k**-0.5  # [batch,heads,seq_len_query,seq_len_key]
    if mask is not None:  # [batch,seq_len_key]
      if mask.dim()==2:
        mask = mask.unsqueeze(1).unsqueeze(2) # [batch,1,1,seq_len_key]
      attn_scores=attn_scores.masked_fill(mask==0,float('-inf'))
    attn_scores=F.softmax(attn_scores,dim=-1)
    if dropout is not None:
      attn_scores=dropout(attn_scores)
    out=attn_scores@value # [batch,heads,seq_len_query,d_k]

    return out,attn_scores

  def _split_heads(self,t):
    # [batch,seq_len,d_model] => [batch,heads,seq_len,d_k]
    return t.view(t.shape[0],t.shape[1],self.heads,self.d_k).transpose(1,2)

  def forward(self,decoder_output,encoder_output,mask=None):
    """Attend from `decoder_output` over `encoder_output`.

    Both inputs are [batch,seq_len,d_model] (their seq_len may differ); the
    result has the shape of `decoder_output`. `mask` refers to encoder
    positions. The attention weights of the latest call are kept in
    `self.attention_score`.
    """
    query=self._split_heads(self.q_w(decoder_output))
    key=self._split_heads(self.k_w(encoder_output))
    value=self._split_heads(self.v_w(encoder_output))

    x,self.attention_score = CrossMultiHeadAttention.attention(query, key, value,self.dropout,mask)
    # [batch,heads,seq_len,d_k] --> [batch,seq_len,heads,d_k] --> [batch,seq_len,d_model]
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.heads * self.d_k)
    x=self.dropout(self.w_o(x))

    return x

In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: masked self-attention, cross-attention, feed-forward.

  Each of the three sub-layers is followed by its own add-and-norm step.
  """

  def __init__(self,d_model,heads):
    super().__init__()
    self.MSA=DecoderMultiHeadAttention(d_model,heads)
    self.cross_attention=CrossMultiHeadAttention(d_model,heads)
    self.FFN=FeedForward(d_model)
    self.ResidualConnection1=ResidualConnection(d_model)  # after self-attention
    self.ResidualConnection2=ResidualConnection(d_model)  # after cross-attention
    self.ResidualConnection3=ResidualConnection(d_model)  # after feed-forward

  def forward(self,x,encoder_output,encode_mask=None,mask=None,casual=True):
    """Run `x` through the layer.

    `mask` and `casual` go to the self-attention; `encode_mask` goes to the
    cross-attention over `encoder_output`.
    """
    # 1) masked self-attention over the decoder sequence
    after_self=self.ResidualConnection1(x,self.MSA(x,mask,casual))
    # 2) attention over the encoder output
    after_cross=self.ResidualConnection2(
        after_self,
        self.cross_attention(after_self,encoder_output,encode_mask),
    )
    # 3) position-wise feed-forward
    return self.ResidualConnection3(after_cross,self.FFN(after_cross))

In [None]:
class Decoder(nn.Module):
  """Stack of `layers` DecoderBlock modules applied one after another."""

  def __init__(self,d_model,heads,layers):
    super().__init__()
    self.layers=layers
    blocks=[DecoderBlock(d_model, heads) for _ in range(layers)]
    self.DecoderBlock=nn.ModuleList(blocks)

  def forward(self,x,encoder_output,encode_mask=None,mask=None,casual=True):
    """Feed `x` through every block in order; all other arguments are passed to each block unchanged."""
    hidden=x
    for block in self.DecoderBlock:
      hidden=block(hidden,encoder_output,encode_mask,mask,casual)
    return hidden

# Transformer

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder Transformer producing log-probabilities over the vocabulary.

  Args:
    vocab_size: size of the embedding tables and of the output layer (the
      same value is used for the source and the target side).
    d_model: model width.
    heads: number of attention heads per attention module.
    layers: number of encoder blocks and of decoder blocks.
  """

  def __init__(self,vocab_size,d_model,heads,layers):
    super().__init__()
    self.vocab_size=vocab_size
    self.d_model=d_model

    # Source-side embeddings
    self.inputembedding=InputEmbedding(vocab_size,d_model)
    self.inpositionalembedding=PositionalEncoding(d_model)

    # Target-side embeddings
    self.outembedding=InputEmbedding(vocab_size,d_model)
    self.outpositionalembedding=PositionalEncoding(d_model)

    self.encoder=Encoder(d_model,heads,layers)
    self.decoder=Decoder(d_model,heads,layers)

    self.fc=nn.Linear(in_features=d_model,out_features=vocab_size)

  def forward(self,encode,decode,encode_mask=None,decode_mask=None,casual=True):
    """Full pass: encode the source, decode the target, project to the vocabulary.

    Args:
      encode: source token ids.
      decode: target-side token ids fed to the decoder.
      encode_mask: optional source mask (0 = blocked); used by the encoder
        and by the decoder's cross-attention.
      decode_mask: optional mask for the decoder self-attention.
      casual: whether the decoder self-attention applies a causal mask.

    Returns:
      Log-softmax over the last axis of the output projection.
    """
    memory=self.encode(encode,encode_mask)

    decode=self.outembedding(decode)
    decode=self.outpositionalembedding(decode)
    decode=self.decoder(decode,memory,encode_mask,decode_mask,casual)

    out=F.log_softmax(self.fc(decode),dim=-1)
    return out

  def encode(self,encode,encode_mask=None):
    """Run only the encoder side and return its output.

    Previously this method took no arguments, read undefined names and
    returned nothing, so it could not be called successfully.

    Args:
      encode: source token ids.
      encode_mask: optional source mask (0 = blocked).
    """
    encode=self.inputembedding(encode)
    encode=self.inpositionalembedding(encode)
    return self.encoder(encode,encode_mask)

# Dataset

In [None]:
!pip install datasets



In [None]:
# Colab only: mount Google Drive so the tokenizer pickles and the cached
# dataset under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import pickle
from datasets import load_dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset,Dataset
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

In [None]:
# Load the English-Nepali parallel corpus via the `datasets` library.
dataset=load_dataset("CohleM/english-to-nepali")

In [None]:
# Display the dataset summary (splits, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'ne'],
        num_rows: 177334
    })
})

In [None]:
# Slice the rows first, then pick the columns: only the first 10,000 examples
# are materialised instead of the whole 'en' and 'ne' columns.
train_subset=dataset['train'][:10000]
english=train_subset['en']
nepali=train_subset['ne']

In [None]:
class Tokenizer:
    """Byte-level byte-pair-encoding (BPE) tokenizer.

    After `train`, `self.vocab` is a dict with two entries:
      'vocab'  : token id -> bytes for that token (ids 0..255 are the raw bytes)
      'merges' : (id, id) pair -> id of the merged token
    `encode`/`decode` require these entries to be present.
    """

    def __init__(self):
        self.vocab = {}

    def load(self, path):
        """Replace `self.vocab` with the object unpickled from `path`.

        The file is presumably a dict previously returned by `train` and
        pickled — verify against whatever produced it.
        """
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file; only load tokenizer files from a trusted source.
        with open(path, "rb") as file:
            self.vocab = pickle.load(file)

    def train(self, text, vocab_size=1000):
        """Learn up to `vocab_size - 256` merges from `text`.

        Args:
            text: training string; it is encoded to UTF-8 bytes first.
            vocab_size: target vocabulary size including the 256 byte tokens.

        Returns:
            The vocab dict, which is also stored on `self.vocab`.
        """
        ids = list(text.encode("utf-8"))
        num_merges = vocab_size - 256
        merges = {}

        for i in range(num_merges):
            stats = self.get_stats(ids)
            # Fewer than two tokens left (very short text, or everything is
            # already merged): nothing more to merge, and max() on an empty
            # dict would raise ValueError.
            if not stats:
                break
            pair = max(stats, key=stats.get)
            idx = 256 + i
            ids = self.merge(ids, pair, idx)
            merges[pair] = idx

        self.vocab['vocab'] = {idx: bytes([idx]) for idx in range(256)}
        # Merges are stored in creation order, so both halves of a pair are
        # already in the table when the merged token is built.
        for (p0, p1), idx in merges.items():
            self.vocab['vocab'][idx] = self.vocab['vocab'][p0] + self.vocab['vocab'][p1]

        self.vocab['merges'] = merges

        return self.vocab

    def get_stats(self, ids):
        """Count how often each adjacent pair occurs in `ids`."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Return a copy of `ids` with every non-overlapping `pair` replaced by `idx`."""
        new_ids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                new_ids.append(idx)
                i += 2
            else:
                new_ids.append(ids[i])
                i += 1
        return new_ids

    def decode(self, ids):
        """Turn token ids back into text; invalid UTF-8 is replaced, not raised."""
        token = b"".join(self.vocab['vocab'][idx] for idx in ids)
        text = token.decode("utf-8", errors="replace")
        return text

    def encode(self, text):
        """Encode `text` into token ids by applying the learned merges."""
        token = list(text.encode("utf-8"))
        while len(token) >= 2:
            stats = self.get_stats(token)
            # Pick the present pair with the lowest merge id, i.e. the one
            # learned earliest; unknown pairs rank as infinity.
            pair = min(stats, key=lambda p: self.vocab['merges'].get(p, float("inf")))
            if pair not in self.vocab['merges']:
                break
            idx = self.vocab['merges'][pair]
            token = self.merge(token, pair, idx)
        return token


In [None]:
# English tokenizer: restore its pickled vocabulary from Drive.
en_tokenizer=Tokenizer()
en_tokenizer.load("/content/drive/MyDrive/Tokenizer/English_Tokenizer_500.pkl")

In [None]:
# Nepali tokenizer: restore its pickled vocabulary from Drive.
np_tokenizer=Tokenizer()
np_tokenizer.load("/content/drive/MyDrive/Tokenizer/Nepali_Tokenizer_500.pkl")

In [None]:
# Register the special tokens in both vocabularies so decode() can render them:
# 1002 = start of sequence, 1001 = end of sequence, 0 = padding.
# NOTE(review): assigning id 0 replaces whatever the loaded vocabulary held
# for id 0 (the 0x00 byte if it was produced by Tokenizer.train) — presumably
# that byte never occurs in the text; confirm.
en_tokenizer.vocab['vocab'][1002] = b'<sos>'
en_tokenizer.vocab['vocab'][1001]=b'<eos>'
en_tokenizer.vocab['vocab'][0] = b'<pad>'

np_tokenizer.vocab['vocab'][1002] = b'<sos>'
np_tokenizer.vocab['vocab'][1001]=b'<eos>'
np_tokenizer.vocab['vocab'][0] = b'<pad>'

In [None]:
# english_tensor = [torch.tensor([1002]+en_tokenizer.encode(i) + [1001]) for i in tqdm(english)]
# nepali_tensor = [torch.tensor([1002]+np_tokenizer.encode(i) + [1001]) for i in tqdm(nepali)]

In [None]:
# len(english_tensor),len(nepali_tensor)

In [None]:
# dataset=list(zip(english_tensor,nepali_tensor))

In [None]:
# torch.save(dataset, '/content/drive/MyDrive/dataset/dataset.pt')

# Dataloader

In [None]:
# Cached list of (english_ids, nepali_ids) tensor pairs (see the commented-out
# cells above that built it). weights_only=True restricts unpickling to
# tensors and plain containers, so the file cannot run arbitrary code.
dataset=torch.load('/content/drive/MyDrive/dataset/dataset.pt',weights_only=True)

  dataset=torch.load('/content/drive/MyDrive/dataset/dataset.pt')


In [None]:
def collate_fn(batch, pad_id=0):
    """Pad a batch of (english_ids, nepali_ids) pairs and build padding masks.

    Args:
        batch: sequence of (english, nepali) pairs of 1-D id tensors.
        pad_id: id used to fill the padded positions; the masks are derived
            from the same value so the two can never disagree. Any real token
            equal to `pad_id` is masked as well.

    Returns:
        (padded_english, english_mask, padded_nepali, nepali_mask). The padded
        tensors are [batch, longest sequence in the batch]; the masks are int
        tensors of the same shape with 1 on real tokens and 0 on padding.
    """
    english_batch, nepali_batch = zip(*batch)

    # pad_sequence already pads every sequence to the longest one in the batch.
    padded_english_batch = pad_sequence(english_batch, batch_first=True, padding_value=pad_id)
    padded_nepali_batch = pad_sequence(nepali_batch, batch_first=True, padding_value=pad_id)

    x_padding_mask = (padded_english_batch != pad_id).int()
    y_padding_mask = (padded_nepali_batch != pad_id).int()

    return padded_english_batch,x_padding_mask, padded_nepali_batch,y_padding_mask


In [None]:
# Batches of 32 sentence pairs, padded per batch by collate_fn; drop_last
# discards a final batch smaller than 32.
# NOTE(review): shuffle is left at its default (False), so every epoch visits
# the batches in the same order — consider shuffle=True for training.
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, drop_last=True)

In [None]:
# Pull the first batch for a quick shape check and a smoke-test forward pass.
x_train,x_train_mask,y_train,y_train_mask=next(iter(dataloader))

In [None]:
# Shapes of the padded id tensors and their masks for the sample batch.
x_train.shape,x_train_mask.shape,y_train.shape,y_train_mask.shape

(torch.Size([32, 144]),
 torch.Size([32, 144]),
 torch.Size([32, 143]),
 torch.Size([32, 143]))

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Small model used only for the forward-pass smoke test below
# (168 / 8 heads = 21 features per head).
model=Transformer(vocab_size=1003,d_model=168,heads=8,layers=2)

In [None]:
# Move the model and the sample batch onto the selected device.
model=model.to(device)
x_train,x_train_mask,y_train,y_train_mask=x_train.to(device),x_train_mask.to(device),y_train.to(device),y_train_mask.to(device)

In [None]:
# Single forward pass on the sample batch to check that the shapes line up.
out=model(x_train,y_train,x_train_mask,y_train_mask)

In [None]:
# Output shape; the last axis is the vocabulary size (1003).
out.shape

torch.Size([32, 143, 1003])

In [None]:
# heads must divide d_model: 300 is not divisible by 8 (the per-head reshape
# in the attention modules would fail), so 6 heads of 50 features are used.
model=Transformer(vocab_size=1003,d_model=300,heads=6,layers=6).to(device)
optimizer=optim.AdamW(model.parameters(), lr=0.001,weight_decay=0.01)
# The model returns log-probabilities, hence NLLLoss. Targets equal to the
# padding id (0) are excluded so padding does not contribute to the loss.
criterion = nn.NLLLoss(ignore_index=0)

In [None]:
train_loss=[]

for i in range(50):
  model.train()
  batch_train_loss=[]
  for batch in tqdm(dataloader,leave=False):
    x_train=batch[0].to(device)
    x_train_mask=batch[1].to(device)
    y_train=batch[2].to(device)
    y_train_mask=batch[3].to(device)

    # Teacher forcing: the decoder sees tokens 0..T-2 and is trained to
    # predict tokens 1..T-1. Feeding and scoring the same unshifted sequence
    # would let the model simply copy its input at every position.
    decoder_input=y_train[:,:-1]
    decoder_mask=y_train_mask[:,:-1]
    target=y_train[:,1:]

    output=model(x_train,decoder_input,x_train_mask,decoder_mask)
    # reshape (not view): the shifted target slice is not contiguous.
    loss = criterion(output.reshape(-1,output.size(-1)),target.reshape(-1))
    batch_train_loss.append(loss.item())

    optimizer.zero_grad()
    loss.backward()

    optimizer.step()

  # Mean of the per-batch losses for this epoch.
  epoch_loss=sum(batch_train_loss)/len(batch_train_loss)
  train_loss.append(epoch_loss)
  print(f"Epoch={i}\tTrain Loss={epoch_loss}")




OutOfMemoryError: CUDA out of memory. Tried to allocate 168.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 21.06 MiB is free. Process 7391 has 14.72 GiB memory in use. Of the allocated memory 13.90 GiB is allocated by PyTorch, and 713.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Shape of the model output from the most recent training batch.
output.shape

In [None]:
import matplotlib.pyplot as plt

# One point per epoch: the mean batch loss recorded by the training loop.
fig, ax = plt.subplots()
ax.plot(train_loss,label="Train Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Mean training loss")
ax.set_title("Training loss per epoch")
ax.legend()
plt.show()

In [None]:
eng_text = english[0]
target = nepali[0]

# Switch to inference mode so the dropout layers are disabled while sampling.
model.eval()

with torch.no_grad():
    # Source sentence wrapped in <sos>=1002 ... <eos>=1001, shape [1, src_len].
    input_tensor = torch.tensor([1002] + en_tokenizer.encode(eng_text) + [1001]).unsqueeze(0).to(device)
    decoder_input = torch.tensor([[1002]], device=device)  # Shape: [1, 1]

    predicted_tokens = []

    # Generate at most 10 tokens, one per step.
    for _ in range(10):
        output = model(input_tensor, decoder_input)
        output = output[:, -1, :]  # log-probabilities for the next token

        probs = torch.exp(output)
        probs[:, 0] = 0  # never sample the padding id
        next_token = torch.multinomial(probs, num_samples=1)

        predicted_tokens.append(next_token.squeeze().item())

        if next_token.item() == 1001:
            break

        decoder_input = torch.cat([decoder_input, next_token], dim=1)

    predicted_text = np_tokenizer.decode(predicted_tokens)

# Output the results
print(f"Input: {eng_text}")
print(f"Target: {target}")
print(f"Predicted: {predicted_text}")
