In [None]:
!pip install datasets



In [None]:
import torch
import pickle
from datasets import load_dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset,Dataset
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

In [None]:
# Fetch the English-Nepali corpus; the cells below read its 'train' split ('en' / 'ne' columns).
# NOTE: `dataset` is rebound further down to the object loaded with torch.load, so the
# slicing cell that follows must run before that later cell.
dataset=load_dataset("CohleM/english-to-nepali")

In [None]:
# Slice the rows first: indexing a column and then slicing it materialises the whole
# column just to keep the first 20000 entries. A row slice returns a dict of column -> list.
train_subset = dataset['train'][:20000]
english = train_subset['en']
nepali = train_subset['ne']

In [None]:
class Tokenizer:
    """Byte-level BPE tokenizer.

    After ``train`` the state in ``self.vocab`` is a dict with two entries:
    ``'vocab'`` maps token id -> bytes, and ``'merges'`` maps an (id, id) pair to
    the id that replaces it. ``load`` is expected to restore the same layout.
    """

    def __init__(self):
        self.vocab = {}

    def load(self, path):
        """Replace ``self.vocab`` with the object pickled at ``path``."""
        # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
        with open(path, "rb") as file:
            self.vocab = pickle.load(file)

    def train(self, text, vocab_size=1000):
        """Learn up to ``vocab_size - 256`` merges from ``text``.

        Args:
            text: training corpus as a single string.
            vocab_size: target number of tokens (the 256 raw bytes plus merges).

        Returns:
            ``self.vocab`` with freshly built ``'vocab'`` and ``'merges'`` entries.
            Fewer merges than requested are learned when the text collapses to a
            single token before the target size is reached.
        """
        ids = list(text.encode("utf-8"))
        merges = {}

        for i in range(vocab_size - 256):
            stats = self.get_stats(ids)
            if not stats:
                # Fewer than two tokens left: nothing to merge (max() would raise).
                break
            pair = max(stats, key=stats.get)
            idx = 256 + i
            ids = self.merge(ids, pair, idx)
            merges[pair] = idx

        # Ids 0..255 are the raw bytes; each merged id is the concatenation of its parts.
        # Merges are replayed in learning order, so both parts always exist already.
        self.vocab['vocab'] = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in merges.items():
            self.vocab['vocab'][idx] = self.vocab['vocab'][p0] + self.vocab['vocab'][p1]

        self.vocab['merges'] = merges

        return self.vocab

    def get_stats(self, ids):
        """Return a dict counting every adjacent (left, right) pair in ``ids``."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with each left-to-right occurrence of ``pair`` replaced by ``idx``."""
        new_ids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                new_ids.append(idx)
                i += 2
            else:
                new_ids.append(ids[i])
                i += 1
        return new_ids

    def decode(self, ids):
        """Turn token ids back into text; invalid UTF-8 is replaced, not raised."""
        token = b"".join(self.vocab['vocab'][idx] for idx in ids)
        return token.decode("utf-8", errors="replace")

    def encode(self, text):
        """Encode ``text`` into token ids by applying the learned merges."""
        merges = self.vocab['merges']
        token = list(text.encode("utf-8"))
        while len(token) >= 2:
            stats = self.get_stats(token)
            # Apply the merge with the lowest id first, i.e. the one learned earliest.
            pair = min(stats, key=lambda p: merges.get(p, float("inf")))
            if pair not in merges:
                break
            token = self.merge(token, pair, merges[pair])
        return token


In [None]:
# Mount Google Drive at /content/drive; the tokenizer and dataset files loaded below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# en_tokenizer=Tokenizer()
# en_tokenizer.load("/content/drive/MyDrive/English_Nepali/English_Tokenizer_500.pkl")

In [None]:
# np_tokenizer=Tokenizer()
# np_tokenizer.load("/content/drive/MyDrive/English_Nepali/Nepali_Tokenizer_500.pkl")

In [None]:
# en_tokenizer.vocab['vocab'][1002] = b'<sos>'
# en_tokenizer.vocab['vocab'][1001]=b'<eos>'
# en_tokenizer.vocab['vocab'][1000] = b'<pad>'

# np_tokenizer.vocab['vocab'][1002] = b'<sos>'
# np_tokenizer.vocab['vocab'][1001]=b'<eos>'
# np_tokenizer.vocab['vocab'][1000] = b'<pad>'

In [None]:
# Restore a previously saved tokenizer state from Drive (unpickled by Tokenizer.load).
# This one tokenizer sizes both the encoder and the decoder vocabulary further down.
tokenizer=Tokenizer()
tokenizer.load("/content/drive/MyDrive/English_Nepali/English_Nepali")

In [None]:
# Register the control tokens on ids 1000-1002 so decode() can render them.
tokenizer.vocab['vocab'].update({
    1002: b'<sos>',
    1001: b'<eos>',
    1000: b'<pad>',
})

In [None]:
# dataset=torch.load('/content/drive/MyDrive/English_Nepali/dataset.pt')

  dataset=torch.load('/content/drive/MyDrive/English_Nepali/dataset.pt')


In [None]:
# Load the saved training data from Drive; collate_fn below unpacks each item as an
# (english, nepali) pair of sequences.
# NOTE: this rebinds `dataset`, replacing the object returned by load_dataset earlier.
# NOTE(review): torch.load unpickles the file, so only load trusted files. Newer PyTorch
# releases change the default of `weights_only` - confirm this file still loads there.
dataset=torch.load('/content/drive/MyDrive/English_Nepali/one_token_dataset.pt')

  dataset=torch.load('/content/drive/MyDrive/English_Nepali/one_token_dataset.pt')


In [None]:
def collate_fn(batch, pad_id=1000):
    """Pad a batch of (english, nepali) sequence pairs into two rectangular tensors.

    Args:
        batch: sequence of (english, nepali) pairs of 1-D token-id tensors.
        pad_id: value written past the end of shorter sequences. The default, 1000,
            is the id registered as ``<pad>`` on the tokenizer.

    Returns:
        Tuple ``(english, nepali)``; each tensor is (batch, longest sequence in the batch).
    """
    english_batch, nepali_batch = zip(*batch)

    # pad_sequence already pads every sequence to the longest one in the batch,
    # so no second padding pass is needed.
    padded_english_batch = pad_sequence(english_batch, batch_first=True, padding_value=pad_id)
    padded_nepali_batch = pad_sequence(nepali_batch, batch_first=True, padding_value=pad_id)

    return padded_english_batch, padded_nepali_batch

# Reshuffle every epoch so the model does not see the same batches in the same order
# each time; drop_last keeps every batch at exactly 32 examples.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, drop_last=True)

In [None]:
# Pull one batch to sanity-check the padded shapes: (english batch, nepali batch).
data=next(iter(dataloader))
data[0].shape,data[1].shape

(torch.Size([32, 172]), torch.Size([32, 161]))

In [None]:
class EncoderRNN(nn.Module):
    """Embeds source token ids and runs them through a single-layer, batch-first GRU."""

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        """
        Args:
            input_size: number of rows in the embedding table (source vocabulary size).
            hidden_size: width of both the embeddings and the GRU state.
            dropout_p: dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU width, so no projection sits in between.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.RNN = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Return ``(outputs, hidden)`` from the GRU for the token-id tensor ``x``."""
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        return self.RNN(embedded)

In [None]:
class Attention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)  # projects the query
        self.Ua = nn.Linear(hidden_size, hidden_size)  # projects the keys
        self.Va = nn.Linear(hidden_size, 1)            # collapses to one score per key

    def forward(self, query, keys):
        """Attend over ``keys`` with ``query``.

        The decoder in this notebook passes ``query`` as (batch, 1, hidden) and
        ``keys`` as (batch, src_len, hidden).

        Returns:
            ``(context, weights)``: the weighted sum of ``keys`` and the softmax
            weights over the key positions.
        """
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)
        # The query broadcasts across the key positions before the tanh.
        energy = torch.tanh(projected_query + projected_keys)

        # (batch, src_len, 1) -> (batch, 1, src_len) so bmm can weight the keys.
        scores = self.Va(energy).squeeze(2).unsqueeze(1)
        weights = F.softmax(scores, dim=-1)

        context = torch.bmm(weights, keys)
        return context, weights

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, sos_token_id=1002, max_length=20):
        """
        Args:
            hidden_size: width of the embeddings, the attention and the GRU state.
            output_size: target vocabulary size (embedding rows and output logits).
            dropout_p: dropout probability applied to the input embeddings.
            sos_token_id: id fed to the decoder as its first input.
            max_length: number of steps to decode when no target is supplied.
        """
        super().__init__()
        self.sos_token_id = sos_token_id
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = Attention(hidden_size)
        # The GRU input is the token embedding concatenated with the attention context.
        self.RNN = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode step by step, teacher-forced when ``target_tensor`` is given.

        Args:
            encoder_outputs: per-position encoder states, batch first.
            encoder_hidden: final encoder hidden state, used as the initial decoder state.
            target_tensor: optional (batch, steps) target ids. When given, it fixes the
                number of steps and supplies each next input; otherwise the decoder
                feeds back its own argmax for ``max_length`` steps.

        Returns:
            ``(log_probs, hidden, attentions)``, with the per-step results
            concatenated along dim 1.
        """
        num_steps = target_tensor.shape[1] if target_tensor is not None else self.max_length
        batch_size = encoder_outputs.size(0)
        # Create the start tokens on the same device as the encoder outputs, so the
        # module does not depend on a notebook-global `device` variable.
        decoder_input = torch.full(
            (batch_size, 1),
            self.sos_token_id,
            dtype=torch.long,
            device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(num_steps):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: the next input is the ground-truth token.
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Greedy decoding: feed back the most likely token.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one decoding step; returns ``(logits, new_hidden, attention_weights)``."""
        embedded = self.dropout(self.embedding(input))
        # GRU hidden is (layers, batch, hidden); attention wants batch first.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_rnn = torch.cat((embedded, context), dim=2)

        output, hidden = self.RNN(input_rnn, hidden)
        output = self.out(output)

        return output, hidden, attn_weights


In [None]:
# Use the GPU when one is available; models and batches are moved to this device below.
device="cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Number of token ids, counting the three special tokens registered above.
len(tokenizer.vocab['vocab'])

1003

In [None]:
hidden_size = 128
vocab_size = len(tokenizer.vocab['vocab'])  # the same tokenizer serves source and target
pad_token_id = 1000  # id that collate_fn pads with (registered as <pad> on the tokenizer)

encoder = EncoderRNN(vocab_size, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, vocab_size).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.01)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.01)
# Exclude padded target positions from the loss; otherwise <pad> dominates the average
# and the model is rewarded for predicting padding.
criterion = nn.NLLLoss(ignore_index=pad_token_id)

In [None]:
# Mean training loss per epoch.
train_loss = []

for epoch in range(15):
    encoder.train()
    decoder.train()
    batch_train_loss = []

    for batch in tqdm(dataloader):
        x_train, y_train = batch[0].to(device), batch[1].to(device)

        # Clear gradients left over from the previous step.
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Teacher-forced forward pass: the decoder receives the target sequence.
        encoder_outputs, encoder_hidden = encoder(x_train)
        decoder_outputs, decoder_hidden, _ = decoder(encoder_outputs, encoder_hidden, y_train)

        # Flatten (batch, steps, vocab) -> (batch * steps, vocab) for the token-level loss.
        flat_log_probs = decoder_outputs.view(-1, decoder_outputs.size(-1))
        flat_targets = y_train.view(-1)
        loss = criterion(flat_log_probs, flat_targets)

        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        batch_train_loss.append(loss.item())

    epoch_loss = sum(batch_train_loss) / len(batch_train_loss)
    train_loss.append(epoch_loss)
    print(f"Epoch={epoch}\tTrain Loss={epoch_loss}")


100%|██████████| 625/625 [05:00<00:00,  2.08it/s]


Epoch=0	Train Loss=1.3677364879131317


  9%|▉         | 57/625 [00:22<03:53,  2.43it/s]

In [None]:
# Greedy-decode the first sentence and compare it with its reference translation.
encoder.eval()
decoder.eval()

eng_text = english[0]
target = nepali[0]

with torch.no_grad():
    # Append <eos> (1001) to the source ids before encoding.
    source_ids = tokenizer.encode(eng_text) + [1001]
    input_tensor = torch.tensor(source_ids).unsqueeze(0).to(device)

    encoder_outputs, encoder_hidden = encoder(input_tensor)
    decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

    _, topi = decoder_outputs.topk(1)
    decoded_ids = topi.squeeze()

    # Keep tokens up to and including the first <eos>.
    decoded_words = []
    for token_id in decoded_ids.tolist():
        decoded_words.append(token_id)
        if token_id == 1001:
            break

print(f"input:{eng_text}")
print(f"target:{target}")
print(f"predict:{tokenizer.decode(decoded_words)}")

input:Technical committees will be attached to each ministry.
target:प्रत्येक मन्त्रालय अन्तर्गत शिल्प (टेक्निकल) कमिटीहरु गठन गरिनेछन्
predict:तपाईंले तपाईंका सेवकहरू सुलेमानले आफ्ना सुलेमानले आफ्न
