In [54]:
import datasets
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import re
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [55]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [56]:
# import gdown
# gdown.download(url="https://github.com/Text-Mining/Persian-Wikipedia-Corpus/raw/master/models/glove/vectors.zip",output="vectors.zip")
# !unzip "vectors.zip"

In [57]:
SOS_token = 0
EOS_token = 1
PAD_token = 2
MAX_LENGTH = 0

class Lang:
   def __init__(self):
       self.word2index = {}
       self.word2count = {}
       self.index2word = {0: "SOS", 1: "EOS", 2:"PAD"} #
       self.n_words = 3  

   def addSentence(self, sentence):
       for word in sentence.split(' '):
           self.addWord(word)

   def addWord(self, word):
       if word not in self.word2index:
           self.word2index[word] = self.n_words
           self.word2count[word] = 1
           self.index2word[self.n_words] = word
           self.n_words += 1
       else:
           self.word2count[word] += 1

In [58]:
def processing(lang1,lang2):
   global MAX_LENGTH
   df = pd.read_csv('ferdousi.txt', header=None, names=['Beyt'])[2::]
   d1 = df[0::2].values[:,0].tolist()
   d2 = df[0::2].values[:,0].tolist()
   df = pd.DataFrame({lang1:d1 , lang2:d2 })
   source = Lang()
   target = Lang()
   pairs = []
   sen1 = df[lang1]
   sen2 = df[lang2]
   
   for i in range(len(df)):
      if len(sen1[i].split(' ')) > MAX_LENGTH:
         MAX_LENGTH = len(sen1[i].split(' '))
   
      source.addSentence(sen1[i])
      target.addSentence(sen2[i])
      pairs.append([sen1[i], sen2[i]])
   
   return source, target, pairs

source, target, pairs = processing('M1', 'M2')

In [59]:
def indexesFromSentence(lang, sentence):
   return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
   global MAX_LENGTH
   global PAD_token
   indexes = indexesFromSentence(lang, sentence)
   pad_list=[PAD_token for i in range(MAX_LENGTH-len(indexes))]
   indexes =  indexes  + pad_list
   indexes.append(EOS_token)
   return torch.tensor(indexes, dtype=torch.long,device=device).view(-1, 1)

def tensorsFromPair(input_lang, output_lang, pair):
   input_tensor = tensorFromSentence(input_lang, pair[0])
   target_tensor = tensorFromSentence(output_lang, pair[1])
   return (input_tensor, target_tensor)

In [60]:
class Encoder(nn.Module):
   def __init__(self, input_dim, hidden_dim, embbed_dim, num_layers,GRU=False):
       super(Encoder, self).__init__()
      
       self.input_dim = input_dim
       self.embbed_dim = embbed_dim
       self.hidden_dim = hidden_dim
       self.num_layers = num_layers

       self.embedding = nn.Embedding(input_dim, self.embbed_dim,padding_idx=2) 
       if GRU:
        self.recurrent = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)
       else:
        self.recurrent = nn.LSTM(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)
              
   def forward(self, src):
       embedded = self.embedding(src).view(1,1,-1)
       outputs, hidden = self.recurrent(embedded)
       return outputs, hidden
   def initHidden(self):
     return torch.zeros(1, 1, self.hidden_dim, device=device)

In [61]:
class Decoder(nn.Module):
   def __init__(self, output_dim, hidden_dim, embbed_dim, num_layers,GRU=False):
       super(Decoder, self).__init__()

       self.embbed_dim = embbed_dim
       self.hidden_dim = hidden_dim
       self.output_dim = output_dim
       self.num_layers = num_layers

       self.embedding = nn.Embedding(output_dim, self.embbed_dim ,padding_idx=2) # 
       
       if GRU:
        self.recurrent = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)
       else :
        self.recurrent=nn.LSTM(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)

       self.out = nn.Linear(self.hidden_dim, output_dim)
       self.softmax = nn.LogSoftmax(dim=1)
      
   def forward(self, input, hidden):

       input = input.view(1, -1)
       embedded = F.relu(self.embedding(input))
       output, hidden = self.recurrent(embedded, hidden)       
       prediction = self.softmax(self.out(output[0]))
      
       return prediction, hidden
   def initHidden(self):
      return torch.zeros(1, 1, self.hidden_dim, device=device)

In [62]:
class Seq2Seq(nn.Module):
   def __init__(self, encoder, decoder, device, MAX_LENGTH=MAX_LENGTH,mode='GRU'):
       super().__init__()
      
       self.encoder = encoder
       self.decoder = decoder
       self.device = device
       self.mode = mode
   def forward(self, source, target, teacher_forcing_ratio=0.5):

       input_length = source.size(0) 
       batch_size = target.shape[1] 
       target_length = target.shape[0]
       vocab_size = self.decoder.output_dim
      
       outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)

       for i in range(input_length):
           encoder_output, encoder_hidden = self.encoder(source[i])
       
       if self.mode == "GRU":
         decoder_hidden = encoder_hidden.to(device)
       elif self.mode == "LSTM":
         decoder_hidden = (encoder_hidden[0].to(device),encoder_hidden[1].to(device))
       
       decoder_input = torch.tensor([SOS_token], device=device) 

       for t in range(target_length):   
           decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
           outputs[t] = decoder_output
           teacher_force = random.random() < teacher_forcing_ratio
           topv, topi = decoder_output.topk(1)
           input = (target[t] if teacher_force else topi)
           if(teacher_force == False and input.item() == EOS_token):
               break

       return outputs

In [63]:
def evaluate(model, input_lang, output_lang, sentences, MAX_LENGTH=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentences[0])
        output_tensor = tensorFromSentence(output_lang, sentences[1])
        decoded_words = []
        output = model(input_tensor, output_tensor)
        for ot in range(output.size(0)):
            topv, topi = output[ot].topk(1)
            if topi[0].item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi[0].item()])
    return decoded_words
    

In [64]:
teacher_forcing_ratio = 0.5
metric = datasets.load_metric('sacrebleu')

# GRU

In [65]:
source, target, pairs = processing('M1', 'M2')
print(f'random sentence {random.choice(pairs)}')
print('Input : {} Output : {}'.format(source.n_words, target.n_words))

random sentence ['نکردند زیشان کسی آفرین', 'نکردند زیشان کسی آفرین']
Input : 12687 Output : 12687


In [66]:
encoder = Encoder(source.n_words, 512, 256, 1,GRU=True)
decoder = Decoder(target.n_words, 512, 256, 1,GRU=True)
model = Seq2Seq(encoder, decoder, device,mode='GRU').to(device)

In [67]:
model.train()
optimizer = optim.Adam(model.parameters(), lr=0.001,weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()
total_loss_iterations = 0
final_score = 0
training_pairs = [tensorsFromPair(source, target, random.choice(pairs))
                    for i in range(100000)]

for iter in tqdm(range(1, 100000+1)):
    training_pair = training_pairs[iter - 1]
    optimizer.zero_grad()
    input_length = training_pair[0].size(0)
    loss = 0
    epoch_loss = 0
    output = model(training_pair[0], training_pair[1],teacher_forcing_ratio)
    num_iter = output.size(0)
    for ot in range(num_iter):
        loss += criterion(output[ot], training_pair[1][ot])
    metric.add(predictions=output, references=training_pair[1])
    loss.backward()
    optimizer.step()
    loss = loss.item() / num_iter
    s  = metric.compute()
    final_score += s['score']
    total_loss_iterations += loss
    if iter % 10000 == 0:
        avarage_loss= total_loss_iterations / 10000
        avg_score = final_score /10000
        total_loss_iterations = 0
        tqdm.write(f'iter :{iter} , Loss : {avarage_loss} , score :{avg_score}')
        torch.save(model.state_dict(), f'GRU_Model.pt')
        


 10%|█         | 10004/100000 [02:53<29:28, 50.90it/s]

iter :10000 , Loss : 3.544348538351059 , score :0.29426069483185535


 20%|██        | 20006/100000 [05:48<27:23, 48.66it/s]

iter :20000 , Loss : 3.380070158545189 , score :0.5793217455610776


 30%|███       | 30002/100000 [08:42<24:09, 48.29it/s]

iter :30000 , Loss : 3.349424096743259 , score :0.8643799903933321


 40%|████      | 40003/100000 [11:37<20:52, 47.88it/s]

iter :40000 , Loss : 3.350461881113033 , score :1.1494347609575444


 50%|█████     | 50004/100000 [14:32<18:20, 45.45it/s]

iter :50000 , Loss : 3.3423789758364437 , score :1.434314864185007


 60%|██████    | 60002/100000 [17:26<13:55, 47.88it/s]

iter :60000 , Loss : 3.3451814335982015 , score :1.7184840942340827


 70%|███████   | 70006/100000 [20:22<10:17, 48.60it/s]

iter :70000 , Loss : 3.340851335668563 , score :2.002457300241479


 80%|████████  | 80004/100000 [23:17<07:17, 45.66it/s]

iter :80000 , Loss : 3.3400858798027175 , score :2.2864183024174864


 90%|█████████ | 90007/100000 [26:11<03:19, 49.97it/s]

iter :90000 , Loss : 3.3364618187745285 , score :2.5696787283978413


100%|██████████| 100000/100000 [29:07<00:00, 57.23it/s]

iter :100000 , Loss : 3.325283986695601 , score :2.853174840871889





In [68]:
for i in range(10):
    pair = random.choice(pairs)
    print('source {}'.format(pair[0]))
    print('target {}'.format(pair[1]))
    output_words = evaluate(model, source, target, pair)
    output_sentence = ' '.join(output_words)
    print('predicted {}'.format(output_sentence))

source ابر چشم او راست کن هر دو دست
target ابر چشم او راست کن هر دو دست
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source تو خون سر بیگناهان مریز
target تو خون سر بیگناهان مریز
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source برادر ندیدیم هرگز دو شاه
target برادر ندیدیم هرگز دو شاه
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source چو هنگام برگشتن شاه بود
target چو هنگام برگشتن شاه بود
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source چنان بد که ابلیس روزی پگاه
target چنان بد که ابلیس روزی پگاه
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source ازان پس همه فیلسوفان شهر
target ازان پس همه فیلسوفان شهر
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source بپرسید کز برتری کارها
target بپرسید کز برتری کارها
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source بگرداندش سر ز یزدان پاک
target بگرداندش سر ز یزدان پاک
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source چو جنبیدن شاه کردم درست
target چو جنبیدن 

# LSTM

In [69]:
source, target, pairs = processing('M1', 'M2')
print(f'random sentence {random.choice(pairs)}')
print('Input : {} Output : {}'.format(source.n_words, target.n_words))

random sentence ['چومن زینهاری بود ننگ تو', 'چومن زینهاری بود ننگ تو']
Input : 12687 Output : 12687


In [71]:
encoder = Encoder(source.n_words, 512, 256, num_layers=1,GRU=False)
decoder = Decoder(target.n_words, 512, 256, num_layers=1,GRU=False)
model = Seq2Seq(encoder, decoder, device,mode='LSTM').to(device)

In [72]:
model.train()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
total_loss_iterations = 0
final_score = 0
training_pairs = [tensorsFromPair(source, target, random.choice(pairs))
                    for i in range(100000)]

for iter in tqdm(range(1, 100000+1)):
    training_pair = training_pairs[iter - 1]
    optimizer.zero_grad()
    input_length = training_pair[0].size(0)
    loss = 0
    epoch_loss = 0
    output = model(training_pair[0], training_pair[1],teacher_forcing_ratio)
    num_iter = output.size(0)
    for ot in range(num_iter):
        loss += criterion(output[ot], training_pair[1][ot])
    metric.add(predictions=output, references=training_pair[1])
    loss.backward()
    optimizer.step()
    loss = loss.item() / num_iter
    s  = metric.compute()
    final_score += s['score']
    total_loss_iterations += loss
    if iter % 10000 == 0:
        avarage_loss = total_loss_iterations / 10000
        avg_score = final_score /10000
        total_loss_iterations = 0
        tqdm.write(f'iter :{iter} , Loss : {avarage_loss} , score :{avg_score}')
        torch.save(model.state_dict(), f'LSTM_Model.pt')
        

 10%|█         | 10007/100000 [03:00<29:01, 51.69it/s]

iter :10000 , Loss : 3.470179303646088 , score :0.29021072247338614


 20%|██        | 20004/100000 [06:01<28:12, 47.27it/s]

iter :20000 , Loss : 3.4210155901749926 , score :0.5751614105561029


 30%|███       | 30005/100000 [09:02<26:32, 43.94it/s]

iter :30000 , Loss : 3.473370235919937 , score :0.8587498145045047


 40%|████      | 40005/100000 [12:03<21:29, 46.53it/s]

iter :40000 , Loss : 3.502995540046694 , score :1.1409924066143318


 50%|█████     | 50002/100000 [15:03<19:01, 43.79it/s]

iter :50000 , Loss : 3.5259281517346643 , score :1.4225587882500546


 60%|██████    | 60001/100000 [18:04<14:50, 44.90it/s]

iter :60000 , Loss : 3.55907597169876 , score :1.7040646270148914


 70%|███████   | 70003/100000 [21:05<11:16, 44.36it/s]

iter :70000 , Loss : 3.5835821776548897 , score :1.9856444379818203


 80%|████████  | 80004/100000 [24:06<07:33, 44.12it/s]

iter :80000 , Loss : 3.585838124910987 , score :2.2671662098443357


 90%|█████████ | 90003/100000 [27:07<03:36, 46.23it/s]

iter :90000 , Loss : 3.567206091976192 , score :2.5487665648420297


100%|██████████| 100000/100000 [30:08<00:00, 55.31it/s]

iter :100000 , Loss : 3.5860943871498216 , score :2.83020404350871





In [73]:
for i in range(10):
    pair = random.choice(pairs)
    print('source {}'.format(pair[0]))
    print('target {}'.format(pair[1]))
    output_words = evaluate(model, source, target, pair)
    output_sentence = ' '.join(output_words)
    print('predicted {}'.format(output_sentence))

source چو پیران چنان دید لشکر همه
target چو پیران چنان دید لشکر همه
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source خور و خواب و آرام بر دشت و کوه
target خور و خواب و آرام بر دشت و کوه
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source خردمند کز دور دریا بدید
target خردمند کز دور دریا بدید
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source بدو گفت سهراب کز مرد پیر
target بدو گفت سهراب کز مرد پیر
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source تهمتن چنین گفت با شهریار
target تهمتن چنین گفت با شهریار
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source سپهدار ایران ز پشت سپاه
target سپهدار ایران ز پشت سپاه
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source به جای زبونی و جای فریب
target به جای زبونی و جای فریب
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source دگر کو بدرویش بر مهربان
target دگر کو بدرویش بر مهربان
predicted چو گفت و و PAD PAD PAD PAD PAD PAD PAD <EOS>
source به پیلان گردون کش و گاومیش
target