In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import nltk
nltk.download('punkt')


import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /Users/badamosor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS
        
    def addSentence(self, sentence):
        
        regex = "[\u0618-\u061A|\u064B-\u0653]"

        tokenized_text = megasplit('', sentence)
        
        chars = []
        
        for i in range(len(tokenized_text)):
            if (tokenized_text[i] == ''):
                continue
            if (re.search(regex, tokenized_text[i])):
                continue
                
            if (re.search(regex, tokenized_text[i+1])):
                char = tokenized_text[i] + tokenized_text[i+1]
                chars.append(char)
                self.addWord(char)
                
            else:
                self.addWord(tokenized_text[i])
                                  
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
import codecs
import csv
import random
from sklearn.model_selection import train_test_split

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")   

    FILENAME_TRAIN = "char_basedTrain.data"
    FILENAME_TEST = "char_basedTest.data"   

    ENCODING = 'utf-8'
    train_pairs = []
    with codecs.open(FILENAME_TRAIN, "r", ENCODING) as fp:
      reader = csv.reader(fp)
      for rows in reader:
        train_pairs.append(rows)
        
    test_pairs = []
    with codecs.open(FILENAME_TEST, "r", ENCODING) as fp:
      reader = csv.reader(fp)
      for rows in reader:
        test_pairs.append(rows)
      
        
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    
    return input_lang, output_lang, train_pairs, test_pairs


In [5]:
def megasplit(pattern, string):
    splits = list((m.start(), m.end()) for m in re.finditer(pattern, string))
    starts = [0] + [i[1] for i in splits]
    ends = [i[0] for i in splits] + [len(string)]
    return [string[start:end] for start, end in zip(starts, ends)]

  
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, train_pairs, test_pairs = readLangs(lang1, lang2, reverse)
  
    print("Read %s training sentence pairs" % len(train_pairs))
    print("Counting words...")
    
    for pair in train_pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    
    ##!!!Adding new words in test to training words
    
    print("Read %s test sentence pairs" % len(test_pairs))
    print("Counting words...")
    for pair in test_pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    
    return input_lang, output_lang, train_pairs, test_pairs


input_lang, output_lang, train_pairs, test_pairs = prepareData('NoDiac', 'Diac', False)
print(random.choice(train_pairs))
print(random.choice(test_pairs))



Reading lines...
Read 12004 training sentence pairs
Counting words...
Counted words:
NoDiac 39
Diac 257
Read 1700 test sentence pairs
Counting words...
Counted words:
NoDiac 39
Diac 257
['الأعظم', 'الأَعظم']
['لعمري', 'لَعَمْرِي']


In [6]:
print([input_lang.word2index])
print([output_lang.word2index])


[{'ك': 2, 'ل': 3, 'م': 4, 'ا': 5, 'ت': 6, 'أ': 7, 'و': 8, 'ع': 9, 'ب': 10, 'ي': 11, 'ر': 12, 'ط': 13, 'ق': 14, 'ة': 15, 'خ': 16, 'ف': 17, 'ج': 18, 'ه': 19, 'ح': 20, 'د': 21, 'ؤ': 22, 'ن': 23, 'س': 24, 'ز': 25, 'ض': 26, 'ش': 27, 'ء': 28, 'ـ': 29, 'ص': 30, 'غ': 31, 'ذ': 32, 'ئ': 33, 'آ': 34, 'ظ': 35, 'ث': 36, 'إ': 37, 'ى': 38}]
[{'كَ': 2, 'لِ': 3, 'مَ': 4, 'ا': 5, 'تٌ': 6, 'أ': 7, 'و': 8, 'تَ': 9, 'عَ': 10, 'بِ': 11, 'ي': 12, 'رٌ': 13, 'مُ': 14, 'طَ': 15, 'قَ': 16, 'ةٌ': 17, 'خِ': 18, 'ل': 19, 'فُ': 20, 'جَ': 21, 'م': 22, 'ه': 23, 'ح': 24, 'دٌ': 25, 'بْ': 26, 'دُ': 27, 'ؤ': 28, 'ن': 29, 'ب': 30, 'ع': 31, 'د': 32, 'قُ': 33, 'س': 34, 'ف': 35, 'أَ': 36, 'مّ': 37, 'بَ': 38, 'هَ': 39, 'طُ': 40, 'حِ': 41, 'نُ': 42, 'كُ': 43, 'زِ': 44, 'لْ': 45, 'تُ': 46, 'لَ': 47, 'مْ': 48, 'رَ': 49, 'ضَ': 50, 'بّ': 51, 'ك': 52, 'يْ': 53, 'ضِ': 54, 'دّ': 55, 'ج': 56, 'ضُ': 57, 'ر': 58, 'دِ': 59, 'شِ': 60, 'ءُ': 61, 'رِ': 62, 'كْ': 63, 'ضّ': 64, 'عُ': 65, 'ـ': 66, 'شْ': 67, 'هِ': 68, 'سُ': 69, 'صَ': 70, 'غّ': 7

In [21]:

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [22]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [23]:

MAX_LENGTH = 50

In [24]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
       

In [25]:
def indexesFromSentence(lang, sentence):
    regex = "[\u0618-\u061A|\u064B-\u0653]"
    tokenized_text = megasplit('', sentence)
        
    chars = []
        
    for i in range(len(tokenized_text)):
        if (tokenized_text[i] == ''):
            continue
        if (re.search(regex, tokenized_text[i])):
            continue
                
        if (re.search(regex, tokenized_text[i+1])):
            char = tokenized_text[i] + tokenized_text[i+1]
            chars.append(lang.word2index[char])                
        else:
            chars.append(lang.word2index[tokenized_text[i]])

    return chars

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [26]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    #use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    use_teacher_forcing = False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            #decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            #decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [27]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [28]:
averageLoss = []
accuracies = []

In [36]:
lr_per_iter = []


def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    #training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
    #training_pairs = [tensorsFromPair(random.choice(train_pairs)) for i in range(n_iters)]
    
    criterion = nn.NLLLoss()
        
    for i in range(n_iters):
        print(i)
        numExample = 0
        for pairs in train_pairs:
            training_pair = tensorsFromPair(pairs)
            input_tensor = training_pair[0]
            target_tensor = training_pair[1]
            
            loss = train(input_tensor, target_tensor, encoder,decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss
            numExample += 1
            
            if (numExample % print_every == 0):
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                #print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         #iter, iter / n_iters * 100, print_loss_avg))
                print('%s %d %.4f %d' % (timeSince(start, numExample), numExample, print_loss_avg, i))
      
        accuracy = getAccuracy(encoder1, decoder)
        accuracies.append(accuracy)
        averageLoss.append(print_loss_total / print_every)
        print(('%.4f,%.4f') % ((print_loss_total / print_every),loss))
        print(accuracy)
      

Evaluation
==========



In [31]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
           # decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
           # decoder_attentions[di] = decoder_attention.data
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)            
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                #decoded_words.append('<EOS>')
                break
            else:
                #if (topi.item() not in output_lang.test_index2word):
                  #decoded_words.append("unk")
                #else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        #return decoded_words, decoder_attentions[:di + 1]
        return decoded_words

In [38]:
def getAccuracy(encoder, decoder, n=10):
    
    correct_sentence_pair = 0
    incorrect_sentence_pair = 0
    
    correct_word_pair = 0
    incorrect_word_pair = 0
        
    correct_diac = 0
    incorrect_diac = 0
    
    regex = "[\u0618-\u061A|\u064B-\u0653]"
 
    count = 0
    sentenceInput = []
    sentenceTarget = []
    sentenceOutput = []
    
    for pair in test_pairs:
        
        output_chars = evaluate(encoder, decoder, pair[0])
        output_word = ''.join(output_chars)
            
        if(count < 4):
            sentenceInput.append(pair[0])
            sentenceTarget.append(pair[1])
            sentenceOutput.append(output_word)
            count += 1
        else:
            #Accuracy on sentence basis
            
            if (sentenceTarget == sentenceOutput):
                correct_sentence_pair += 1
                #print('correct: ',sentenceTarget, sentenceOutput)

            else:
                incorrect_sentence_pair += 1
                #print('incorrect: ',sentenceTarget, sentenceOutput)

            sentenceInput = []
            sentenceTarget = []
            sentenceOutput = [] 
            
            sentenceInput.append(pair[0])
            sentenceTarget.append(pair[1])
            sentenceOutput.append(output_word)
            count = 1            
            
        
        #Accuracy on word basis
        if (pair[1] == output_word):
            if (re.search(regex, pair[1])):
                correct_diac += 1
                correct_word_pair += 1
        else:
            if (re.search(regex, pair[1])):
                incorrect_diac += 1
                incorrect_word_pair += 1     
  
    return((correct_word_pair/(correct_word_pair+incorrect_word_pair), correct_sentence_pair/(correct_sentence_pair+incorrect_sentence_pair)))  

In [33]:
def evaluateRandomly(encoder, decoder, n=10):
    
    correct_sentence_pair = 0
    incorrect_sentence_pair = 0
    
    correct_word_pair = 0
    incorrect_word_pair = 0
        
    correct_diac = 0
    incorrect_diac = 0
    
    regex = "[\u0618-\u061A|\u064B-\u0653]"
 
    count = 0
    sentenceInput = []
    sentenceTarget = []
    sentenceOutput = []
    
    for pair in test_pairs:
                    
        output_chars = evaluate(encoder, decoder, pair[0])
        output_word = ''.join(output_chars)
            
        if(count < 4):
            sentenceInput.append(pair[0])
            sentenceTarget.append(pair[1])
            sentenceOutput.append(output_word)
            count += 1
        else:
            #Accuracy on sentence basis
            
            if (sentenceTarget == sentenceOutput):
                correct_sentence_pair += 1
                print('correct: ',sentenceTarget, sentenceOutput)


            else:
                incorrect_sentence_pair += 1
                print('incorrect: ',sentenceTarget, sentenceOutput)

            sentenceInput = []
            sentenceTarget = []
            sentenceOutput = [] 
            
            sentenceInput.append(pair[0])
            sentenceTarget.append(pair[1])
            sentenceOutput.append(output_word)
            count = 1            
            
        
        #Accuracy on word basis
        if (pair[1] == output_word):
            if (re.search(regex, pair[1])):
                correct_diac += 1
                correct_word_pair += 1
        else:
            if (re.search(regex, pair[1])):
                incorrect_diac += 1
                incorrect_word_pair += 1     
  
    print("accuracy of words:",correct_word_pair/(correct_word_pair+incorrect_word_pair),correct_word_pair,incorrect_word_pair) 
    print("percentage correct diacritic:",correct_diac/correct_word_pair)
    print("Number of all words: ",correct_word_pair+correct_word_pair)
    
    print("accuracy of sentence:",correct_sentence_pair/(correct_sentence_pair+incorrect_sentence_pair),correct_sentence_pair,incorrect_sentence_pair) 
    print("Number of all sentences: ", correct_sentence_pair+incorrect_sentence_pair)

In [34]:
print("input:", input_lang.n_words)
print("train size", len(train_pairs))

input: 39
train size 12004


In [42]:
hidden_size = 256

encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
#attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)
#print("Embeddings:", encoder1.embedding)

#trainIters(encoder1, attn_decoder1, 100, print_every=500)
#trainIters(encoder1, decoder1, 1000, print_every=500)
trainIters(encoder1, decoder1, 100, print_every=1000)

0
0m 17s (- -1m 42s) 1000 2.8476 0
0m 36s (- -1m 23s) 2000 2.7635 0
0m 55s (- -1m 4s) 3000 2.3606 0
1m 16s (- -2m 43s) 4000 2.2564 0
1m 36s (- -2m 23s) 5000 1.9955 0
1m 56s (- -2m 3s) 6000 1.9438 0
2m 16s (- -3m 43s) 7000 1.8275 0
2m 37s (- -3m 22s) 8000 1.6953 0
2m 58s (- -3m 1s) 9000 1.5690 0
3m 20s (- -4m 39s) 10000 1.5394 0
3m 40s (- -4m 19s) 11000 1.4749 0
4m 1s (- -5m 58s) 12000 1.3835 0
0.0046,2.2705
(0.10880458124552612, 0.0)
1
4m 27s (- -5m 32s) 1000 1.5295 1
4m 48s (- -5m 11s) 2000 1.3457 1
5m 8s (- -6m 51s) 3000 1.1750 1
5m 29s (- -6m 30s) 4000 1.2420 1
5m 50s (- -6m 9s) 5000 1.1265 1
6m 11s (- -7m 48s) 6000 1.1746 1
6m 32s (- -7m 27s) 7000 1.1374 1
6m 53s (- -7m 6s) 8000 1.1065 1
7m 14s (- -8m 45s) 9000 1.0392 1
7m 34s (- -8m 25s) 10000 1.0838 1
7m 55s (- -8m 4s) 11000 1.0257 1
8m 16s (- -9m 43s) 12000 0.9993 1
0.0034,1.6693
(0.14387974230493916, 0.0)
2
8m 42s (- -9m 17s) 1000 1.1292 2
9m 3s (- -10m 56s) 2000 0.9864 2
9m 24s (- -10m 35s) 3000 0.8898 2
9m 45s (- -10m 14s) 40

73m 48s (- -74m 13s) 3000 0.3265 17
74m 8s (- -75m 52s) 4000 0.3894 17
74m 29s (- -75m 31s) 5000 0.3529 17
74m 51s (- -75m 9s) 6000 0.3878 17
75m 11s (- -76m 49s) 7000 0.3700 17
75m 32s (- -76m 27s) 8000 0.3478 17
75m 54s (- -76m 6s) 9000 0.3801 17
76m 14s (- -77m 45s) 10000 0.3711 17
76m 35s (- -77m 24s) 11000 0.3612 17
76m 56s (- -77m 3s) 12000 0.3561 17
0.0003,0.1405
(0.2984967788117394, 0.030660377358490566)
18
77m 23s (- -78m 41s) 1000 0.4099 18
77m 44s (- -78m 18s) 2000 0.3642 18
78m 5s (- -79m 56s) 3000 0.3260 18
78m 26s (- -79m 34s) 4000 0.3813 18
78m 47s (- -79m 13s) 5000 0.3343 18
79m 8s (- -80m 51s) 6000 0.3912 18
79m 29s (- -80m 31s) 7000 0.3516 18
79m 50s (- -80m 9s) 8000 0.3575 18
80m 11s (- -81m 48s) 9000 0.3588 18
80m 32s (- -81m 28s) 10000 0.3489 18
80m 52s (- -81m 7s) 11000 0.3405 18
81m 13s (- -82m 46s) 12000 0.3359 18
0.0005,0.1901
(0.31496062992125984, 0.03537735849056604)
19
81m 40s (- -82m 24s) 1000 0.3959 19
82m 1s (- -82m 1s) 2000 0.3455 19
82m 22s (- -83m 39s)

143m 34s (- -144m 26s) 6000 0.2404 33
143m 55s (- -144m 5s) 7000 0.2201 33
144m 16s (- -145m 44s) 8000 0.2011 33
144m 38s (- -145m 22s) 9000 0.1968 33
144m 58s (- -145m 2s) 10000 0.2106 33
145m 19s (- -146m 41s) 11000 0.1735 33
145m 40s (- -146m 20s) 12000 0.1865 33
0.0001,0.0390
(0.4101646385110952, 0.0589622641509434)
34
146m 7s (- -146m 1s) 1000 0.2140 34
146m 28s (- -147m 35s) 2000 0.1938 34
146m 49s (- -147m 13s) 3000 0.1808 34
147m 9s (- -148m 52s) 4000 0.2174 34
147m 30s (- -148m 31s) 5000 0.2231 34
147m 51s (- -148m 9s) 6000 0.2338 34
148m 12s (- -149m 48s) 7000 0.2163 34
148m 34s (- -149m 26s) 8000 0.2037 34
148m 55s (- -149m 5s) 9000 0.1817 34
149m 16s (- -150m 44s) 10000 0.2070 34
149m 37s (- -150m 23s) 11000 0.1642 34
149m 57s (- -150m 2s) 12000 0.1911 34
0.0001,0.0807
(0.4108804581245526, 0.06132075471698113)
35
150m 24s (- -151m 44s) 1000 0.2190 35
150m 45s (- -151m 18s) 2000 0.1924 35
151m 6s (- -152m 56s) 3000 0.1756 35
151m 27s (- -152m 34s) 4000 0.2174 35
151m 48s (- 

212m 25s (- -213m 36s) 6000 0.1817 49
212m 46s (- -213m 15s) 7000 0.1566 49
213m 8s (- -214m 53s) 8000 0.1421 49
213m 28s (- -214m 32s) 9000 0.1349 49
213m 49s (- -214m 11s) 10000 0.1575 49
214m 10s (- -215m 50s) 11000 0.1236 49
214m 31s (- -215m 29s) 12000 0.1340 49
0.0001,0.0226
(0.4574087329992842, 0.08962264150943396)
50
214m 58s (- -215m 14s) 1000 0.1550 50
215m 19s (- -216m 46s) 2000 0.1400 50
215m 40s (- -216m 23s) 3000 0.1347 50
216m 1s (- -216m 1s) 4000 0.1614 50
216m 22s (- -217m 40s) 5000 0.1748 50
216m 43s (- -217m 18s) 6000 0.1808 50
217m 4s (- -218m 57s) 7000 0.1550 50
217m 26s (- -218m 35s) 8000 0.1405 50
217m 47s (- -218m 13s) 9000 0.1358 50
218m 8s (- -219m 52s) 10000 0.1555 50
218m 29s (- -219m 31s) 11000 0.1253 50
218m 50s (- -219m 10s) 12000 0.1354 50
0.0001,0.0177
(0.4523979957050823, 0.08962264150943396)
51
219m 17s (- -220m 55s) 1000 0.1543 51
219m 38s (- -220m 27s) 2000 0.1380 51
219m 59s (- -220m 4s) 3000 0.1333 51
220m 20s (- -221m 42s) 4000 0.1592 51
220m 41s

280m 57s (- -281m 5s) 5000 0.1652 65
281m 18s (- -282m 44s) 6000 0.1892 65
281m 39s (- -282m 22s) 7000 0.1529 65
282m 0s (- -282m 1s) 8000 0.1386 65
282m 22s (- -283m 39s) 9000 0.1345 65
282m 42s (- -283m 18s) 10000 0.1608 65
283m 4s (- -284m 57s) 11000 0.1226 65
283m 24s (- -284m 36s) 12000 0.1394 65
0.0000,0.0146
(0.44738725841088045, 0.07311320754716981)
66
283m 51s (- -284m 25s) 1000 0.1579 66
284m 13s (- -285m 55s) 2000 0.1374 66
284m 34s (- -285m 31s) 3000 0.1322 66
284m 54s (- -285m 9s) 4000 0.1637 66
285m 15s (- -286m 47s) 5000 0.1651 66
285m 36s (- -286m 26s) 6000 0.1899 66
285m 57s (- -286m 4s) 7000 0.1551 66
286m 19s (- -287m 42s) 8000 0.1404 66
286m 40s (- -287m 21s) 9000 0.1287 66
287m 0s (- -287m 0s) 10000 0.1572 66
287m 21s (- -288m 39s) 11000 0.1221 66
287m 42s (- -288m 18s) 12000 0.1380 66
0.0000,0.0171
(0.4502505368647101, 0.08018867924528301)
67
288m 9s (- -288m 7s) 1000 0.1574 67
288m 30s (- -289m 37s) 2000 0.1408 67
288m 51s (- -289m 14s) 3000 0.1309 67
289m 12s (-

349m 14s (- -350m 51s) 4000 0.4860 81
349m 34s (- -350m 29s) 5000 0.4201 81
349m 55s (- -350m 7s) 6000 0.4283 81
350m 16s (- -351m 46s) 7000 0.4476 81
350m 37s (- -351m 25s) 8000 0.4239 81
350m 58s (- -351m 3s) 9000 0.4567 81
351m 19s (- -352m 42s) 10000 0.4243 81
351m 40s (- -352m 21s) 11000 0.4236 81
352m 1s (- -352m 0s) 12000 0.4007 81
0.0007,0.6536
(0.33070866141732286, 0.02830188679245283)
82
352m 28s (- -353m 52s) 1000 0.4556 82
352m 49s (- -353m 21s) 2000 0.4194 82
353m 10s (- -354m 56s) 3000 0.3638 82
353m 31s (- -354m 33s) 4000 0.4387 82
353m 52s (- -354m 11s) 5000 0.4173 82
354m 13s (- -355m 49s) 6000 0.4224 82
354m 34s (- -355m 28s) 7000 0.4054 82
354m 55s (- -355m 7s) 8000 0.3973 82
355m 16s (- -356m 45s) 9000 0.3935 82
355m 37s (- -356m 24s) 10000 0.4039 82
355m 58s (- -356m 3s) 11000 0.3744 82
356m 19s (- -357m 42s) 12000 0.3851 82
0.0003,0.1866
(0.3278453829634932, 0.02358490566037736)
83
356m 46s (- -357m 35s) 1000 0.4337 83
357m 7s (- -357m 3s) 2000 0.3976 83
357m 28s 

418m 24s (- -419m 41s) 4000 0.2129 97
418m 45s (- -419m 19s) 5000 0.2102 97
419m 7s (- -420m 57s) 6000 0.2306 97
419m 27s (- -420m 35s) 7000 0.2106 97
419m 48s (- -420m 14s) 8000 0.1948 97
420m 9s (- -421m 52s) 9000 0.1794 97
420m 30s (- -421m 31s) 10000 0.1924 97
420m 51s (- -421m 10s) 11000 0.1716 97
421m 12s (- -422m 49s) 12000 0.1818 97
0.0000,0.0207
(0.4652827487473157, 0.10613207547169812)
98
421m 39s (- -422m 45s) 1000 0.2132 98
422m 0s (- -422m 11s) 2000 0.1788 98
422m 21s (- -423m 46s) 3000 0.1721 98
422m 42s (- -423m 23s) 4000 0.2052 98
423m 3s (- -423m 1s) 5000 0.2025 98
423m 25s (- -424m 38s) 6000 0.2213 98
423m 45s (- -424m 17s) 7000 0.2007 98
424m 7s (- -425m 55s) 8000 0.1792 98
424m 28s (- -425m 33s) 9000 0.1641 98
424m 49s (- -425m 13s) 10000 0.1816 98
425m 10s (- -426m 51s) 11000 0.1580 98
425m 31s (- -426m 30s) 12000 0.1733 98
0.0000,0.0127
(0.4767358625626342, 0.10141509433962265)
99
425m 58s (- -426m 27s) 1000 0.2075 99
426m 19s (- -427m 53s) 2000 0.1732 99
426m 40s

In [43]:
print(averageLoss)


[0.004524723070008414, 0.003046418326241629, 0.004640259070055826, 0.0033863210933549066, 0.002989165073349362, 0.0029475689899353757, 0.002555003109432402, 0.0024455779847644627, 0.0022855837174824306, 0.0018724803413663592, 0.0017464915400459654, 0.001662321959223066, 0.001260582378932408, 0.0018137335720516386, 0.0008216958386557442, 0.000743340600104559, 0.000557165849776495, 0.0007193632296153477, 0.000942945576849438, 0.00026623263813200447, 0.00045018343130747476, 0.0003278510627292451, 0.00037607145877111524, 0.0003091663462775094, 0.00040567541122436525, 0.0003827231895355951, 0.00021660184860229493, 0.000392386828150068, 0.00012174491655258907, 0.00019474859464736213, 0.0001984676406497047, 0.0002039830457596552, 0.0002533365601585025, 0.00019486017454238165, 8.229967526027134e-05, 9.496303967067174e-05, 0.00013660386630467007, 0.00030855251493908114, 7.575352986653645e-05, 7.17220874059768e-05, 9.694730667840868e-05, 5.67348911648705e-05, 0.00011043399856204079, 6.0792650495

In [44]:
print(accuracies)

[(0.11095204008589836, 0.0), (0.15175375805297064, 0.0023584905660377358), (0.10880458124552612, 0.0), (0.14387974230493916, 0.0), (0.16678596993557623, 0.01179245283018868), (0.19040801717967074, 0.009433962264150943), (0.19756621331424482, 0.01179245283018868), (0.2125984251968504, 0.01179245283018868), (0.20830350751610593, 0.01179245283018868), (0.22763063707945597, 0.01179245283018868), (0.23335719398711524, 0.02358490566037736), (0.23693629205440228, 0.014150943396226415), (0.2612741589119542, 0.018867924528301886), (0.2634216177523264, 0.014150943396226415), (0.2691481746599857, 0.01650943396226415), (0.27415891195418757, 0.02122641509433962), (0.2727272727272727, 0.02358490566037736), (0.28274874731567645, 0.01650943396226415), (0.2984967788117394, 0.030660377358490566), (0.2984967788117394, 0.030660377358490566), (0.31496062992125984, 0.03537735849056604), (0.30565497494631355, 0.02358490566037736), (0.2927702219040802, 0.02358490566037736), (0.3192555476020043, 0.037735849056

In [45]:
evaluateRandomly(encoder1, decoder1)

incorrect:  ['الخطابي', 'أَن', 'حَبَّةَ', 'العُرَنيَّ'] ['الخطابي', 'أَن', 'حَبّةَ', 'العُرَنيّ']
incorrect:  ['أَرض', 'في', 'أَنه', 'مؤَنث'] ['أَرض', 'في', 'أَنه', 'مُؤَنّثٌ']
incorrect:  ['على', 'أَحَرَّ', 'كأَنه', 'أَراد'] ['على', 'أَحَرّ', 'كأَنه', 'أَراد']
incorrect:  ['حَرَّ', 'العبدُ', 'يَحَرُّ', 'حَرارَةً'] ['حَرّ', 'العبدُ', 'يَحِرّ', 'حَرارةً']
incorrect:  ['إِلى', 'أَهله', 'أَي', 'صاحبه'] ['إِلى', 'أَهلُه', 'أَي', 'صاحبَه']
incorrect:  ['عليها', 'كُلُّ', 'بِكْرٍ', 'حُرَّةٍ'] ['عليها', 'كلّ', 'بِكْرٌ', 'حَرّةٌ']
incorrect:  ['من', 'الحُسن', 'إِلا', 'أَن'] ['من', 'الحسْنِ', 'إِلا', 'أَن']
incorrect:  ['ويقال', 'ساقُ', 'حُرٍّ', 'صَوْتُ'] ['ويقال', 'ساق', 'حَرّ', 'صَوْتِ']
incorrect:  ['سَاقَ', 'حُرٍّ', 'إِن', 'كان'] ['ساق', 'حَرّ', 'إِنّ', 'كان']
incorrect:  ['عَرَفْتُ', 'من', 'ضَرْب', 'الحَرِيرِ'] ['عَرفْت', 'من', 'ضرْب', 'الحَريرِ']
incorrect:  ['للضأْن', 'وفي', 'المحكَم', 'وحَرِّ'] ['للضأْن', 'وفي', 'المحكم', 'وحَرّ']
incorrect:  ['فقال', 'القَبِيلة', 'إِلا', 'أَنهم'] ['فقال

incorrect:  ['الله', 'عَمْرَك', 'وعُمْرَك', 'وإِن'] ['اللهُ', 'عمْرك', 'وعُمْرَكَ', 'وإِن']
incorrect:  ['اللهُ', 'وعَمَرَه', 'أَبقاه', 'وعَمَّرَ'] ['اللهُ', 'وعُمّرُهُ', 'أَبْقاه', 'وعُمْرٌ']
incorrect:  ['نَفْسَه', 'قدَّر', 'لها', 'قدْراً'] ['نَفْسُه', 'قدّرَ', 'لها', 'قِدَراً']
incorrect:  ['يُعَمَّرُ', 'مِن', 'مُعَمَّرٍ', 'ولا'] ['يَعمرُ', 'من', 'مَعْمَرٍ', 'ولا']
incorrect:  ['يُنْقَص', 'من', 'عُمُرِه', 'إِلا'] ['يُنْقَص', 'من', 'عُمْرِه', 'إِلا']
incorrect:  ['يُطَوَّلُ', 'مِن', 'عُمُرِ', 'مُعَمَّر'] ['يُطَوّلُ', 'من', 'عمر', 'مَعْمَرٍ']
incorrect:  ['مُعَمَّرٍ', 'ولا', 'يُنْقَص', 'مِن'] ['مَعْمَرٍ', 'ولا', 'يُنْقَص', 'من']
incorrect:  ['عُمُرِه', 'يقول', 'إِذا', 'أَتى'] ['عُمْرِه', 'يقول', 'إِذا', 'أَتى']
incorrect:  ['وكأَن', 'الأَول', 'أَشبه', 'بالصواب'] ['وكأَنّ', 'الأَوّل', 'أَشبه', 'بالصواب']
incorrect:  ['للرجل', 'طولَ', 'عُمُرِك', 'أَو'] ['للرجل', 'طولَ', 'عمْرك', 'أَو']
correct:  ['عُمُرِهوقال', 'ثعلب', 'العُمْرَى', 'أَن'] ['عُمُرِهوقال', 'ثعلب', 'العُمْرَى', 'أَن']
inco

correct:  ['يَبْنِيَ', 'الرجلُ', 'بامرأَته', 'في'] ['يَبْنِيَ', 'الرجلُ', 'بامرأَته', 'في']
incorrect:  ['أَهلها', 'فإِن', 'نقلها', 'إِلى'] ['أَهْلها', 'فإِن', 'نقلها', 'إِلى']
incorrect:  ['رَيْحانٍ', 'الطَّيِّب', 'الثناء', 'الطَّيِّب'] ['رَيْحانٍ', 'الطّيّب', 'الثّناء', 'الطّيّب']
incorrect:  ['وهو', 'الآسوالعِمَارة', 'والعَمارة', 'التحيّة'] ['وهو', 'الآسوالعِمَارة', 'والعَمارة', 'التّحيّة']
incorrect:  ['عَمّرَك', 'اللهُ', 'وحيّاك', 'وليس'] ['عمْرك', 'اللهُ', 'وحيّاك', 'وليس']
incorrect:  ['للحُرّة', 'خِمار', 'ولا', 'صَوْقَعة'] ['للحُرّة', 'خِمار', 'ولا', 'صَوْقَعَة']
incorrect:  ['قامَتْ', 'تُصَلّي', 'والخِمارُ', 'مِن'] ['قامَتْ', 'تُصَلّي', 'والخِمارُ', 'من']
incorrect:  ['عَمَر', 'ربَّه', 'عبَدَه', 'وإِنه'] ['عمر', 'ربّه', 'عبَدَه', 'وإِنه']
incorrect:  ['لعَامِرٌ', 'لربّه', 'أَي', 'عابدٌوحكى'] ['لعامِر', 'لربّه', 'أَي', 'عابدٌوحكى']
incorrect:  ['يَعمرُ', 'ربَّه', 'أَي', 'يعبده'] ['يَعمرُ', 'ربّه', 'أَي', 'يعبده']
incorrect:  ['أَمره', 'الثَّخينُ', 'الوَرَعِ', 'مأْخوذ'] ['أَمْره

incorrect:  ['الهذلي', 'لعلكمُ', 'لَمَّا', 'قُتِلْتُم'] ['الهذلي', 'لعلكم', 'لمّا', 'قُتِلْتُم']
correct:  ['ذَكَرْتم', 'ولن', 'تَتْركُوا', 'أَن'] ['ذَكَرْتم', 'ولن', 'تَتْركُوا', 'أَن']
incorrect:  ['تَقْتُلوا', 'مَن', 'تَعَمَّرا', 'قيل'] ['تَقْتُلوا', 'من', 'تَعَمّرا', 'قيل']
incorrect:  ['جَمّعوا', 'لغدٍ', 'شَمْلَكم', 'لك'] ['جَمّعوا', 'لغدٍ', 'شَمْلَكم', 'لَكَ']
incorrect:  ['الأُمُّ', 'مما', 'باليَعامِير', 'والأَبُ'] ['الأُمّ', 'ممّا', 'باليَعامِيرُ', 'والأَبُ']
incorrect:  ['عُمَيْر', 'كنية', 'الفَرْجوأُمُّ', 'عَمْرو'] ['عَمِيْر', 'كُنْية', 'الفَرْجوأُمّ', 'عمْرٌو']
correct:  ['أَبْشِري', 'بالبُشْرَى', 'مَوْتٌ', 'ذَرِيعٌ'] ['أَبْشِري', 'بالبُشْرَى', 'مَوْتٌ', 'ذَرِيعٌ']
incorrect:  ['لا', 'تَقْبِرُوني', 'إِنّ', 'قَبْرِي'] ['لا', 'تَقْبِرُوني', 'إِنّ', 'قَبْرِيّ']
incorrect:  ['وكَمْ', 'مِن', 'وِجارٍ', 'كجَيْبِ'] ['وكَمْ', 'من', 'وِجارٍ', 'كجَيْبِ']
incorrect:  ['فُرْعُلُ', 'ومن', 'أَمثالهم', 'خامِرِي'] ['فُرْعُلُ', 'ومِنْ', 'أَمثالهم', 'خامِرِي']
incorrect:  ['أُمَّ', 'عامر', 'أَ

incorrect:  ['بِقَومٍ', 'حَلَّ', 'بِهِم', 'البلاءُ'] ['بِقَومٍ', 'حَلّ', 'بِهم', 'البلاءُ']
incorrect:  ['من', 'القَتْلِ', 'ابنِ', 'عُمارَةَ'] ['من', 'القتلُ', 'ابن', 'عِمارةً']
incorrect:  ['كثُمامَةٍ', 'بأرضِ', 'فارِسَ', 'ماءٌ'] ['كثُمامَةٍ', 'بأَرْضٍ', 'فارِسُ', 'ماءُ']
incorrect:  ['قُطْرُبٍ', 'وخُطِّئَ', 'وأمُّ', 'عَمْرٍو'] ['قطْرُب', 'وخُطّئَ', 'وأُمّ', 'عمْرٌو']
incorrect:  ['وأمُّ', 'عامِرٍ', 'الضَّبُعُ', 'والعامِرُ'] ['وأُمّ', 'عامرٌ', 'الضّبُع', 'والعامِرُ']
incorrect:  ['جِرْوُها', 'والعَمَّارُ', 'الكثيرُ', 'الصلاةِ'] ['جِرْوُها', 'والعَمّار', 'الكثيرُ', 'الصلاةِ']
incorrect:  ['والصيامِ', 'والقَوِيُّ', 'الإِيمانِ', 'الثابتُ'] ['والصيامِ', 'والقَوِيّ', 'الإِيمان', 'الثابتُ']
incorrect:  ['في', 'أمرِهِ', 'والطَّيِّبُ', 'الثَّناءِ'] ['في', 'أَمْره', 'والطّيّب', 'الثّناء']
incorrect:  ['والطَّيِّبُ', 'الرَّوائِحِ', 'والمُجْتَمِعُ', 'الأمر'] ['والطّيّب', 'الرّوائِحِ', 'والمُجْتَمِعُ', 'الأَمر']
correct:  ['اللازمُ', 'للجماعةِ', 'الحَدِبُ', 'على'] ['اللازمُ', 'للجماعةِ', 'الحَدِب