In [43]:
%matplotlib inline

In [4]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import nltk
nltk.download('punkt')


import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /Users/badamosor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS
        
    def addSentence(self, sentence):
        #print("input:", sentence)
        #tokenized_text = nltk.word_tokenize(sentence)
        tokenized_text = re.split('', sentence)

        #print("tokenized:", tokenized_text)
        for word in tokenized_text:
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [6]:
import codecs
import csv
import random
from sklearn.model_selection import train_test_split

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    #lines = open('%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
#     lines = open('arabic.txt', encoding='utf-8')
#         read().strip().split('\n')
#     print (lines)
    

    #FILENAME = "arabic_words.csv"
    FILENAME_TRAIN = "result.data"
    #FILENAME_TEST = "test_data_5000_2112.csv"

    ENCODING = 'utf-8'
    pairs = []
    with codecs.open(FILENAME_TRAIN, "r", ENCODING) as fp:
      reader = csv.reader(fp)
      for rows in reader:
        pairs.append(rows)
        
#     test_pairs = []
#     with codecs.open(FILENAME_TEST, "r", ENCODING) as fp:
#       reader = csv.reader(fp)
#       for rows in reader:
#         test_pairs.append(rows)
      
        
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    
    #Split into train and test
   
    #pairs = pairs[0:100]
    train_pairs, test_pairs = train_test_split(pairs, test_size=0.2, random_state=42)
    
    return input_lang, output_lang, train_pairs, test_pairs
#     # Split every line into pairs and normalize
#     pairs = [[normalizeString(s) for s in l.split(',')] for l in lines]

#     # Reverse pairs, make Lang instances
#     if reverse:
#         pairs = [list(reversed(p)) for p in pairs]
#         input_lang = Lang(lang2)
#         output_lang = Lang(lang1)
#     else:
#         input_lang = Lang(lang1)
#         output_lang = Lang(lang2)

        

Since there are a *lot* of example sentences and we want to train
something quickly, we'll trim the data set to only relatively short and
simple sentences. Here the maximum length is 10 words (that includes
ending punctuation) and we're filtering to sentences that translate to
the form "I am" or "He is" etc. (accounting for apostrophes replaced
earlier).




The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Normalize text, filter by length and content
-  Make word lists from sentences in pairs




In [7]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, train_pairs, test_pairs = readLangs(lang1, lang2, reverse)
  
    print("Read %s training sentence pairs" % len(train_pairs))
  
    #pairs = filterPairs(pairs)
    #print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in train_pairs:
        #print("printing pairs:", pair[0], pair[1])
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    
    ##!!!Adding new words in test to training words
    
    print("Read %s test sentence pairs" % len(test_pairs))
    print("Counting words...")
    for pair in test_pairs:
        #print("printing pairs:", pair[0], pair[1])
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    
    return input_lang, output_lang, train_pairs, test_pairs


input_lang, output_lang, train_pairs, test_pairs = prepareData('NoDiac', 'Diac', False)
print(random.choice(train_pairs))
print(random.choice(test_pairs))



Reading lines...
Read 3196 training sentence pairs
Counting words...
Counted words:
NoDiac 40
Diac 48
Read 800 test sentence pairs
Counting words...
Counted words:
NoDiac 40
Diac 48
['العربي', 'العَرَبِيِّ']
['بن', 'بنِ']


In [8]:
#print(random.choice([input_lang.word2index.keys()]))
#print(random.choice([output_lang.word2index.keys()]))
print([input_lang.word2index])
print([output_lang.word2index])


[{'': 2, 'ب': 3, 'و': 4, 'ز': 5, 'ن': 6, 'ر': 7, 'خ': 8, 'ع': 9, 'ي': 10, 'ت': 11, 'ج': 12, 'ا': 13, 'ل': 14, 'ذ': 15, 'ك': 16, 'ض': 17, 'ح': 18, 'ة': 19, 'ه': 20, 'إ': 21, 'ق': 22, 'ؤ': 23, 'م': 24, 'أ': 25, 'س': 26, 'د': 27, 'ف': 28, 'ث': 29, 'ش': 30, 'ئ': 31, 'ط': 32, 'ص': 33, 'ء': 34, 'ى': 35, 'ظ': 36, 'غ': 37, 'آ': 38, 'ـ': 39}]
[{'': 2, 'ب': 3, 'و': 4, 'ز': 5, 'ن': 6, 'ر': 7, 'ِ': 8, 'خ': 9, 'ْ': 10, 'ٌ': 11, 'ع': 12, 'ي': 13, 'ت': 14, 'ُ': 15, 'َ': 16, 'ج': 17, 'ا': 18, 'ل': 19, 'ذ': 20, 'ك': 21, 'ض': 22, 'ّ': 23, 'ح': 24, 'ة': 25, 'ه': 26, 'إ': 27, 'ق': 28, 'ؤ': 29, 'م': 30, 'ً': 31, 'أ': 32, 'س': 33, 'د': 34, 'ف': 35, 'ث': 36, 'ٍ': 37, 'ش': 38, 'ئ': 39, 'ط': 40, 'ص': 41, 'ء': 42, 'ى': 43, 'ظ': 44, 'غ': 45, 'آ': 46, 'ـ': 47}]


In [9]:

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [10]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [11]:
MAX_LENGTH = 20

In [12]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
       

In [13]:
def indexesFromSentence(lang, sentence):
      return [lang.word2index[word] for word in re.split('', sentence)]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    #print("Pair is:",pair[0],pair[1])
    #print("Tensors:",input_tensor, target_tensor)
    return (input_tensor, target_tensor)

In [30]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
   # print("input_length",input_length,"target_length:",target_length)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    #print(encoder_outputs.shape)

    loss = 0
    

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    #use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    use_teacher_forcing = False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            #decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            #decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

This is a helper function to print time elapsed and estimated time
remaining given the current time and progress %.




In [31]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

The whole training process looks like this:

-  Start a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting

Then we call ``train`` many times and occasionally print the progress (%
of examples, time so far, estimated time) and average loss.




In [32]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    #training_pairs = [tensorsFromPair(random.choice(pairs))
     #                 for i in range(n_iters)]
    #training_pairs = [tensorsFromPair(random.choice(train_pairs))
                    #  for i in range(n_iters)]
    
    criterion = nn.NLLLoss()
    

    
    for i in range(n_iters):
        print ("Iteration: ", i)
        numExample = 0
        for pairs in train_pairs:
            training_pair = tensorsFromPair(pairs)
            input_tensor = training_pair[0]
            target_tensor = training_pair[1]

            loss = train(input_tensor, target_tensor, encoder,decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss
     #    if iter % print_every == 0:
            
            numExample += 1
            if (numExample % print_every == 0): 
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
            #print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         #iter, iter / n_iters * 100, print_loss_avg))
                print('%s %d %.4f' % (timeSince(start, numExample), numExample, print_loss_avg))
            
#         if iter % plot_every == 0:
#             plot_loss_avg = plot_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_total = 0

#     #print ("got here")
#     showPlot(plot_losses)

Plotting results
----------------

Plotting is done with matplotlib, using the array of loss values
``plot_losses`` saved while training.




In [33]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




In [27]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
           # decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
           # decoder_attentions[di] = decoder_attention.data
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)            
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                #decoded_words.append('<EOS>')
                break
            else:
                #if (topi.item() not in output_lang.test_index2word):
                  #decoded_words.append("unk")
                #else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        #return decoded_words, decoder_attentions[:di + 1]
        return decoded_words

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:




In [38]:
def evaluateRandomly(encoder, decoder, n=10):
    correct_word_pair = 0
    incorrect_word_pair = 0
    
    correct_sentence_pair = 0
    incorrect_sentence_pair = 0
    
    for pair in test_pairs:
      
        #pair = random.choice(pairs)
        #pair = random.choice(test_pairs)
        print('>', pair[0])
        print('=', pair[1])
        #output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ''.join(output_words)
        print('<', output_sentence)
        print('')
        
        #Accuracy on sentence basis
        if (pair[1] == output_sentence):
            correct_sentence_pair += 1
        else:
            incorrect_sentence_pair += 1
            
        #Accuracy on word basis
        #tokenized_target = nltk.word_tokenize(pair[1])
        tokenized_target = re.split('', pair[1])

        
        for target, output in zip(tokenized_target, output_words):
          if (target == output):
            correct_word_pair += 1
          else:
            incorrect_word_pair += 1
            print("Incorrect word pair:")
            print(target, output)
                
    print("accuracy of sentences:",correct_sentence_pair/(correct_sentence_pair+incorrect_sentence_pair),correct_sentence_pair,incorrect_sentence_pair) 
    print("accuracy of words:",correct_word_pair/(correct_word_pair+incorrect_word_pair),correct_word_pair,incorrect_sentence_pair) 

In [21]:
print("input:", input_lang.n_words)
print("train size", len(train_pairs))

input: 40
train size 3196


In [40]:
hidden_size = 256

encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
#attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)
#print("Embeddings:", encoder1.embedding)

#trainIters(encoder1, attn_decoder1, 75000, print_every=5000)
#trainIters(encoder1, decoder1, 1000, print_every=500)
trainIters(encoder1, decoder1, 10, print_every=500)

Iteration:  0
0m 12s (- -1m 47s) 500 2.1020
0m 24s (- -1m 35s) 1000 2.0321
0m 37s (- -1m 22s) 1500 1.9857
0m 54s (- -1m 5s) 2000 1.9013
1m 7s (- -2m 52s) 2500 1.7234
1m 20s (- -2m 39s) 3000 1.6307
Iteration:  1
1m 38s (- -2m 21s) 500 2.1410
1m 52s (- -2m 7s) 1000 1.4471
2m 5s (- -3m 54s) 1500 1.3655
2m 19s (- -3m 41s) 2000 1.3361
2m 32s (- -3m 27s) 2500 1.2627
2m 45s (- -3m 14s) 3000 1.2238
Iteration:  2
3m 3s (- -4m 56s) 500 1.6695
3m 17s (- -4m 42s) 1000 1.1677
3m 31s (- -4m 29s) 1500 1.1310
3m 44s (- -4m 15s) 2000 1.1095
3m 58s (- -4m 2s) 2500 1.0511
4m 11s (- -5m 48s) 3000 1.0721
Iteration:  3
4m 29s (- -5m 30s) 500 1.4368
4m 43s (- -5m 16s) 1000 1.0378
4m 57s (- -5m 2s) 1500 0.9905
5m 10s (- -6m 49s) 2000 0.9898
5m 24s (- -6m 36s) 2500 0.9653
5m 37s (- -6m 22s) 3000 0.9628
Iteration:  4
5m 55s (- -6m 5s) 500 1.3094
6m 9s (- -7m 51s) 1000 0.9121
6m 22s (- -7m 38s) 1500 0.9022
6m 35s (- -7m 24s) 2000 0.9121
6m 48s (- -7m 11s) 2500 0.9036
7m 4s (- -8m 55s) 3000 0.8795
Iteration:  5
7

In [41]:
evaluateRandomly(encoder1, decoder1)

> في
= في
< في

> وأداته
= وأَداتُه
< وأَاات

Incorrect word pair:
د ا
Incorrect word pair:
ُ 
> قالت
= قَالَتِ
< قالت

Incorrect word pair:
َ ا
Incorrect word pair:
ا ل
Incorrect word pair:
ل ت
Incorrect word pair:
َ 
> بن
= بن
< بنُ

Incorrect word pair:
 ُ
> ورجل
= ورجل
< ورجل

> وهجا
= وهَجا
< وهجاا

Incorrect word pair:
َ ج
Incorrect word pair:
ج ا
> له
= له
< له

> عرابة
= عَرابَةٌ
< عرابةةِ

Incorrect word pair:
َ ر
Incorrect word pair:
ر ا
Incorrect word pair:
ا ب
Incorrect word pair:
ب ة
Incorrect word pair:
َ ة
Incorrect word pair:
ة ِ
> تقول
= تقول
< تقول

> أي
= أَي
< أي

Incorrect word pair:
َ ي
Incorrect word pair:
ي 
> الفرس
= الفرسِ
< الفرس

Incorrect word pair:
ِ 
> ألست
= أَلَسْتَ
< لأَست

Incorrect word pair:
أ ل
Incorrect word pair:
َ أ
Incorrect word pair:
ل َ
Incorrect word pair:
َ س
Incorrect word pair:
س ت
Incorrect word pair:
ْ 
> هجنة
= هُجْنة
< هُجْنة

> ونجم
= ونجْمٌ
< وجمنم

Incorrect word pair:
ن ج
Incorrect word pair:
ج م
Incorrect word pair:
ْ ن
Incorrec

< الأَمعي

Incorrect word pair:
ص م
Incorrect word pair:
م ع
Incorrect word pair:
ع ي
> وبيانها
= وبيانَها
< وببيَناا

Incorrect word pair:
ي ب
Incorrect word pair:
ا ي
Incorrect word pair:
ن َ
Incorrect word pair:
َ ن
Incorrect word pair:
ه ا
> يوم
= يوم
< ويمم

Incorrect word pair:
ي و
Incorrect word pair:
و ي
Incorrect word pair:
 م
> قال
= قال
< قال

> غير
= غير
< غير

> كغراب
= كغُرابٍ
< كغاراٍ

Incorrect word pair:
ُ ا
Incorrect word pair:
ب 
> سكنت
= سكنت
< سَكََِّ

Incorrect word pair:
ك َ
Incorrect word pair:
ن ك
Incorrect word pair:
ت ّ
Incorrect word pair:
 ِ
> الشفة
= الشَّفَةِ
< الشّة

Incorrect word pair:
َ ة
Incorrect word pair:
ف 
> يقال
= يقال
< يقال

> الويلات
= الوَيْلاتُ
< اليَيات

Incorrect word pair:
و ي
Incorrect word pair:
ْ ا
Incorrect word pair:
ل ت
Incorrect word pair:
ا 
> من
= من
< منن

Incorrect word pair:
 ن
> بدويا
= بدوياً
< بدوياً

> القوم
= القوم
< القومْ

Incorrect word pair:
 ْ
> لحائه
= لِحائِه
< لَحاا

Incorrect word pair:
ِ َ
Incorrect word pair:

< الرَّمما

Incorrect word pair:
ط ر
Incorrect word pair:
ر ّ
Incorrect word pair:
م َ
Incorrect word pair:
ا م
Incorrect word pair:
ح م
Incorrect word pair:
 ا
> الرجل
= الرجلُ
< الرّل

Incorrect word pair:
ج ّ
> هكذا
= هكذا
< هكذا

> إلا
= إِلا
< إلا

Incorrect word pair:
ِ ل
Incorrect word pair:
ل ا
Incorrect word pair:
ا 
> لأيا
= لأْياً
< لأْياً

> النار
= النار
< النار

> بكر
= بَكْرٌ
< بِكْرٍ

Incorrect word pair:
َ ِ
Incorrect word pair:
ٌ ٍ
> لا
= لا
< لا

> لأى
= لأَى
< لأَى

> قريش
= قُرَيْشٍ
< قَرييْ

Incorrect word pair:
ُ َ
Incorrect word pair:
َ ي
> في
= في
< في

> المضبب
= المُضَبَّبِ
< المِبب

Incorrect word pair:
ُ ِ
Incorrect word pair:
ض ب
Incorrect word pair:
َ ب
Incorrect word pair:
ب 
> اليمن
= اليمن
< المين

Incorrect word pair:
ي م
Incorrect word pair:
م ي
> على
= على
< على

> في
= في
< في

> لم
= لم
< لم

> وقال
= وقال
< وقال

> زوجوهم
= زَوَّجُوهم
< جُوهوم

Incorrect word pair:
ز ج
Incorrect word pair:
َ ُ
Incorrect word pair:
ّ ه
Incorrect word pair:
َ و
Inc

< بالدها

Incorrect word pair:
ل ا
Incorrect word pair:
ا ل
> ونحو
= ونُحُوٌّ
< ونَحْو

Incorrect word pair:
ُ َ
Incorrect word pair:
ُ ْ
> فكانت
= فكانتْ
< فَانََ

Incorrect word pair:
ك َ
Incorrect word pair:
ت َ
Incorrect word pair:
ْ َ
> ظل
= ظَلَّ
< لِ

Incorrect word pair:
ظ ل
Incorrect word pair:
َ ِ
Incorrect word pair:
ل 
> بذكر
= بذكر
< بِكر

Incorrect word pair:
ذ ِ
> مجنبات
= مُجَنَّباتِ
< مَتِجابب

Incorrect word pair:
ُ َ
Incorrect word pair:
ج ت
Incorrect word pair:
َ ِ
Incorrect word pair:
ن ج
Incorrect word pair:
ّ ا
Incorrect word pair:
َ ب
Incorrect word pair:
ا 
> القياس
= القياس
< القياس

> جملا
= جملاً
< جملاً

> صلوات
= صلوات
< والصَّت

Incorrect word pair:
ص و
Incorrect word pair:
ل ا
Incorrect word pair:
و ل
Incorrect word pair:
ا ص
Incorrect word pair:
ت ّ
Incorrect word pair:
 َ
> تعالى
= تعالى
< تعالى

> وجوه
= وجُوه
< وجاه

Incorrect word pair:
ُ ا
Incorrect word pair:
و ه
Incorrect word pair:
ه 
> القول
= القول
< القول

> صرفهوأنحيت
= صرَفهوأَنْحَيْتُ
< صَ

< العِيننَ

Incorrect word pair:
ي ِ
Incorrect word pair:
ن ي
Incorrect word pair:
ِ ن
Incorrect word pair:
 ن
> الأمة
= الأَمة
< الأُمة

Incorrect word pair:
َ ُ
> ضلوعه
= ضُلُوعَه
< وضعلوُ

Incorrect word pair:
ض و
Incorrect word pair:
ُ ض
Incorrect word pair:
ل ع
Incorrect word pair:
ُ ل
Incorrect word pair:
ع ُ
Incorrect word pair:
َ 
> لها
= لها
< لها

> قالوالعرب
= قالوالعَرَب
< العالرببُ

Incorrect word pair:
ق ا
Incorrect word pair:
ا ل
Incorrect word pair:
ل ع
Incorrect word pair:
و ا
Incorrect word pair:
ا ل
Incorrect word pair:
ل ر
Incorrect word pair:
ع ب
Incorrect word pair:
َ ب
Incorrect word pair:
ر ُ
> يكون
= يكون
< يُكون

Incorrect word pair:
ك ُ
Incorrect word pair:
و ك
Incorrect word pair:
ن و
Incorrect word pair:
 ن
> السقم
= السَّقَمْ
< السِّم

Incorrect word pair:
َ ِ
Incorrect word pair:
ق م
> شفاء
= شِفاءُ
< شَفاء

Incorrect word pair:
ِ َ
Incorrect word pair:
ُ 
> وقال
= وقال
< وقال

> واتخاذ
= واتِّخاذُ
< وااتخذ

Incorrect word pair:
ت ا
Incorrect word pair:
ّ

< ابن

> الله
= اللّه
< اللّ

Incorrect word pair:
ه 
> منعطفا
= مُنْعَطِفاً
< منفْطاً

Incorrect word pair:
ُ ن
Incorrect word pair:
ن ف
Incorrect word pair:
ع ط
Incorrect word pair:
َ ا
Incorrect word pair:
ط ً
> من
= من
< منن

Incorrect word pair:
 ن
> ثم
= ثم
< ثم

> من
= من
< منن

Incorrect word pair:
 ن
> فصحهم
= فِصحُهم
< فصصحممم

Incorrect word pair:
ِ ص
Incorrect word pair:
ُ م
Incorrect word pair:
ه م
> أنها
= أَنها
< أَنها

> الله
= اللّه
< اللّ

Incorrect word pair:
ه 
> هاء
= هاء
< هاء

> الجمعةوابن
= الجُمُعَةِوابنُ
< المُجااااااننون

Incorrect word pair:
ج م
Incorrect word pair:
م ج
Incorrect word pair:
ُ ا
Incorrect word pair:
ع ا
Incorrect word pair:
َ ا
Incorrect word pair:
ة ا
Incorrect word pair:
ِ ا
Incorrect word pair:
و ا
Incorrect word pair:
ا ن
Incorrect word pair:
ب ن
Incorrect word pair:
ن و
Incorrect word pair:
ُ ن
> نسبوا
= نُسِبوا
< نسبوبا

Incorrect word pair:
ُ س
Incorrect word pair:
س ب
Incorrect word pair:
ِ و
Incorrect word pair:
و ا
Incorrect word pa

< له

Incorrect word pair:
َ ه
Incorrect word pair:
ه 
> الوجد
= الوَجْدُ
< الوجلد

Incorrect word pair:
َ ج
Incorrect word pair:
ج ل
Incorrect word pair:
ْ د
Incorrect word pair:
د 
> أممته
= أَمَمْتُه
< أُمهت

Incorrect word pair:
َ ُ
Incorrect word pair:
َ ه
Incorrect word pair:
م ت
Incorrect word pair:
ْ 
> لشيء
= لِشيءٍ
< لشيء

Incorrect word pair:
ِ ش
Incorrect word pair:
ش ي
Incorrect word pair:
ي ء
Incorrect word pair:
ء 
> الحسن
= الحُسْنِ
< الحَحن

Incorrect word pair:
ُ َ
Incorrect word pair:
س ح
Incorrect word pair:
ْ ن
Incorrect word pair:
ن 
Incorrect word pair:
ِ 
> يضرب
= يضرب
< يُرببِ

Incorrect word pair:
ض ُ
Incorrect word pair:
 ب
> منها
= منها
< منها

> إعراب
= إعراب
< إِرابب

Incorrect word pair:
ع ِ
Incorrect word pair:
 ب
> والهاجن
= والهاجِنُ
< والهانن

Incorrect word pair:
ج ن
Incorrect word pair:
ِ ن
Incorrect word pair:
ن 
Incorrect word pair:
ُ 
> وأنشد
= وأَنشد
< وأَنشد

> عليه
= عليه
< عليه

> أن
= أَن
< أَ

Incorrect word pair:
ن 
> والإبل
= والإبل
< وال

Visualizing Attention
---------------------

A useful property of the attention mechanism is its highly interpretable
outputs. Because it is used to weight specific encoder outputs of the
input sequence, we can imagine looking where the network is focused most
at each time step.

You could simply run ``plt.matshow(attentions)`` to see attention output
displayed as a matrix, with the columns being input steps and rows being
output steps:




In [0]:
output_words, attentions = evaluate(
    encoder1, attn_decoder1, "je suis trop froid .")
plt.matshow(attentions.numpy())

For a better viewing experience we will do the extra work of adding axes
and labels:




In [0]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)


evaluateAndShowAttention("elle a cinq ans de moins que moi .")

evaluateAndShowAttention("elle est trop petit .")

evaluateAndShowAttention("je ne crains pas de mourir .")

evaluateAndShowAttention("c est un jeune directeur plein de talent .")