In [0]:
%matplotlib inline

In [5]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import nltk
nltk.download('punkt')


import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# Reserved vocabulary indices; Lang.index2word starts out as {0: "SOS", 1: "EOS"}.
SOS_token = 0  # first decoder input in train() and evaluate()
EOS_token = 1  # appended to every sentence tensor; decoding stops once it is predicted


class Lang:
    """Vocabulary for one side of the sentence pairs.

    Attributes:
        name: label given at construction time.
        word2index: token -> integer index.
        word2count: token -> number of times the token has been added.
        index2word: integer index -> token (0 and 1 are "SOS" and "EOS").
        n_words: next free index, i.e. vocabulary size including SOS/EOS.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Indices 0 and 1 are already taken by SOS and EOS.
        self.n_words = 2

    def addSentence(self, sentence):
        """Tokenize ``sentence`` with NLTK and register every token."""
        for token in nltk.word_tokenize(sentence):
            self.addWord(token)

    def addWord(self, word):
        """Register one token: bump its count, or assign it the next index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [0]:
import codecs
import csv
import random
from sklearn.model_selection import train_test_split

def readLangs(lang1, lang2, reverse=False, filename="arabic_sentences_4.csv"):
    """Load sentence pairs from a CSV file and split them into train/test sets.

    Column 0 of each row is used as the input sentence and column 1 as the
    target sentence by the rest of the notebook.

    Args:
        lang1: name for the ``Lang`` built for column 0.
        lang2: name for the ``Lang`` built for column 1.
        reverse: if True, swap the first two columns of every row and swap
            the two names, so the translation direction is inverted.
        filename: path of the UTF-8 encoded CSV file to read.

    Returns:
        ``(input_lang, output_lang, train_pairs, test_pairs)``. The two
        ``Lang`` objects are returned empty; the pairs are an 80/20 split
        made with a fixed ``random_state`` so it is reproducible.
    """
    print("Reading lines...")

    pairs = []
    skipped = 0
    # The csv module asks for files opened with newline="" so that it can
    # handle line endings embedded in quoted fields itself.
    with open(filename, "r", encoding="utf-8", newline="") as fp:
        for row in csv.reader(fp):
            # Blank lines come back as [] and would only fail later on
            # pair[1]; drop anything that cannot form a pair.
            if len(row) < 2:
                skipped += 1
                continue
            pairs.append(row)
    if skipped:
        print("Skipped %d rows with fewer than two columns" % skipped)

    if reverse:
        # Swap only the two sentence columns; keep any extra columns in place.
        pairs = [[row[1], row[0]] + row[2:] for row in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    # Split into train and test
    train_pairs, test_pairs = train_test_split(pairs, test_size=0.2, random_state=42)

    return input_lang, output_lang, train_pairs, test_pairs
#     # Split every line into pairs and normalize
#     pairs = [[normalizeString(s) for s in l.split(',')] for l in lines]

#     # Reverse pairs, make Lang instances
#     if reverse:
#         pairs = [list(reversed(p)) for p in pairs]
#         input_lang = Lang(lang2)
#         output_lang = Lang(lang1)
#     else:
#         input_lang = Lang(lang1)
#         output_lang = Lang(lang2)

        

Since there are a *lot* of example sentences and we want to train
something quickly, the original tutorial trims the data set to only
relatively short and simple sentences: the maximum length is 10 words
(that includes ending punctuation) and the sentences are filtered to
those that translate to the form "I am" or "He is" etc. (accounting for
apostrophes replaced earlier). In this notebook that filtering step is
not applied, so every pair read from the CSV file is kept.




The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Normalize text, filter by length and content
-  Make word lists from sentences in pairs




In [20]:
def prepareData(lang1, lang2, reverse=False):
    """Read the sentence pairs and fill both vocabularies.

    Words are counted from the training split first and then from the test
    split, so test-only words also receive an index in the vocabularies.

    Returns:
        ``(input_lang, output_lang, train_pairs, test_pairs)``.
    """
    input_lang, output_lang, train_pairs, test_pairs = readLangs(lang1, lang2, reverse)

    # Same counting pass for each split; the running vocabulary sizes are
    # printed after each one.
    passes = (
        ("Read %s training sentence pairs", train_pairs),
        ("Read %s test sentence pairs", test_pairs),
    )
    for header, split_pairs in passes:
        print(header % len(split_pairs))
        print("Counting words...")
        for pair in split_pairs:
            input_lang.addSentence(pair[0])
            output_lang.addSentence(pair[1])
        print("Counted words:")
        print(input_lang.name, input_lang.n_words)
        print(output_lang.name, output_lang.n_words)

    return input_lang, output_lang, train_pairs, test_pairs


# Build both vocabularies and the train/test split once; tensorsFromPair,
# trainIters, evaluate and evaluateRandomly read these module-level names.
input_lang, output_lang, train_pairs, test_pairs = prepareData('NoDiac', 'Diac', False)
# Print one random pair from each split as a quick look at the loaded data.
print(random.choice(train_pairs))
print(random.choice(test_pairs))



Reading lines...
Read 80 training sentence pairs
Counting words...
Counted words:
NoDiac 244
Diac 254
Read 20 test sentence pairs
Counting words...
Counted words:
NoDiac 288
Diac 301
['والمدن والنسبة إليهما أعرابي', 'والمدن والنسبة إليهما أعرابيٌّ']
['صلى الله عليه وسلم', 'صلى اللّه عليه وسلم']


In [21]:
#print(random.choice([input_lang.word2index.keys()]))
#print(random.choice([output_lang.word2index.keys()]))
# Dump the complete word -> index tables (each dict is wrapped in a
# one-element list, so the output is a list containing one dict).
print([input_lang.word2index])
print([output_lang.word2index])


[{'على': 2, 'الحجاج': 3, 'يوما': 4, 'فقال': 5, 'يتق': 6, 'أحدا': 7, 'قال': 8, 'الكميت': 9, 'منهم': 10, 'وأخذ': 11, 'من': 12, 'لفظه': 13, 'صلى': 14, 'الله': 15, 'عليه': 16, 'وسلم': 17, 'دارا': 18, 'وأحسنه': 19, 'جوارا': 20, 'وأعربه': 21, 'فيها': 22, 'كثير': 23, 'السقم': 24, 'وقد': 25, 'الأعراب': 26, 'آمنا': 27, 'قل': 28, 'لم': 29, 'أجرا': 30, 'إلا': 31, 'المودة': 32, 'خلاف': 33, 'العجم': 34, 'وهما': 35, 'واحد': 36, 'وجعل': 37, 'النبي': 38, 'الجوهري': 39, 'العريب': 40, 'تصغير': 41, 'العرب': 42, 'نعما': 43, 'ورعوا': 44, 'مساقط': 45, 'الغيث': 46, 'تعربا': 47, 'واستعرب': 48, 'استعرابا': 49, 'كل': 50, 'لائل': 51, 'تقول': 52, 'عرب': 53, 'عاربة': 54, 'وقال': 55, 'معرب': 56, 'مفصح': 57, 'الصحاح': 58, 'في': 59, 'اللغةالصحاح': 60, 'ترقرق': 61, 'مناكبها': 62, 'الدماء': 63, 'نهار': 64, 'ثم': 65, 'هي': 66, 'ثابتا': 67, 'وإن': 68, 'نلت': 69, 'منها': 70, 'كما': 71, 'نلتم': 72, 'بعدما': 73, 'كانوا': 74, 'عربا': 75, 'وفي': 76, 'لنبط': 77, 'وإنما': 78, 'اسم': 79, 'لسان': 80, 'والعرب': 81, 'للتقية': 82, '

In [0]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Embedding width equals the GRU width, so the embedding can be fed
        # to the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the GRU; returns ``(output, hidden)``."""
        # Reshape to (1, 1, hidden_size) before the GRU call.
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

In [0]:
class DecoderRNN(nn.Module):
    """GRU decoder without attention; emits log-probabilities for one step."""

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` is the log-softmax
            over the output vocabulary (dim 1) and ``hidden`` is the new
            GRU state.
        """
        step_input = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, next_hidden = self.gru(step_input, hidden)
        # gru_out[0] drops the leading length-1 dimension before projecting.
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

In [0]:
# Default for every `max_length` parameter in this notebook: the number of
# encoder-output rows allocated in train()/evaluate(), the decoding step limit
# in evaluate(), and the attention width of AttnDecoderRNN.
MAX_LENGTH = 10

In [0]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over a fixed number of encoder outputs."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Scores one weight per encoder position from [embedded ; hidden].
        self.attn = nn.Linear(hidden_size * 2, max_length)
        # Merges [embedded ; attended context] back down to hidden_size.
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step with attention.

        Returns:
            ``(log_probs, hidden, attn_weights)``.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Attention weights from the current input embedding and the
        # previous hidden state.
        query = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(query), dim=1)

        # Weighted sum of the encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_out, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_out[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)
       

In [0]:
def indexesFromSentence(lang, sentence):
    """Map a sentence to the list of its token indices in ``lang``.

    The sentence is tokenized with ``nltk.word_tokenize``, the same
    tokenizer ``Lang.addSentence`` uses to build the vocabulary. Splitting
    on single spaces instead would produce tokens (punctuation still glued
    to a word, empty strings from doubled spaces) that were never
    registered and would raise ``KeyError``.

    Raises:
        KeyError: if a token is not present in ``lang.word2index``.
    """
    return [lang.word2index[word] for word in nltk.word_tokenize(sentence)]

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of token indices ending in EOS.

    Returns:
        A ``torch.long`` tensor of shape (number of tokens + 1, 1) on ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Turn one sentence pair into ``(input_tensor, target_tensor)``.

    ``pair[0]`` is encoded with the module-level ``input_lang`` and
    ``pair[1]`` with ``output_lang``.
    """
    source_tensor = tensorFromSentence(input_lang, pair[0])
    return (source_tensor, tensorFromSentence(output_lang, pair[1]))

In [0]:
# Probability of using teacher forcing for a training example.
# NOTE(review): train() does not read this value at present, so changing it
# has no effect on training.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH, use_teacher_forcing=False):
    """Run one optimisation step on a single (input, target) pair.

    Args:
        input_tensor: input token indices, iterated along dim 0.
        target_tensor: target token indices, iterated along dim 0.
        encoder: module called as ``encoder(token, hidden)`` and providing
            ``initHidden()``.
        decoder: module called as ``decoder(token, hidden)`` and returning
            ``(output, hidden)`` (the plain, non-attention decoder).
        encoder_optimizer, decoder_optimizer: optimizers for the two modules.
        criterion: loss called as ``criterion(decoder_output, target_token)``.
        max_length: kept for interface compatibility; not used by the
            non-attention decoder path.
        use_teacher_forcing: if True, feed the ground-truth token as the next
            decoder input; otherwise feed the decoder's own prediction and
            stop early once it predicts EOS. Defaults to False.

    Returns:
        The summed per-step loss divided by the target length, as a float.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    # Only the final hidden state is handed to the decoder, so per-step
    # encoder outputs are not stored. (Storing them in a max_length-sized
    # buffer raised IndexError for longer inputs; an attention decoder would
    # need them collected again.)
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
        else:
            # Without teacher forcing: use its own prediction as the next input
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

This is a helper function to print time elapsed and estimated time
remaining given the current time and progress %.




In [0]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a task started at ``since``.

    ``percent`` is the completed fraction; the remaining time is
    extrapolated linearly from the time elapsed so far.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

The whole training process looks like this:

-  Start a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting

Then we call ``train`` many times and occasionally print the progress (%
of examples, time so far, estimated time) and average loss.




In [0]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on ``n_iters`` randomly drawn training pairs and plot the loss.

    Every ``print_every`` iterations a progress line (time so far, estimated
    time remaining, iteration, percent done, average loss) is printed; every
    ``plot_every`` iterations the average loss is recorded for the final plot.
    """
    started_at = time.time()
    loss_history = []
    running_print_loss = 0  # Reset every print_every
    running_plot_loss = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Draw (with replacement) and tensorize all examples up front.
    training_pairs = [tensorsFromPair(random.choice(train_pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        running_print_loss += loss
        running_plot_loss += loss

        if step % print_every == 0:
            average = running_print_loss / print_every
            running_print_loss = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                         step, progress * 100, average))

        if step % plot_every == 0:
            loss_history.append(running_plot_loss / plot_every)
            running_plot_loss = 0

    showPlot(loss_history)

Plotting results
----------------

Plotting is done with matplotlib, using the array of loss values
``plot_losses`` saved while training.




In [0]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot the recorded average losses with y-axis ticks every 0.2.

    Args:
        points: sequence of loss averages, one per recording interval.
    """
    # plt.subplots() already creates the figure; an extra plt.figure() call
    # beforehand would only leave an empty figure behind.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)
    ax.set_xlabel("Recording interval")
    ax.set_ylabel("Average loss")

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output list, and if it
predicts the EOS token we stop there. This version uses the plain
(non-attention) decoder, so only the predicted words are returned and no
attention outputs are stored.




In [0]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode ``sentence`` with the plain (non-attention) decoder.

    Args:
        encoder: module called as ``encoder(token, hidden)`` and providing
            ``initHidden()``.
        decoder: module called as ``decoder(token, hidden)`` and returning
            ``(output, hidden)``.
        sentence: input sentence; encoded with the module-level ``input_lang``.
        max_length: maximum number of decoding steps.

    Returns:
        The list of predicted words, without the EOS marker.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        encoder_hidden = encoder.initHidden()

        # Only the final hidden state is handed to the decoder, so per-step
        # encoder outputs are not stored. (Storing them in a max_length-sized
        # buffer raised IndexError for longer inputs.)
        for ei in range(input_tensor.size(0)):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden

        decoded_words = []
        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            predicted = topi.item()
            if predicted == EOS_token:
                break
            decoded_words.append(output_lang.index2word[predicted])
            # Feed the prediction back in as the next decoder input.
            decoder_input = topi.squeeze().detach()

        return decoded_words

We can evaluate random sentences from the test set and print out the
input, target, and output to make some subjective quality judgements:




In [0]:
def evaluateRandomly(encoder, decoder, n=10):
    """Decode ``n`` random test pairs and print a token-level accuracy.

    For each sampled pair the input (``>``), target (``=``) and model output
    (``<``) are printed, followed by every mismatching (target, output)
    token pair. The final line reports accuracy, correct count and
    incorrect count.

    Tokens are compared position by position with ``zip``, so positions
    beyond the shorter of the two sequences are not counted.
    """
    correct = 0
    incorrect = 0
    for i in range(n):
        pair = random.choice(test_pairs)

        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

        tokenized_target = nltk.word_tokenize(pair[1])

        for target, output in zip(tokenized_target, output_words):
            if target == output:
                correct += 1
            else:
                incorrect += 1
                print("Example incorrect:")
                print(target, output)

    # No tokens compared (n == 0, or every decode was empty): report 0.0
    # instead of dividing by zero.
    total = correct + incorrect
    accuracy = correct / total if total else 0.0
    print("accuracy:", accuracy, correct, incorrect)

In [40]:
print("input:", input_lang.n_words)

input: 288


In [60]:
# Width shared by the embeddings and the GRU hidden state of both networks.
hidden_size = 256

# Vocabulary sizes come from the Lang objects filled by prepareData().
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
#attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)
#print("Embeddings:", encoder1.embedding)

# Train the plain (non-attention) decoder; progress is printed every 500 iterations.
#trainIters(encoder1, attn_decoder1, 75000, print_every=5000)
#trainIters(encoder1, decoder1, 1000, print_every=500)
trainIters(encoder1, decoder1, 20000, print_every=500)

0m 11s (- 7m 21s) (500 2%) 4.0195
0m 21s (- 6m 43s) (1000 5%) 3.4113


KeyboardInterrupt: ignored

In [58]:
evaluateRandomly(encoder1, decoder1)

> ولم يسموا أعرابا وتقول
= ولم يسموا أعراباً وتقول
< بالتفصيل كان كان

Example incorrect:
ولم بالتفصيل
Example incorrect:
يسموا كان
Example incorrect:
أعراباً كان
> أحسابا أي أبينهم وأوضحهم
= أحساباً أي أبينهم وأوضحهم
< للتقيّة قال الأزهري

Example incorrect:
أحساباً للتقيّة
Example incorrect:
أي قال
Example incorrect:
أبينهم الأزهري
> كان فصيحا وقال الليث
= كان فصيحاً وقال الليث
< للتقيّة كان كان

Example incorrect:
كان للتقيّة
Example incorrect:
فصيحاً كان
Example incorrect:
وقال كان
> أحسابا أي أبينهم وأوضحهم
= أحساباً أي أبينهم وأوضحهم
< للتقيّة قال الأزهري

Example incorrect:
أحساباً للتقيّة
Example incorrect:
أي قال
Example incorrect:
أبينهم الأزهري
> لم يكن بدويا والأعرابي
= لم يكن بدوياً والأعرابي
< للتقيّة قال الأزهري

Example incorrect:
لم للتقيّة
Example incorrect:
يكن قال
Example incorrect:
بدوياً الأزهري
> أو ذات صلة الباحث
= أو ذَاتُ صِلَة الباحث
< للتقيّة قال الأزهري

Example incorrect:
أو للتقيّة
Example incorrect:
ذَاتُ قال
Example incorrect:
صِلَة الأزهري
> هرم وما في

Visualizing Attention
---------------------

A useful property of the attention mechanism is its highly interpretable
outputs. Because it is used to weight specific encoder outputs of the
input sequence, we can imagine looking where the network is focused most
at each time step.

You could simply run ``plt.matshow(attentions)`` to see attention output
displayed as a matrix, with the columns being input steps and rows being
output steps:




In [0]:
# NOTE(review): leftover from the attention tutorial. `attn_decoder1` is only
# created in a commented-out line of the model-setup cell, and `evaluate` in
# this notebook returns a single list of words, so this two-value unpacking is
# not expected to work as written.
output_words, attentions = evaluate(
    encoder1, attn_decoder1, "je suis trop froid .")
plt.matshow(attentions.numpy())

For a better viewing experience we will do the extra work of adding axes
and labels:




In [0]:
def showAttention(input_sentence, output_words, attentions):
    """Draw an attention matrix with input words on x and output words on y.

    Args:
        input_sentence: source sentence; split on single spaces for the
            x tick labels, with '<EOS>' appended.
        output_words: list of decoded words used as y tick labels.
        attentions: object exposing ``.numpy()`` (presumably a torch tensor
            of attention weights — TODO confirm against the caller).
    """
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    # The leading '' entry is an extra empty label placed before the word labels.
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Decode ``input_sentence``, print input and output, and plot attention.

    NOTE(review): relies on the module-level ``attn_decoder1``, which is only
    created in a commented-out line of the model-setup cell, and unpacks two
    values from ``evaluate``, which returns a single list of words in this
    notebook — this needs the attention path restored before it can run.
    """
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)


# NOTE(review): French sample sentences kept from the original tutorial. The
# vocabularies in this notebook are built from the Arabic CSV, so these words
# are presumably missing from input_lang — verify before running.
evaluateAndShowAttention("elle a cinq ans de moins que moi .")

evaluateAndShowAttention("elle est trop petit .")

evaluateAndShowAttention("je ne crains pas de mourir .")

evaluateAndShowAttention("c est un jeune directeur plein de talent .")