In [449]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler



In [450]:
# Reserved vocabulary indices shared by both languages.
# Index 0 marks start-of-sequence: it is the first token fed to the decoder.
# Note that the padded id arrays are zero-initialised, so padding positions
# also carry this value.
SOS_token = 0
# Index 1 marks end-of-sequence: it is appended to every encoded sentence.
EOS_token = 1

class Language:
    """Vocabulary for one language: word <-> index maps plus word frequencies.

    Attributes:
        name: label given to this vocabulary.
        word2index: maps each known word to its integer index.
        word2count: maps each known word to the number of times it was added.
        index2word: inverse of ``word2index``; pre-seeded with the two special
            tokens (0 -> 'SOS', 1 -> 'EOS').
        n_words: number of entries in ``index2word`` (starts at 2 because of
            the two special tokens); also the index the next new word gets.
    """

    def __init__(self, name) :
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # The start and end markers occupy the first two indices.
        self.index2word = {0:'SOS',1:'EOS'}
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        # First occurrence: assign the next free index.
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1





In [451]:
def unicodeToAscii(s):
    """Strip combining marks from ``s`` (e.g. accents) after NFD decomposition.

    Characters whose Unicode category is 'Mn' (non-spacing mark) are dropped;
    every other character is kept as-is.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def normalizeString(s):
    """Lower-case, trim and simplify ``s`` into space-separated tokens.

    Steps: lower-case and strip, remove accents via ``unicodeToAscii``, put a
    space in front of '.', '!' and '?', then collapse every run of characters
    other than letters, '!' and '?' into a single space and strip the result.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),        # detach sentence punctuation
        (r"[^a-zA-Z!?]+", r" "),     # everything else becomes one space
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [452]:
# Load the tab-separated English/French corpus.
# The encoding is given explicitly: the file contains accented characters and
# the platform default encoding is not guaranteed to be UTF-8.
# NOTE(review): the absolute path is machine-specific; consider a configurable
# data directory.
with open('/Users/arian/Downloads/ML/EX/NLP/translation/data/eng-fra.txt', encoding='utf-8') as f:
    lines = f.read().strip().split('\n')

# Only the first 300 lines are used, so only those are normalised.
pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines[:300]]
input_lang = Language('eng')
output_lang = Language('fra')

In [453]:
# Upper bound (exclusive) on the number of space-separated tokens per sentence.
MAX_LENGTH = 10

# English sentence openings that a pair must start with to be kept.
eng_prefixes = (
    "i am ",
    "i m ",
    "he is",
    "he s ",
    "she is",
    "she s ",
    "you are",
    "you re ",
    "we are",
    "we re ",
    "they are",
    "they re ",
)


def filterPair(p):
    """Return True when pair ``p`` is short enough and starts with a known prefix.

    ``p[0]`` and ``p[1]`` must each have fewer than ``MAX_LENGTH`` tokens and
    ``p[0]`` must start with one of ``eng_prefixes``. ``p[1]`` is only looked
    at once ``p[0]`` has passed its length check.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    # Restricting to a few sentence openings reduces the sample count.
    return p[0].startswith(eng_prefixes)
# Keep only the pairs that filterPair accepts.
pairs = list(filter(filterPair, pairs))

In [454]:
# Populate both vocabularies from the filtered pairs: element 0 of each pair
# feeds the input language, element 1 the output language.
# NOTE: re-running this cell adds every sentence again, which inflates the
# word2count values (indices are unaffected).
for pair in pairs :
    input_lang.addSentence(pair[0])
    output_lang.addSentence(pair[1])


In [455]:
class EncoderRNN(nn.Module):
    """Embedding + dropout + single GRU encoder.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: embedding width and GRU hidden width.
        dropout_p: dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a batch of token indices.

        Returns:
            The ``(output, hidden)`` pair produced by the GRU.
        """
        token_vectors = self.embedding(input)
        regularised = self.dropout(token_vectors)
        output, hidden = self.gru(regularised)
        return output, hidden

In [456]:
class DecoderRnn(nn.Module):
    """GRU decoder that unrolls a fixed ``MAX_LENGTH`` steps from an SOS token.

    Args:
        hidden_size: embedding width and GRU hidden width.
        output_size: target vocabulary size (embedding rows and width of the
            output projection).
        target_tensor: unused; accepted only so existing callers keep working.
    """

    def __init__(self, hidden_size, output_size,target_tensor=None):
        super(DecoderRnn, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``MAX_LENGTH`` steps for every sequence in the batch.

        Args:
            encoder_outputs: encoder output; only its batch dimension (size 0)
                and its device are used here.
            encoder_hidden: used as the initial GRU hidden state.
            target_tensor: optional target indices, indexed as
                ``target_tensor[:, i]`` for each step. When given, the target
                token is fed as the next input (teacher forcing); otherwise
                the decoder feeds back its own top-1 prediction.

        Returns:
            ``(log_probs, hidden, None)`` where ``log_probs`` is the
            log-softmax over the last dimension of the per-step outputs
            concatenated along dim 1, and ``hidden`` is the final GRU state.
            The trailing ``None`` keeps the return arity stable for the
            training loop.
        """
        batch_size = encoder_outputs.size(0)
        # Create the SOS input on the same device as the encoder output so the
        # module also works once the model and data are moved off the CPU.
        decoder_input = torch.full(
            (batch_size, 1),
            SOS_token,
            dtype=torch.long,
            device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: feed the ground-truth token as the next input.
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Free running: feed the model's own best guess, detached so
                # no gradient flows through the chosen index.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        """Run one decoding step: embed -> ReLU -> GRU -> linear projection.

        Returns:
            ``(output, hidden)``: unnormalised scores from the projection
            layer and the updated GRU hidden state.
        """
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden


In [457]:
def indexesFromSentence(lang, sentence):
    """Translate a space-separated ``sentence`` into a list of word indices.

    Each token is looked up in ``lang.word2index``; an unknown token raises
    ``KeyError``.
    """
    indices = []
    for token in sentence.split(' '):
        indices.append(lang.word2index[token])
    return indices



def get_dataloader(batch_size):
    """Encode the module-level ``pairs`` and wrap them in a shuffled DataLoader.

    Every sentence is converted to word indices, terminated with
    ``EOS_token`` and written into a zero-initialised row of width
    ``MAX_LENGTH`` (unused trailing positions stay 0).

    Args:
        batch_size: number of pairs per batch.

    Returns:
        ``(input_lang, output_lang, dataloader)``; the two language objects
        are the module-level ones, returned unchanged.
    """
    num_pairs = len(pairs)
    input_ids = np.zeros((num_pairs, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((num_pairs, MAX_LENGTH), dtype=np.int32)

    for row, (source_sentence, target_sentence) in enumerate(pairs):
        source_tokens = indexesFromSentence(input_lang, source_sentence) + [EOS_token]
        target_tokens = indexesFromSentence(output_lang, target_sentence) + [EOS_token]
        input_ids[row, :len(source_tokens)] = source_tokens
        target_ids[row, :len(target_tokens)] = target_tokens

    dataset = TensorDataset(torch.LongTensor(input_ids), torch.LongTensor(target_ids))
    # Random sampling without replacement, i.e. a fresh shuffle every epoch.
    loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)
    return input_lang, output_lang, loader
        

        

In [458]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    For every ``(input, target)`` batch: clear both optimizers' gradients, run
    the encoder and then the decoder (the target is passed to the decoder),
    compute ``criterion`` on the flattened decoder outputs and targets,
    back-propagate, and step the encoder optimizer followed by the decoder
    optimizer.

    Returns:
        Sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    running_loss = 0

    for input_tensor, target_tensor in dataloader:
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten to (N, classes) vs (N,) so the criterion sees one row per token.
        num_classes = decoder_outputs.size(-1)
        flat_predictions = decoder_outputs.view(-1, num_classes)
        flat_targets = target_tensor.view(-1)
        loss = criterion(flat_predictions, flat_targets)
        loss.backward()

        for optimizer in optimizers:
            optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

In [459]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=1):
    """Train ``encoder`` and ``decoder`` for ``n_epochs`` epochs.

    Each model gets its own Adam optimizer with the same learning rate. The
    loss is ``nn.NLLLoss``, which matches the log-probabilities the decoder
    returns.

    Args:
        train_dataloader: iterable of ``(input, target)`` batches, forwarded
            to ``train_epoch``.
        encoder: encoder module.
        decoder: decoder module.
        n_epochs: number of passes over ``train_dataloader``.
        learning_rate: learning rate for both optimizers.
        print_every: print the epoch loss every ``print_every`` epochs. The
            default of 1 prints after every epoch; a falsy value disables
            printing.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        # Honour print_every instead of logging unconditionally.
        if print_every and epoch % print_every == 0:
            print(f'loss in  epoch number {epoch} is {loss}')
        

        
            
        

    

In [460]:
# Model width: passed to both networks, where it sets the embedding size and
# the GRU hidden size.
hidden_size = 128
# Number of sentence pairs per training batch.
batch_size = 32

# get_dataloader returns the module-level Language objects unchanged, together
# with a randomly sampled DataLoader over the padded index tensors.
input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

# The vocabulary sizes (which include the SOS/EOS entries) set the embedding
# table size of the encoder and the embedding/output size of the decoder.
encoder = EncoderRNN(input_lang.n_words, hidden_size)
decoder = DecoderRnn(hidden_size, output_lang.n_words)

# Train for 80 epochs, printing the mean loss after every epoch.
train(train_dataloader, encoder, decoder, 80, print_every=1)

loss in  epoch number 1 is 3.951293706893921
loss in  epoch number 2 is 3.3642537593841553
loss in  epoch number 3 is 2.8080376386642456
loss in  epoch number 4 is 2.284605026245117
loss in  epoch number 5 is 1.865330457687378
loss in  epoch number 6 is 1.598844289779663
loss in  epoch number 7 is 1.4668562412261963
loss in  epoch number 8 is 1.3499886393547058
loss in  epoch number 9 is 1.2962356209754944
loss in  epoch number 10 is 1.2164058685302734
loss in  epoch number 11 is 1.1195417046546936
loss in  epoch number 12 is 1.1318131685256958
loss in  epoch number 13 is 1.1008946299552917
loss in  epoch number 14 is 1.0648645162582397
loss in  epoch number 15 is 1.0125495791435242
loss in  epoch number 16 is 0.9629949033260345
loss in  epoch number 17 is 0.9315615594387054
loss in  epoch number 18 is 0.9223119914531708
loss in  epoch number 19 is 0.9292579591274261
loss in  epoch number 20 is 0.8849940001964569
loss in  epoch number 21 is 0.8456110060214996
loss in  epoch number 22 i