![seq2seq_rnn](./images/seq_2_seq_rnn.png)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import unicodedata
import os
import re
import random

device = torch.device("mps" if torch.has_mps else "cpu")

In [19]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 20

class Language:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "<SOS>", 1: "<EOS>"}
        self.n_words = 2 

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [3]:
def normalize_sentences(df: pd.DataFrame, lang: str) -> pd.Series:
    sentence = df[lang].str.lower()
    sentence = sentence.str.replace('[^a-zäöüß\s]+', '', regex=True)
    sentence = sentence.apply(lambda x: unicodedata.normalize('NFD', x))
    sentence = sentence.str.encode('ascii', errors='ignore').str.decode('utf-8')
    return sentence

def read_sentences(df, lang1, lang2):
    sentence1 = normalize_sentences(df, lang1)
    sentence2 = normalize_sentences(df, lang2)
    return sentence1, sentence2

# def read_file(loc, lang1, lang2):
#    df = pd.read_csv(loc, delimiter='\t', header=None, names=[lang1, lang2])
#    return df

def read_file(loc, lang1, lang2):
    df = pd.read_csv(loc, delimiter='\t', header=None, names=[lang1, lang2, 'metadata'])
    df = df[[lang1, lang2]] 
    # print(df.head())
    return df

def process_data(lang1, lang2):
    df = read_file('data/%s-%s.txt' % (lang1, lang2), lang1, lang2)
    sentence1, sentence2 = read_sentences(df, lang1, lang2)
    source = Language()
    target = Language()
    pairs = []
    for i in range(len(df)):
        if len(sentence1[i].split(' ')) < MAX_LENGTH and len(sentence2[i].split(' ')) < MAX_LENGTH:
            source.add_sentence(sentence1[i])
            target.add_sentence(sentence2[i])
            pairs.append([sentence1[i], sentence2[i]])
    return source, target, pairs

In [4]:
def idx_from_sentences(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensor_from_sentences(lang, sentence):
    idx = idx_from_sentences(lang, sentence)
    idx.append(EOS_token)
    return torch.tensor(idx, dtype=torch.long, device=device).view(-1, 1)

def tensors_from_pair(pair, source, target):
    source_tensor = tensor_from_sentences(source, pair[0])
    target_tensor = tensor_from_sentences(target, pair[1])
    return (source_tensor, target_tensor)

In [5]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, embed_dim, num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.GRU(embed_dim, hidden_dim, num_layers)
    
    def forward(self, source):
        embedded = self.embedding(source)
        output, hidden = self.rnn(embedded)
        return output, hidden

In [6]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, embbed_dim, num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, embbed_dim)
        self.rnn = nn.GRU(embbed_dim, hidden_dim, num_layers)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, input, hidden):
        input = input.view(1, -1)
        embedded = F.relu(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.softmax(self.fc_out(output[0]))

        return prediction, hidden

In [7]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, MAX_LENGTH=MAX_LENGTH):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.65):
        # input_length = source.size(0)
        # target_length = target.shape[0]
        # batch_size = target.shape[1]
        # vocab_size = self.decoder.output_dim
        # outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)
        
        # for i in range(input_length):
        #     encoder_output, encoder_hidden = self.encoder(source[i])

        # decoder_hidden = encoder_hidden.to(self.device)
        # decoder_input = torch.tensor([[SOS_token]], device=self.device)

        # for t in range(target_length):
        #     decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
        #     outputs[t] = decoder_output
        #     teacher_force = random.random() < teacher_forcing_ratio
        #     topv, topi = decoder_output.topk(1)
        #     input = (target[t] if teacher_force else topi)
        #     if teacher_force == False and input.item() == EOS_token:
        #         break

        # return outputs
        batch_size = source.size(1)
        target_length = target.shape[0]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)
        
        encoder_output, encoder_hidden = self.encoder(source)
        
        decoder_input = torch.tensor([SOS_token for _ in range(batch_size)], device=self.device)
        decoder_hidden = encoder_hidden
        
        for t in range(target_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input.unsqueeze(0), decoder_hidden)
            outputs[t] = decoder_output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = decoder_output.argmax(1) 
            decoder_input = target[t] if teacher_force else top1

        return outputs

In [8]:
def clac_model(model, input_tensor, target_tensor, model_optimizer, criterion):
    model_optimizer.zero_grad()
    input_length = input_tensor.size(0)
    loss = 0
    epoch_loss = 0
    output = model(input_tensor, target_tensor)
    num_iter = output.size(0)
    for i in range(num_iter):
        loss += criterion(output[i], target_tensor[i])
    loss.backward()
    model_optimizer.step()
    epoch_loss = loss.item() / num_iter
    return epoch_loss

In [9]:
def train(model, source, target, pairs, num_iterations=20000):
    model.train()
    optimizer = optim.Adam(model.parameters())
    criterion = nn.NLLLoss()
    total_loss = 0
    training_pairs = [tensors_from_pair(random.choice(pairs), source, target) for i in range(num_iterations)]
    for iter in range(1, num_iterations + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = clac_model(model, input_tensor, target_tensor, optimizer, criterion)
        total_loss += loss
        if iter % 1000 == 0:
            print('iter: %d, loss: %.4f' % (iter, total_loss / 1000))
            total_loss = 0

    torch.save(model.state_dict(), 'model/seq2seq_rnn.pt')
    return model

In [10]:
def evaluate(model, input_lang, output_lang, sentences, MAX_LENGTH=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensor_from_sentences(input_lang, sentences[0])
        output_tensor = tensor_from_sentences(output_lang, sentences[1])

        decoded_words = []

        output = model(input_tensor, output_tensor)
        for i in range(output.size(0)):
            topv, topi = output[i].topk(1)
            if topi[0].item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi[0].item()])
    return decoded_words

In [23]:
def evaluate_randomly(model, source, target, pairs, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('source: {}'.format(pair[0]))
        print('target: {}'.format(pair[1]))
        output_words = evaluate(model, source, target, pair)
        output_sentence = ' '.join(output_words)
        print('predicted: {}'.format(output_sentence))

In [20]:
lang1 = 'deu'
lang2 = 'eng'

source, target, pairs = process_data(lang1, lang2)
randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))


random sentence ['it wasnt there the last time', 'ich war das letztemal nicht da']


In [21]:
input_size = source.n_words
output_size = target.n_words

print('Input : {} Output : {}'.format(input_size, output_size))

Input : 17136 Output : 36887


In [14]:
embed_size = 256
hidden_size = 512
num_layers = 2
num_iters = 20000

encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder, device).to(device)

In [15]:
print(encoder)
print(decoder)

Encoder(
  (embedding): Embedding(17136, 256)
  (rnn): GRU(256, 512, num_layers=2)
)
Decoder(
  (embedding): Embedding(36887, 256)
  (rnn): GRU(256, 512, num_layers=2)
  (fc_out): Linear(in_features=512, out_features=36887, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [16]:
model = train(model, source, target, pairs, num_iters)

iter: 1000, loss: 6.7308
iter: 2000, loss: 6.0124
iter: 3000, loss: 5.7154
iter: 4000, loss: 5.4805
iter: 5000, loss: 5.3974
iter: 6000, loss: 5.2438
iter: 7000, loss: 5.1878
iter: 8000, loss: 5.0264
iter: 9000, loss: 4.9978
iter: 10000, loss: 4.9237
iter: 11000, loss: 4.9163
iter: 12000, loss: 4.9442
iter: 13000, loss: 4.7817
iter: 14000, loss: 4.7696
iter: 15000, loss: 4.7943
iter: 16000, loss: 4.7372
iter: 17000, loss: 4.7128
iter: 18000, loss: 4.6624
iter: 19000, loss: 4.6618
iter: 20000, loss: 4.6184


In [24]:
evaluate_randomly(model, source, target, pairs)

source: everybody likes you
target: alle mogen dich
predicted: siehst du ihnen dich
source: stand up
target: stehen sie auf
predicted: ich werde mich <EOS>
source: i wont likely go to boston
target: nach boston gehe ich wohl nicht
predicted: ich werde nicht nicht boston gehen boston
source: when does tom get here
target: wann wird tom hier sein
predicted: wann wann tom wann <EOS>
source: tom never loses his cool
target: tom verliert nie die fassung
predicted: tom setzte nie nie nie <EOS>
source: how do you prevent back pain
target: wie verhindert man ruckenschmerzen
predicted: wie haltst du das <EOS>
source: tom is home
target: tom ist zu hause
predicted: tom ist hause hause <EOS>
source: theyre having extreme money problems
target: sie haben heftige geldprobleme
predicted: sie geld geld geld <EOS>
source: tom puts a lot of sugar and cream in his coffee
target: tom trinkt seinen kaffee mit viel zucker und sahne
predicted: tom und einer und kaffee einen in und sein kaffee
source: we tho