In [592]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

In [593]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [594]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Dataset Preprocessing

In [595]:
import time
eng = []
ger = []
with open('/content/drive/MyDrive/learning machine learning/deu.txt', 'r') as file:
    i = 0
    # loading first 10000 sentences
    while file.readline() != '' and i < 10000:
        data = file.readline()
        data = data.replace('.', '')
        data = data.replace(',', '')
        data = data.replace("'", '')
        data = data.replace('!', '')
        data = data.replace('?', '')
        data = data.split('CC-BY')[0].split('\t')

        # adding english sentence
        eng.append(data[0].lower())

        # adding german sentences with <SOS> and <EOS> tags
        ger.append('<SOS> ' + data[1].lower() + ' <EOS>')
        i += 1

In [596]:
eng = eng[5100:5200]
ger = ger[5100:5200]

In [597]:
# creating eng vocab out of sentences
eng_vocab = []
for sentence in eng:
    for word in sentence.split():
        if word not in eng_vocab:
            eng_vocab.append(word)

In [598]:
# creating ger vocab out of sentences
ger_vocab = []
for sentence in ger:
    for word in sentence.split():
        if word not in ger_vocab:
            ger_vocab.append(word)

### Definint input and embedding sizes for encoder, decoder

In [604]:

embedding_size=300
enc_hidden_size=1024
dec_hidden_size=1024
enc_input_size=300
dec_input_size=300
eng_embedding_layer = nn.Embedding(num_embeddings=len(eng_vocab), embedding_dim=embedding_size)
ger_embedding_layer = nn.Embedding(num_embeddings=len(ger_vocab), embedding_dim=embedding_size)


## Encoder Definition

In [605]:
class EncoderNetwork(nn.Module):

  def __init__(self, input_size, hidden_size):
    super(EncoderNetwork, self).__init__()
    self.encoder = nn.LSTM(input_size=input_size, hidden_size=hidden_size)

  def forward(self, input):
    # print('encoder input shape', input.shape)
    output, (hidden, cell) = self.encoder(input)
    return hidden, cell


## Decoder Definition

In [606]:
# decoder defn

class DecoderNetwork(nn.Module):

  def __init__(self, input_size, hidden_size, num_classes):
    super().__init__()
    self.decoder = nn.LSTM(input_size=input_size, hidden_size=hidden_size)
    self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, input, hidden, cell):
    outputs, (hidden, cell) = self.decoder(input, (hidden, cell))
    probs = self.softmax(self.fc(hidden.squeeze(0)))
    return hidden, cell, probs

## Seq2Seq Network Definition

In [607]:
class Seq2SeqNetwork(nn.Module):

    def __init__(self, encoder, decoder, eng_vocab, ger_vocab, eng_embedding_layer, ger_embedding_layer):
        super(Seq2SeqNetwork, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

        # embedding layer for english and german words respectively
        self.eng_embedding_layer = eng_embedding_layer
        self.ger_embedding_layer = ger_embedding_layer


        self.eng_vocab = eng_vocab
        self.ger_vocab = ger_vocab

    def get_one_hot_vector(self, word):
      """
      returns the one hot vector for the word
      """
        ohe_vector = torch.zeros(len(self.ger_vocab))
        ohe_vector[self.ger_vocab.index(word)] = 1
        return ohe_vector

    def get_sentence_embedding(self, sentence):
      """
      returns the sentence embedding and the length of the sentence
      """
        seq_len = len(sentence)
        s_vec = []
        for word in sentence:
            s_vec.append(self.eng_embedding_layer(torch.tensor(self.eng_vocab.index(word))))
        return torch.stack(s_vec).reshape(1, -1), seq_len

    def forward(self, input_sentence, output_sentence, eval_mode):

        # since sending one sentence at a time
        batch_size = 1

        # getting embedding for input sentence ; torch tensor
        enc_input, enc_input_seq_len = self.get_sentence_embedding(input_sentence)

        # reshaping enc_input to (seq_len, batch_size, feature_dim)
        # passing input sentence to encoder and getting context and cell vectors from the encoder output
        enc_input = enc_input.reshape(enc_input_seq_len, batch_size, -1)
        context, e_cell = self.encoder(enc_input)

        # we need to initialize the decoder hidden, cell values from the encoder
        # the input sentence as the <SOS> tag
        d_seq_len = len(output_sentence)

        # d_hidden is the initial hidden state for the decoder
        d_hidden = context.reshape(1, batch_size, dec_hidden_size)

        # d_input is the initial input for the decoder
        # equivalent to the output_sentence's first word
        d_input = ger_embedding_layer(torch.tensor(self.ger_vocab.index(output_sentence[0]))).reshape(1, batch_size, dec_input_size)

        # d_cell is the initial cell state for the decoder
        d_cell = e_cell.reshape(1, batch_size, dec_hidden_size)

        decoder_loss = 0.0

        # getting hidden, cell and predictions from the decoder
        curr_d_hidden, curr_d_cell, probs = self.decoder(d_input, d_hidden, d_cell)

        # for each word in the output_sentence
        for i in range(1, len(output_sentence)):

            # make prediction based on current hidden, cell and input
            pred_word_index = torch.argmax(probs, dim=-1)
            pred_word = self.ger_vocab[pred_word_index]

            # if in eval_mode, print pred word until <EOS>
            if eval_mode:
              if pred_word == '<EOS>':
                break
              else:
                print(pred_word, end=' ')

            # calculate loss and return
            loss = loss_fn(probs.view(-1), self.get_one_hot_vector(output_sentence[i]))
            decoder_loss += loss

            # sending new d_hidden, d_cell, and d_input (next word)
            curr_d_input = ger_embedding_layer(torch.tensor(self.ger_vocab.index(output_sentence[i]))).reshape(1, batch_size, dec_input_size)
            curr_d_hidden, curr_d_cell, probs = self.decoder(curr_d_input, curr_d_hidden, curr_d_cell)

        return decoder_loss

### Initializing Encoder, Decoder and Seq2Seq Networks

In [608]:
encoder = EncoderNetwork(input_size=embedding_size, hidden_size=enc_hidden_size).to(device)
decoder = DecoderNetwork(input_size=dec_input_size, hidden_size=dec_hidden_size, num_classes=len(ger_vocab)).to(device)

In [609]:
s2s = Seq2SeqNetwork(encoder, decoder, eng_vocab, ger_vocab, eng_embedding_layer, ger_embedding_layer).to(device)

In [610]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(s2s.parameters(), lr=0.001, weight_decay=1e-4)

### Training

In [614]:
epochs = 1000
import time
for _ in range(epochs):
    epoch_loss = 0.0
    for input_sentence, output_sentence in zip(eng[:10], ger[:10]): # training for the first 10 sentences cause i dont have compute power
        output_words = output_sentence.split()
        loss = s2s(input_sentence.split(), output_words, False)
        epoch_loss += loss

    try:
      epoch_loss.backward()
      optimizer.step()
      optimizer.zero_grad()
    except:
      pass
    print(f'\nEpoch {_ + 1} loss {epoch_loss}')


Epoch 1 loss 259.1486511230469

Epoch 2 loss 258.8638916015625

Epoch 3 loss 257.76483154296875

Epoch 4 loss 252.3050537109375

Epoch 5 loss 244.31369018554688

Epoch 6 loss 240.761962890625

Epoch 7 loss 240.67018127441406

Epoch 8 loss 240.667724609375

Epoch 9 loss 240.6873779296875

Epoch 10 loss 240.6937713623047

Epoch 11 loss 240.6968231201172

Epoch 12 loss 240.69895935058594

Epoch 13 loss 240.6999969482422

Epoch 14 loss 240.69956970214844

Epoch 15 loss 240.69821166992188

Epoch 16 loss 240.69664001464844

Epoch 17 loss 240.6952362060547

Epoch 18 loss 240.69412231445312

Epoch 19 loss 240.69313049316406

Epoch 20 loss 240.6922149658203

Epoch 21 loss 240.691162109375

Epoch 22 loss 240.68978881835938

Epoch 23 loss 240.68748474121094

Epoch 24 loss 240.6830596923828

Epoch 25 loss 240.673095703125

Epoch 26 loss 240.64883422851562

Epoch 27 loss 240.5835723876953

Epoch 28 loss 240.36814880371094

Epoch 29 loss 239.73178100585938

Epoch 30 loss 238.96640014648438

Epoch 3

In [658]:
sample_no = 4
input_sentence = eng[sample_no]
output_sentence = ger[sample_no]

In [659]:
input_sentence

'we need it now'

In [660]:
output_sentence = "<SOS> du du du du du <EOS>" # sample output sentence starting with <SOS> tag for prediction

In [661]:
s2s(input_sentence.split(), output_sentence.split(), True)

wir brauchen es jetzt 

tensor(20.3667, grad_fn=<AddBackward0>)