# RNN

This implements a simple, naive sequence-to-sequence RNN for machine translation. There will be no LSTM units or GRU units here - this is a bare-bones RNN implementation (just some hidden state).

In [1]:
import sys
sys.path.append('../../')

In [2]:
import math
import torch

from torch.utils.data import DataLoader, random_split
from utils import load_en_fr

## Loading Data

In [3]:
dataset, EN, FR = load_en_fr(root='../..', mini=True)

processing english...
[1/3] preprocessing
[2/3] building vocab
[3/3] processing
processing french...
[1/3] preprocessing
[2/3] building vocab
[3/3] processing
Processing took: 0.01m


In [4]:
train_portion = 0.9
train_len = math.floor(train_portion * len(dataset))
valid_len = len(dataset) - train_len

train_dataset, valid_dataset = random_split(dataset, [train_len, valid_len])

(len(train_dataset), len(valid_dataset))

(900, 100)

## Define the Model

In [35]:
# This encoder is modeled to be similar to the encoder
# described in the PyTorch tutorial:
# https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html

class Encoder(torch.nn.Module):
    def __init__(self, field, embedding_size, hidden_size, dropout_param):
        super().__init__()

        vocab = field.vocab
        pad_idx = vocab.stoi[field.pad_token]

        self.input_size = len(vocab.stoi)
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = len(vocab.stoi)
        self.dropout_param = dropout_param
        
        self.i2e = torch.nn.Embedding(self.input_size, embedding_size, padding_idx=pad_idx)
        self.e2h = torch.nn.Linear(embedding_size + hidden_size, hidden_size)
        self.e2o = torch.nn.Linear(embedding_size + hidden_size, self.output_size)
        self.oh2o = torch.nn.Linear(self.output_size + hidden_size, self.output_size)
        self.dropout = torch.nn.Dropout(dropout_param)
        self.softmax = torch.nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden_prev):
        X = self.i2e(input)
        X = torch.cat([X, hidden_prev], dim=1)
        hidden = self.e2h(X)
        output = self.e2o(X)
        output = torch.cat([output, hidden], dim=1)
        output = self.oh2o(output)
        output = self.dropout(output)

        return self.softmax(output), hidden

        
    def init_hidden(self, batch_size):
        # TODO: Not sure if this is the correct way to generalize
        # the hidden input to a large batch size. It feels like the
        # proper way may be to just concatenate <batch_size> copies of
        # the same hidden representation as columns.
        return torch.zeros(batch_size, self.hidden_size)


In [36]:
# The Decoder is modeled to look the similar to the encoder.

class Decoder(torch.nn.Module):
    def __init__(self, field, embedding_size, hidden_size, dropout_param):
        super().__init__()
        
        vocab = field.vocab
        pad_idx = vocab.stoi[field.pad_token]

        self.input_size = len(vocab.stoi)
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = len(vocab.stoi)
        self.dropout_param = dropout_param
        
        self.i2e = torch.nn.Embedding(self.input_size, embedding_size, padding_idx=pad_idx)
        self.e2h = torch.nn.Linear(embedding_size + hidden_size, hidden_size)
        self.e2o = torch.nn.Linear(embedding_size + hidden_size, self.output_size)
        self.oh2o = torch.nn.Linear(self.output_size + hidden_size, self.output_size)
        self.dropout = torch.nn.Dropout(dropout_param)
        self.softmax = torch.nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden_prev):
        X = self.i2e(input)
        X = torch.cat([X, hidden_prev], dim=1)
        hidden = self.e2h(X)
        output = self.e2o(X)
        output = torch.cat([output, hidden], dim=1)
        output = self.oh2o(output)
        output = self.dropout(output)

        return self.softmax(output), hidden

        
    def init_hidden(self, batch_size):
        # TODO: Not sure if this is the correct way to generalize
        # the hidden input to a large batch size. It feels like the
        # proper way may be to just concatenate <batch_size> copies of
        # the same hidden representation as columns.
        return torch.zeros(batch_size, self.hidden_size)


## Training

In [46]:
init_token_idx = FR.vocab.stoi[FR.init_token]

batch_size = 32
data_loader = DataLoader(train_dataset, batch_size=batch_size)

encoder = Encoder(EN, embedding_size=100, hidden_size=100, dropout_param=0.1)
decoder = Decoder(FR, embedding_size=100, hidden_size=100, dropout_param=0.1)

criterion = torch.nn.NLLLoss()

encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.01)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.01)

epochs = 10

for i in range(epochs):
    print(f'Epoch {i+1} / {epochs}')

    epoch_loss = 0

    for j, samples in enumerate(data_loader):
        loss = 0

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Step 1: Feed data through encoder.

        en_samples = samples[0]
        encoder_input_len = en_samples.size()[1]

        encoder_hidden = encoder.init_hidden(len(en_samples))

        for k in range(encoder_input_len):
            input = en_samples[:, k]
            output, encoder_hidden = encoder(input, encoder_hidden)

        # Step 2: Feed data through decoder.

        decoder_hidden = encoder_hidden

        # TODO: For now, only doing teacher forcing. Need to update this.
        fr_samples = samples[1]
        decoder_input_len = fr_samples.size()[1]
        
        for k in range(decoder_input_len - 1):
            input = fr_samples[:, k]
            output, decoder_hidden = decoder(input, decoder_hidden)
            expected = fr_samples[:, k+1]
            loss += criterion(output, expected)
            
        # Step 3: Back-Propagate
        
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        epoch_loss += loss.item()

        print(f'Epoch {i+1}, Batch {j+1}')
        print(f'Loss: {loss.item()}')

    print(f'Finished epoch {i+1}. Loss={epoch_loss}')


Epoch 1 / 10
Epoch 158, Batch 1
Loss: 1338.5963134765625
Epoch 158, Batch 2
Loss: 907.57470703125
Epoch 158, Batch 3
Loss: 12118.4052734375
Epoch 158, Batch 4
Loss: 610.9775390625
Epoch 158, Batch 5
Loss: 1047.3250732421875
Epoch 158, Batch 6
Loss: 1100.5404052734375
Epoch 158, Batch 7
Loss: 1288.9718017578125
Epoch 158, Batch 8
Loss: 2719368.75
Epoch 158, Batch 9
Loss: 1649.4808349609375
Epoch 158, Batch 10
Loss: 153870206500864.0
Epoch 158, Batch 11
Loss: 1310.01904296875
Epoch 158, Batch 12
Loss: 934136663506944.0
Epoch 158, Batch 13
Loss: 1482.5887451171875
Epoch 158, Batch 14
Loss: 2607.7294921875
Epoch 158, Batch 15
Loss: 2715.912841796875
Epoch 158, Batch 16
Loss: 2983.06005859375
Epoch 158, Batch 17
Loss: 3480.14501953125
Epoch 158, Batch 18
Loss: 535929.75
Epoch 158, Batch 19
Loss: 14988157952.0
Epoch 158, Batch 20
Loss: 139698274041856.0
Epoch 158, Batch 21
Loss: 5469.02197265625
Epoch 158, Batch 22
Loss: 1436717809664.0
Epoch 158, Batch 23
Loss: 1.897858004708806e+19
Epoch 1

KeyboardInterrupt: 