In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# Recurrent Neural Network (RNN)

Let's start off with a toy example using PyTorch. For now, we will ignore the details of what happens inside the RNN cell.

In [2]:
# Toy example: feed the word 'hello' through an RNN.
# Each distinct character gets a one-hot encoding, and the word itself
# is a sequence of 5 characters
seq_len = 5
h = [1, 0, 0, 0]
e = [0, 1, 0, 0]
l = [0, 0, 1, 0]
o = [0, 0, 0, 1]

# here we specify a single-layer RNN with the property of
# input_dim (4) -> output_dim (2)
# batch_first is explained in the following
rnn_cell = nn.RNN(input_size = 4, hidden_size = 2, batch_first = True)

# our input should be of shape
# (batch, seq_len, input_size) when batch_first=True;
# (seq_len, batch, input_size) when batch_first=False (default);
# input_size refers to the number of features per time step.
# Thus we reshape our input to the appropriate size, Tensor.view is
# equivalent to numpy.reshape
inputs = torch.tensor([h, e, l, l, o], dtype = torch.float32)
inputs = inputs.view(1, seq_len, -1)

# the hidden state (not the weights) is what gets passed along from
# one time step to the next, here we initialize some random values for it.
# Its shape is always (num_layers * num_directions, batch, hidden_size):
# batch_first only affects the input/output tensors, never the hidden state.
# Disregard the first dimension as of now, it is 1 for a single-layer,
# uni-directional RNN
hidden = torch.randn(1, 1, 2)
out, hidden = rnn_cell(inputs, hidden)
print('sequence input size', inputs.size(), 'out size', out.size())

sequence input size torch.Size([1, 5, 4]) out size torch.Size([1, 5, 2])


In the next section, we'll teach our RNN to produce "ihello" from "hihell".

In [3]:
# create an index to character mapping
idx2char = ['h', 'i', 'e', 'l', 'o']

# Teach hihell -> ihello; both sequences are stored as indices into idx2char
x_data = [[0, 1, 0, 2, 3, 3]]    # hihell
y_data = [1, 0, 2, 3, 3, 4]      # ihello

# one-hot encode the input indices: every index becomes a vector of
# length len(idx2char) with a single 1 at that index. Deriving it from
# x_data keeps the two representations from drifting apart
x_one_hot = [[[int(position == char_idx) for position in range(len(idx2char))]
              for char_idx in sequence]
             for sequence in x_data]

# As we have one batch of samples, we convert them to tensors only once;
# the inputs are floats, whereas nn.CrossEntropyLoss expects the class
# labels to be integers (long)
inputs = torch.tensor(x_one_hot, dtype = torch.float32)
labels = torch.tensor(y_data, dtype = torch.long)


# hyperparameters
seq_len = len(y_data)        # |ihello| == 6
input_size = len(idx2char)   # one-hot size
batch_size = len(x_data)     # one sentence
num_layers = 1               # one-layer rnn
num_classes = len(idx2char)  # predicting 5 distinct character
hidden_size = 4              # output from the RNN


class RNN(nn.Module):
    """
    The RNN model will be a RNN followed by a linear layer,
    i.e. a fully-connected layer, that turns the RNN output of
    every time step into class scores.

    Parameters
    ----------
    seq_len : int
        Number of time steps of every input sequence.

    num_classes : int
        Number of classes scored at every time step.

    input_size : int
        Number of features per time step.

    hidden_size : int
        Size of the RNN's hidden state.

    num_layers : int
        Number of stacked recurrent layers.
    """
    def __init__(self, seq_len, num_classes, input_size, hidden_size, num_layers):
        super().__init__()
        self.seq_len = seq_len
        self.num_layers = num_layers
        self.input_size = input_size
        self.num_classes = num_classes
        self.hidden_size = hidden_size

        # num_layers must be forwarded to nn.RNN, otherwise the layer
        # silently stays single-layer whatever the caller asked for
        self.rnn = nn.RNN(input_size, hidden_size,
                          num_layers = num_layers, batch_first = True)
        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """
        Compute the class scores of every time step.

        Parameters
        ----------
        x : torch.Tensor
            Input that can be viewed as (batch, seq_len, input_size).

        Returns
        -------
        linear_out : torch.Tensor
            Unnormalized scores of shape (batch * seq_len, num_classes),
            the rows are ordered by sample first, then by time step.
        """
        # assuming batch_first = True for RNN cells
        batch_size = x.size(0)
        hidden = self._init_hidden(batch_size)
        x = x.view(batch_size, self.seq_len, self.input_size)

        # apart from the output, rnn also gives us the hidden
        # state, this gives us the opportunity to pass it to
        # the next cell if needed; we won't be needing it here
        # because the nn.RNN already computed all the time steps
        # for us
        rnn_out, _ = self.rnn(x, hidden)

        # use the size stored on the module rather than a global variable;
        # the batch_first output may not be contiguous in memory when
        # batch > 1, hence .contiguous() before flattening with .view
        rnn_out = rnn_out.contiguous().view(-1, self.hidden_size)
        linear_out = self.linear(rnn_out)
        return linear_out

    def _init_hidden(self, batch_size):
        """
        Initialize the hidden state with zeros.

        The hidden state's shape is (num_layers, batch, hidden_size)
        for a uni-directional RNN; batch_first = True only changes the
        layout of the input/output, not of the hidden state.
        """
        # create the state from an existing parameter so it shares
        # the model's device and dtype
        reference = next(self.parameters())
        hidden = reference.new_zeros(
            self.num_layers, batch_size, self.hidden_size)
        return hidden

In [4]:
# Set loss, optimizer and the RNN model; the seed is fixed before the
# model is created so the random weight initialization is reproducible
torch.manual_seed(777)
rnn = RNN(seq_len, num_classes, input_size, hidden_size, num_layers)
print(rnn)

# train the model
num_epochs = 15
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr = 0.1)
for epoch in range(1, num_epochs + 1):
    optimizer.zero_grad()
    outputs = rnn(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # check the current predicted string: the predicted character of
    # every time step is the class with the highest score
    idx = outputs.argmax(dim = 1).numpy()
    result_str = [idx2char[c] for c in idx]

    # the loss is a 0-dimensional tensor, .item() converts it to a python
    # float (indexing it with [0] raises an error for PyTorch >= 0.4)
    print('epoch: {}, loss: {:1.3f}'.format(epoch, loss.item()))
    print('Predicted string: ', ''.join(result_str))

RNN(
  (rnn): RNN(5, 4, batch_first=True)
  (linear): Linear(in_features=4, out_features=5)
)
epoch: 1, loss: 1.756
Predicted string:  eeeeee
epoch: 2, loss: 1.626
Predicted string:  ehhhhh
epoch: 3, loss: 1.485
Predicted string:  elllll
epoch: 4, loss: 1.405
Predicted string:  llllll
epoch: 5, loss: 1.293
Predicted string:  illlll
epoch: 6, loss: 1.217
Predicted string:  iiilll
epoch: 7, loss: 1.057
Predicted string:  iollll
epoch: 8, loss: 0.967
Predicted string:  ielllo
epoch: 9, loss: 0.837
Predicted string:  ihlllo
epoch: 10, loss: 0.696
Predicted string:  ihello
epoch: 11, loss: 0.615
Predicted string:  ihello
epoch: 12, loss: 0.535
Predicted string:  ihhllo
epoch: 13, loss: 0.452
Predicted string:  ihhllo
epoch: 14, loss: 0.387
Predicted string:  ihello
epoch: 15, loss: 0.322
Predicted string:  ihello


For those interested, the following blog post implements an RNN from scratch in numpy: [Blog: Recurrent Neural Networks Tutorial, Part 2 – Implementing a RNN with Python, Numpy and Theano](http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/)

# Reference

- [Github: Simple PyTorch Tutorials Zero to ALL!](https://github.com/hunkim/PyTorchZeroToAll)