# Learning the sequence "HELLOHELLO" at once

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [2]:
# Vocabulary: a character's position in this list is its integer id.
idx2char = ['h', 'e', 'l', 'o']

# Inputs spell "hellohell"; targets spell "ellohello", i.e. the same text
# shifted one character ahead, so each target is the next character.
x_data = [0, 1, 2, 2, 3, 0, 1, 2, 2]
y_data = [1, 2, 2, 3, 0, 1, 2, 2, 3]

# Identity matrix over the vocabulary: row k is the one-hot vector of id k.
one_hot_lookup = [
    [1 if col == row else 0 for col in range(len(idx2char))]
    for row in range(len(idx2char))
]

# One-hot encode the inputs through the lookup table (a hand-rolled stepping
# stone towards Embedding layers).
x_data_ohe = list(map(one_hot_lookup.__getitem__, x_data))

In [3]:
# Turn the Python lists into tensors for the model and the loss.
# Plain tensors take part in autograd directly (PyTorch >= 0.4), so the
# deprecated `Variable` wrapper is not needed.
# inputs: float32, one one-hot row per timestep.
# labels: int64 class indices, the target type CrossEntropyLoss expects.
inputs = torch.tensor(x_data_ohe, dtype=torch.float32)
labels = torch.tensor(y_data, dtype=torch.long)

In [4]:
# The model keeps its hyperparameters on the instance so that forward() does not
# depend on notebook-level globals (i.e. on the order in which cells were run).
class RecurrentModelSeq(nn.Module):
    """Single `nn.RNN` that processes a whole sequence in one forward pass.

    There is no projection layer: the RNN's per-timestep hidden outputs are
    used directly as class scores, so `hidden_size` is expected to equal
    `num_classes`.

    Args:
        input_size: number of features per timestep (one-hot width).
        hidden_size: width of the RNN hidden state.
        num_classes: number of output classes; the output is reshaped to
            `(-1, num_classes)`.
        num_layers: number of stacked RNN layers.
        seq_len: number of timesteps in each input sequence.
        batch_size: number of sequences per forward pass.
    """

    def __init__(self, input_size, hidden_size, num_classes, num_layers, seq_len, batch_size):
        super(RecurrentModelSeq, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.batch_size = batch_size
        # num_layers must be forwarded to nn.RNN; otherwise the RNN always has one
        # layer and the (num_layers, batch, hidden) initial state built in
        # forward() mismatches it whenever num_layers > 1.
        self.rnn = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size,
                          num_layers=self.num_layers, batch_first=True)

    def forward(self, x):
        """Run the full sequence through the RNN.

        No hidden state is passed in: a zero initial hidden state is created
        here, the entire sequence is processed, and the outputs of all
        `seq_len` timesteps are returned.

        Args:
            x: tensor with `batch_size * seq_len * input_size` elements; it is
                viewed as `(batch_size, seq_len, input_size)`.

        Returns:
            Tensor of shape `(batch_size * seq_len, num_classes)` holding the
            RNN output for every timestep (given hidden_size == num_classes).
        """
        # Match the input's dtype/device so the model also works after
        # .double() or .to(device).
        init_hidden = torch.zeros(self.num_layers, self.batch_size, self.hidden_size,
                                  dtype=x.dtype, device=x.device)
        x = x.view(self.batch_size, self.seq_len, self.input_size)
        out, _ = self.rnn(x, init_hidden)
        # Use the stored attribute rather than a global; reshape (not view) because
        # the batch-first RNN output is not guaranteed to be contiguous.
        return out.reshape(-1, self.num_classes)

In [5]:
# Hyperparameters.
input_size = 4
# The hidden state is used directly as the class scores: the task is easy enough
# that no linear layer is needed to scale the hidden tensor to one-hot width.
hidden_size = 4
num_classes = 4
num_layers = 1
batch_size = 1
# This is where the change happens: the entire sequence is fed in at once.
seq_len = len(x_data)

# Build the model and show its layers.
rnn = RecurrentModelSeq(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
    num_layers=num_layers,
    seq_len=seq_len,
    batch_size=batch_size,
)
print(rnn)

RecurrentModelSeq(
  (rnn): RNN(4, 4, batch_first=True)
)


In [6]:
# Sanity check: show the one-hot encoded input sequence (one 4-element row per timestep).
print(x_data_ohe)

[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]


In [7]:
# Loss and optimizer. CrossEntropyLoss works on raw scores (it applies
# log-softmax internally), so the RNN outputs are passed to it unchanged.
loss_criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.1)
num_epochs = 200

for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = rnn(inputs)
    # From the docs for CrossEntropyLoss: the target holds one class index in
    # the range [0, C - 1] per sample, and the input is (N, C) raw scores.
    # Here N is the number of timesteps, so outputs is seq_len x num_classes
    # and labels is a 1D tensor of seq_len class indices.
    loss = loss_criterion(outputs, labels)
    if epoch == 0:
        print("What the outputs and labels look like fyi:")
        print(outputs, labels, loss)
    loss.backward()
    optimizer.step()
    # Predictions come from the outputs computed before this step's update.
    _, idx = outputs.max(1)
    # tolist() replaces the deprecated .data.numpy() and, unlike squeeze(),
    # still yields an iterable list when the sequence has a single timestep.
    result_str = [idx2char[char_id] for char_id in idx.tolist()]
    print("Epoch: {}, Loss: {}".format(epoch + 1, loss.item()))
    print("Predicted String: {}".format(result_str))
print("Done Learning!")

What the outputs and labels look like fyi:
tensor([[ 0.2730,  0.6032, -0.5920, -0.1158],
        [ 0.8027,  0.1555, -0.4976,  0.0528],
        [ 0.4603,  0.5085, -0.5793, -0.3680],
        [ 0.7289,  0.4557, -0.5172, -0.2288],
        [ 0.5213,  0.2105, -0.2454, -0.0831],
        [ 0.1726,  0.7436, -0.5053, -0.1020],
        [ 0.8324,  0.1171, -0.5249,  0.1372],
        [ 0.4117,  0.5180, -0.5895, -0.3806],
        [ 0.7415,  0.4444, -0.5175, -0.2216]], grad_fn=<ViewBackward>) tensor([1, 2, 2, 3, 0, 1, 2, 2, 3]) tensor(1.6543, grad_fn=<NllLossBackward>)
Epoch: 1, Loss: 1.6542643308639526
Predicted String: ['e', 'h', 'e', 'h', 'h', 'e', 'h', 'e', 'h']
Epoch: 2, Loss: 1.393038034439087
Predicted String: ['e', 'h', 'e', 'h', 'h', 'e', 'h', 'e', 'h']
Epoch: 3, Loss: 1.219570279121399
Predicted String: ['e', 'h', 'h', 'h', 'h', 'e', 'h', 'l', 'h']
Epoch: 4, Loss: 1.0956099033355713
Predicted String: ['e', 'h', 'l', 'h', 'h', 'e', 'h', 'l', 'h']
Epoch: 5, Loss: 0.9989227652549744
Predicted S