# Work in Progress

In [1]:
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [12]:
# data I/O
data = open('./data/train_corpus.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_char = { i:ch for i,ch in enumerate(chars) }

data has 858720 characters, 67 unique.


In [15]:
# BATCHED is WHY YOU COMING FASS
class RNN(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(RNN, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        self.Wxh = nn.Parameter(torch.randn(hidden_size, vocab_size) * 1e-2) # Xavier init
        self.Whh = nn.Parameter(torch.randn(hidden_size, hidden_size) * 1e-2)
        self.Why = nn.Parameter(torch.randn(vocab_size, hidden_size) * 1e-2)
        self.bh = nn.Parameter(torch.zeros(hidden_size, 1))
        self.by = nn.Parameter(torch.zeros(vocab_size, 1))
        
    def forward(self, input, hidden):
        hidden = torch.tanh(torch.mm(self.Wxh, input) + torch.mm(self.Whh, hidden + self.bh))
        output = torch.mm(self.Why, hidden) + self.by # unnormalized log probs
        output = torch.softmax(output, 0) # probs
        return output, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(hidden_size, batch_size)
    
# hyperparameters
hidden_size = 51 # size of hidden layer
seq_length = 23 # number of time steps per batch
lr = 1e-2
n_epochs = 100

def train(data, vocab_size, hidden_size, seq_length, lr, n_epochs, verbose=False):
    rnn = RNN(vocab_size, hidden_size)

    criterion = nn.NLLLoss()
    optimizer = optim.Adam(rnn.parameters(), lr=lr)
    batch_size = len(data) // seq_length
    for epoch in range(n_epochs):
        loss = 0
        rnn.zero_grad()
        hidden = rnn.init_hidden(batch_size)
        inputs = torch.zeros(seq_length, vocab_size, batch_size)
        targets = torch.zeros(seq_length, batch_size, dtype=torch.long)
        for batch_i in range(batch_size):
            s = data[batch_i*seq_length:(batch_i+1)*seq_length+1]
            for t in range(len(s) - 1):
                idx = char_to_idx[s[t]]
                inputs[t, idx, batch_i] = 1 # Set OHE for index at step t
                target_idx = char_to_idx[s[t+1]]
                targets[t, batch_i] = target_idx # Set target index at step t
        for t in range(seq_length):
            output, hidden = rnn(inputs[t], hidden)
            loss += criterion(torch.log(output).T, targets[t])
        loss /= seq_length
        loss.backward()
        optimizer.step()
        if epoch%(n_epochs//10)==0 or verbose: print("Epoch {}, Loss: {}".format(epoch, loss))
    return rnn

model = train(data, vocab_size, hidden_size, seq_length, lr, n_epochs, True)

Epoch 0, Loss: 4.203723907470703
Epoch 1, Loss: 4.188800811767578
Epoch 2, Loss: 4.158730983734131
Epoch 3, Loss: 4.088043689727783
Epoch 4, Loss: 3.8199455738067627
Epoch 5, Loss: 3.422121524810791
Epoch 6, Loss: 3.2545337677001953
Epoch 7, Loss: 3.180149793624878
Epoch 8, Loss: 3.13564133644104
Epoch 9, Loss: 3.116584062576294
Epoch 10, Loss: 3.12450909614563
Epoch 11, Loss: 3.128697156906128
Epoch 12, Loss: 3.1158785820007324
Epoch 13, Loss: 3.099794387817383
Epoch 14, Loss: 3.094857692718506
Epoch 15, Loss: 3.097886085510254
Epoch 16, Loss: 3.0977981090545654
Epoch 17, Loss: 3.0914549827575684
Epoch 18, Loss: 3.0842177867889404
Epoch 19, Loss: 3.0783193111419678
Epoch 20, Loss: 3.072036027908325
Epoch 21, Loss: 3.0666706562042236
Epoch 22, Loss: 3.0630154609680176
Epoch 23, Loss: 3.060438394546509
Epoch 24, Loss: 3.0571579933166504
Epoch 25, Loss: 3.051342487335205
Epoch 26, Loss: 3.044339418411255
Epoch 27, Loss: 3.038580894470215
Epoch 28, Loss: 3.0340278148651123
Epoch 29, Loss:

In [16]:
def test(data, model):
    offset = 2
    batch_size = 1
    seq_length = 100
    s = data[offset*seq_length:(offset+1)*seq_length]
    with torch.no_grad():
        hidden = model.init_hidden(batch_size)
        inputs = torch.zeros(seq_length, vocab_size, batch_size)
        targets = torch.zeros(seq_length, batch_size, dtype=torch.long)

        for t in range(seq_length - 1):
            idx = char_to_idx[s[t]]
            inputs[t, idx, 0] = 1 # Set OHE for index at step t
            target_idx = char_to_idx[s[t+1]]
            targets[t, 0] = target_idx # Set target index at step t

        print("Wanted: " + s)
        sys.stdout.write("Predic: " + s[0])
        for t in range(seq_length - 1):
            output, hidden = model(inputs[t], hidden)
            next_char = idx_to_char[int(output.argmax())]
            sys.stdout.write(next_char)
test(data, model)

Wanted: tinuation of "The Lord of the Rings" trilogy is so huge that a column of words can not adequately de
Predic: thntnn nnttn t ee te   an tee tent   teen n  tt a  ten  teen ante  n ean te    aentte  an   nn    t 