# Recurrent Neural Networks for text generation - character models

The first kind of models we'll work with today are character models. The LSTM will be fed raw text, character by character. It will then try to predict what the next character will be.

In [None]:
# imports
import numpy as np
import sys
import torch
import torch.nn as nn
from torch.autograd import Variable
# Choose the tensor type used throughout the notebook: CUDA floats when a GPU
# is available, plain CPU floats otherwise.
if torch.cuda.is_available():
    dtype = torch.cuda.FloatTensor
else:
    dtype = torch.FloatTensor
dtype

First, let's load the data we want to play with. By default, we use the text of the Polish epic poem *Pan Tadeusz*, downloaded from Wolne Lektury (wolnelektury.pl):

In [None]:
# Create the data directory (-p: no error if it already exists, so the cell
# can be re-run) and download the corpus into it.
!mkdir -p data
!wget -O data/tadeusz.txt https://wolnelektury.pl/media/book/txt/pan-tadeusz.txt

The text looks like this:

In [None]:
# The corpus is Polish text with non-ASCII letters. Decode it explicitly as
# UTF-8 (presumably the encoding Wolne Lektury serves; verify if decoding
# fails) instead of relying on the platform's default encoding.
with open('data/tadeusz.txt', encoding='utf-8') as f:
    lines_txt = f.readlines()
lines_txt

We need to split it into single characters, though:

In [None]:
# Flatten the list of lines into one flat list of single characters
# (line-ending characters are kept, since readlines() does not strip them).
chars_txt = [char for line in lines_txt for char in line]
chars_txt

We also need to translate the characters into something interpretable by a neural network. We'll use one-hot vectors for that purpose.

In [None]:
# First, collect the vocabulary: every distinct character, in sorted order.
chars = sorted(set(chars_txt))

# Two-way lookup between a character and its index in the vocabulary.
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}

# One-hot encoding: a zero vector of vocabulary length with a single 1 at the
# character's index.
char_to_onehot = {}
for c, i in char_to_int.items():
    onehot = np.zeros(len(chars))
    onehot[i] = 1
    char_to_onehot[c] = onehot

len(chars), len(chars_txt)

Our data will be character sequences. As the model has to predict the next character in a sequence, the target data will be just that:

In [None]:
sequence_length = 100
X_l = []
Y_l = []
# Slide a window of `sequence_length` characters over the text, one character
# at a time. Each window (one-hot encoded) is an input; the character that
# directly follows it (as a vocabulary index) is the target.
for start in range(len(chars_txt) - sequence_length):
    end = start + sequence_length
    X_l.append([char_to_onehot[ch] for ch in chars_txt[start:end]])
    Y_l.append(char_to_int[chars_txt[end]])
len(X_l)

In order to transform the data into PyTorch Variables, we'll transform it into np.arrays first. Notice that we only use the first 100k sequences: using too many sequences risks running out of memory!

We'll also split the data into training and test sets here:

In [None]:
# Cap on the number of sequences turned into a dense array: each sequence
# expands to sequence_length * len(chars) floats, so converting all of them
# risks running out of memory.
max_sequences = 100000

X = np.reshape(X_l[:max_sequences], (-1, sequence_length, len(chars)))
# Slice the targets with the same cap so X and Y always have matching length
# (and no unused targets are copied).
Y = np.array(Y_l[:max_sequences])
data_size = X.shape[0]

# Shuffle the example indices reproducibly, then split 70% / 30% into
# training and test sets.
idx = np.arange(data_size)
np.random.seed(0)
np.random.shuffle(idx)
train_size = int(data_size * 0.7)

train_idx = idx[:train_size]
test_idx = idx[train_size:]

X_train = X[train_idx]
Y_train = Y[train_idx]

X_test = X[test_idx]
Y_test = Y[test_idx]

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

Finally, let's wrap the data into PyTorch Variables:

In [None]:
def _to_variable(array, as_index=False):
    """Wrap a NumPy array in a Variable of the notebook-wide tensor type.

    With ``as_index=True`` the tensor is additionally converted to long, the
    form in which the class-index targets are stored.
    """
    tensor = torch.Tensor(array).type(dtype)
    if as_index:
        tensor = tensor.long()
    return Variable(tensor)


# Inputs stay floating point (one-hot vectors); targets become long indices.
X_train_var = _to_variable(X_train)
Y_train_var = _to_variable(Y_train, as_index=True)

X_test_var = _to_variable(X_test)
Y_test_var = _to_variable(Y_test, as_index=True)

X_train_var.size()

Next, we can define the model. It will be a very shallow network consisting of just 3 layers. It's enough for our purpose, though!

In [None]:
class CharacterModel(nn.Module):
    """Stacked LSTM that scores the next character of a one-hot sequence.

    The recurrent state is kept on the instance (``self.hidden``) and is
    reused by the next ``forward`` call unless ``init_hidden`` is called
    in between.
    """

    def __init__(self, hidden_dim, lstm_layers_no=3, vocab_size=len(chars)):
        """Build the layers.

        hidden_dim:     size of the LSTM hidden state.
        lstm_layers_no: number of stacked LSTM layers.
        vocab_size:     length of the one-hot input vectors and number of
                        output scores.
        """
        super(CharacterModel, self).__init__()
        self.lstm_layers_no = lstm_layers_no
        self.hidden_dim = hidden_dim
        # Recurrent stack, with dropout between the stacked LSTM layers.
        self.lstm_layer = nn.LSTM(vocab_size, hidden_dim, lstm_layers_no, dropout=0.2)
        # Dropout applied to the hidden state before the output projection.
        self.dropout_layer = nn.Dropout(0.2)
        # Projection from the hidden state to one score per character.
        self.vec2token = nn.Linear(hidden_dim, vocab_size)
        self.init_hidden()

    def init_hidden(self, batch_size=1):
        """Reset the stored state to zeros for `batch_size` sequences and return it."""
        state_shape = (self.lstm_layers_no, batch_size, self.hidden_dim)
        hidden_state = Variable(torch.zeros(*state_shape).type(dtype))
        cell_state = Variable(torch.zeros(*state_shape).type(dtype))
        self.hidden = (hidden_state, cell_state)
        return self.hidden

    def forward(self, sequence):
        """Return character scores of shape (batch, vocab_size).

        `sequence` is indexed (batch, position, vocab); the LSTM expects the
        position axis first, so the first two axes are swapped.
        """
        time_major = sequence.permute(1, 0, 2)
        _, self.hidden = self.lstm_layer(time_major, self.hidden)
        # Final hidden state of the top-most LSTM layer.
        top_layer_state = self.hidden[0][self.lstm_layers_no - 1]
        return self.vec2token(self.dropout_layer(top_layer_state))


Having defined the model, we can initialize it. Feel free to play with the hyperparameters!

In [None]:
# Hyperparameters of the recurrent part of the network.
lstm_layers_no = 3
hidden_dim = 256

# Build the network and convert its parameters to the selected tensor type.
model = CharacterModel(hidden_dim, lstm_layers_no=lstm_layers_no)
model = model.type(dtype)

# Cross-entropy between the predicted character scores and the index of the
# true next character; Adam updates all model parameters.
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
model

On to training!

In [None]:
batch_size = 32
test_batch_size = 32
epochs_no = 100
# Number of randomly drawn mini-batches that make up one "epoch".
batches_per_epoch = 100

# One (mean training loss, test loss) pair per epoch.
loss_history = []
for epoch in range(epochs_no):
    model.train()
    train_losses_l = []
    for _ in range(batches_per_epoch):

        model.zero_grad()
        # Every sequence in the batch is an independent sample, so start
        # from a zero recurrent state.
        model.init_hidden(batch_size)

        # Draw a random mini-batch (indices sampled with replacement).
        idx = torch.Tensor(np.random.randint(X_train_var.size()[0], size=batch_size)).type(dtype).long()
        sequence_in = X_train_var[idx]

        targets = Y_train_var[idx]
        tag_scores = model(sequence_in)
        loss = loss_fun(tag_scores, targets)
        loss.backward()
        optimizer.step()

        # .sum() extracts the scalar whether the loss is stored as a
        # one-element or a zero-dimensional array (indexing with [0] fails
        # on the latter); it also matches how the test loss is read below.
        train_losses_l.append(loss.data.cpu().numpy().sum())

    # Evaluate on one random test mini-batch with dropout disabled.
    model.eval()
    model.init_hidden(test_batch_size)

    test_idx = torch.Tensor(np.random.randint(X_test_var.size()[0], size=test_batch_size)).type(dtype).long()
    test_sequence_in = X_test_var[test_idx]
    test_targets = Y_test_var[test_idx]
    test_tag_scores = model(test_sequence_in)
    test_loss = loss_fun(test_tag_scores, test_targets).data.cpu().numpy().sum()
    train_losses = np.array(train_losses_l)

    loss_history.append((train_losses.mean(), test_loss))

    print(epoch, loss_history[-1] )



Seeing the losses decrease is one thing. We know the model is getting *something* more and more right. However, in the case of RNNs there is a simple, cool way to see that *something* for oneself!

We can see the model in action by sampling from it. The model will make a prediction based on some starting sequence. We'll then 'cut off' the first element of the sequence and append the prediction to it. Then we repeat the process and generate as much text as we want.

Sampling function:

In [None]:
def sample_from_model(seq_in):
    """Sample the index of the next character from the model's prediction.

    seq_in: a batch of one-hot encoded sequences, indexed
            (batch, position, vocab); only the prediction for the first
            sequence in the batch is used.

    Returns the sampled vocabulary index as a plain int.
    """
    # Convert the nested list to a single array first, as is done for the
    # training data, instead of building the tensor element by element.
    seq_var = Variable(torch.Tensor(np.asarray(seq_in)).type(dtype))
    out = model(seq_var)
    probs = nn.functional.softmax(out, dim=1).data.cpu().numpy()[0]
    # The softmax output is single precision, so its sum can differ from 1 by
    # more than np.random.choice tolerates; renormalise in double precision.
    probs = probs.astype(np.float64)
    probs /= probs.sum()
    # To make things less deterministic, instead of taking the character with
    # the highest probability, sample from all characters according to the
    # predicted probability distribution.
    chosen = np.random.choice(probs.shape[0], p=probs)
    return int(chosen)

Generating function:

In [None]:
def generate(start_seq, seq_len=1000):
    """Write `start_seq` followed by `seq_len` sampled characters to stdout.

    start_seq: non-empty seed text; every character must be a key of
               char_to_onehot.
    seq_len:   number of characters to generate.

    Raises ValueError if `start_seq` is empty.
    """
    if not start_seq:
        raise ValueError("start_seq must contain at least one character")

    sys.stdout.write(start_seq)
    # The model input is a window of the most recent characters, capped at
    # `sequence_length`, the window size used to build the training data.
    window = [char_to_onehot[t] for t in start_seq][-sequence_length:]

    for _ in range(seq_len):
        # The whole window is fed again on every step, so start from a zero
        # state each time (as in training) rather than carrying over a state
        # that has already seen most of these characters.
        model.init_hidden()
        next_int = sample_from_model([window])
        next_token = int_to_char[next_int]
        sys.stdout.write(next_token)
        # Append the prediction and drop the oldest character once the window
        # is full.
        window = (window + [char_to_onehot[next_token]])[-sequence_length:]

I want the generated text to start with me saying something:

In [None]:
# Seed text for generation; generate() looks up every one of its characters
# in char_to_onehot.
start_sequence = 'Litwo!\n'

In [None]:
# Put the model in evaluation mode before sampling, then print the seed
# followed by the generated characters.
model.eval()
generate(start_sequence)