# Recurrent Neural Networks for text generation - character models

The first kind of models we'll work with today are character models. The LSTM will be fed raw text, character by character. It will then try to predict what the next character will be.

In [None]:
# imports
import numpy as np
import sys
import torch
import torch.nn as nn
from torch.autograd import Variable
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
dtype

First, let's load the data we want to play with. By default, you can play with a sample from Shakespeare's works as gathered by Andrej Karpathy:

In [None]:
!mkdir da

In [12]:
!mkdi
with open('data/tiny_shakespeare.txt') as f:
    lines_txt = f.readlines()
lines_txt

['First Citizen:\n',
 'Before we proceed any further, hear me speak.\n',
 '\n',
 'All:\n',
 'Speak, speak.\n',
 '\n',
 'First Citizen:\n',
 'You are all resolved rather to die than to famish?\n',
 '\n',
 'All:\n',
 'Resolved. resolved.\n',
 '\n',
 'First Citizen:\n',
 'First, you know Caius Marcius is chief enemy to the people.\n',
 '\n',
 'All:\n',
 "We know't, we know't.\n",
 '\n',
 'First Citizen:\n',
 "Let us kill him, and we'll have corn at our own price.\n",
 "Is't a verdict?\n",
 '\n',
 'All:\n',
 "No more talking on't; let it be done: away, away!\n",
 '\n',
 'Second Citizen:\n',
 'One word, good citizens.\n',
 '\n',
 'First Citizen:\n',
 'We are accounted poor citizens, the patricians good.\n',
 'What authority surfeits on would relieve us: if they\n',
 'would yield us but the superfluity, while it were\n',
 'wholesome, we might guess they relieved us humanely;\n',
 'but they think we are too dear: the leanness that\n',
 'afflicts us, the object of our misery, is as an\n',
 'in

In [17]:
chars_txt = []
for line in lines_txt:
    chars_txt.extend(line[:])
chars_txt
chars = sorted(list(set(chars_txt)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
char_to_onehot = {}
for i, c in enumerate(chars):
    char_to_onehot[c] = np.zeros(len(chars))
    char_to_onehot[c][i] = 1

len(chars), len(chars_txt)

(65, 1115393)

In [18]:
sequence_length = 100
X_l = []
Y_l = []
for i in range(0, len(chars_txt) - sequence_length, 1):
	seq_in = chars_txt[i:i + sequence_length]
	seq_out = chars_txt[i + sequence_length]
	X_l.append([char_to_onehot[char] for char in seq_in])
	Y_l.append(char_to_int[seq_out])
len(X_l)

1115293

In [19]:
X = np.reshape(X_l[:100000], (-1, sequence_length, len(chars)))
# X = X / len(chars)

Y = np.array(Y_l)
data_size = X.shape[0]
idx = np.arange(data_size)
np.random.seed(0)
np.random.shuffle(idx)
train_size = int(data_size * 0.7)
# test_size = int(data_size * 0.03)

train_idx = idx[:train_size]
test_idx = idx[train_size:]

X_train = X[train_idx]
Y_train = Y[train_idx]

X_test = X[test_idx]
Y_test = Y[test_idx]

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((70000, 100, 65), (70000,), (30000, 100, 65), (30000,))

In [20]:
X_train_var = Variable(torch.Tensor(X_train).type(dtype))
Y_train_var = Variable(torch.Tensor(Y_train).type(dtype).long())

X_test_var = Variable(torch.Tensor(X_test).type(dtype))
Y_test_var = Variable(torch.Tensor(Y_test).type(dtype).long())

X_train_var.size()

torch.Size([70000, 100, 65])

In [21]:
class CharacterModel(nn.Module):
    def __init__(self, hidden_dim, lstm_layers_no=3, vocab_size=len(chars)):
        super(CharacterModel, self).__init__()
        self.lstm_layers_no = lstm_layers_no
        self.hidden_dim = hidden_dim
        self.lstm_layer = nn.LSTM(vocab_size, hidden_dim, lstm_layers_no, dropout=0.2)
        self.dropout_layer = nn.Dropout(0.2)
        self.vec2token = nn.Linear(hidden_dim, vocab_size)
        self.init_hidden()
        
    def init_hidden(self, batch_size=1):
        self.hidden = (Variable(torch.zeros(self.lstm_layers_no, batch_size, self.hidden_dim).type(dtype)),
                Variable(torch.zeros(self.lstm_layers_no, batch_size, self.hidden_dim).type(dtype)))
        return self.hidden
    
    def forward(self, sequence):
        lstm_input = sequence.permute(1, 0, 2)
        lstm_out, self.hidden = self.lstm_layer(lstm_input, self.hidden)        
        tags = self.vec2token(self.dropout_layer(self.hidden[0][self.lstm_layers_no-1]))
        return tags


In [23]:
hidden_dim = 256
lstm_layers_no = 3

model = CharacterModel(hidden_dim, lstm_layers_no=lstm_layers_no).type(dtype)
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
model

CharacterModel(
  (lstm_layer): LSTM(65, 256, num_layers=3, dropout=0.2)
  (dropout_layer): Dropout(p=0.2)
  (vec2token): Linear(in_features=256, out_features=65, bias=True)
)

In [None]:
batch_size = 32
test_batch_size = 32

loss_history = []
for epoch in range(100):
    model.train()
    train_losses_l = []
    for i in range(100):

        model.zero_grad()
        model.init_hidden(batch_size)
        
        idx = torch.Tensor(np.random.randint(X_train_var.size()[0], size=batch_size)).type(dtype).long()
        sequence_in = X_train_var[idx]
        
        targets = Y_train_var[idx]
        tag_scores = model(sequence_in)
        loss = loss_fun(tag_scores, targets)
        loss.backward()
        optimizer.step()
        
        train_losses_l.append(loss.data.cpu().numpy()[0])
        
    model.eval()
    model.init_hidden(test_batch_size)
    
    test_idx = torch.Tensor(np.random.randint(X_test_var.size()[0], size=test_batch_size)).type(dtype).long()
    test_sequence_in = X_test_var[test_idx]
    test_targets = Y_test_var[test_idx]
    test_tag_scores = model(test_sequence_in)
    test_loss = loss_fun(test_tag_scores, test_targets).data.cpu().numpy().sum()
    train_losses = np.array(train_losses_l)

    loss_history.append((train_losses.mean(), test_loss))

    print(epoch, loss_history[-1] )



0 (3.4863608, 3.646985)
1 (3.339032, 3.341372)
2 (3.337944, 3.3679142)
3 (3.3464468, 3.3090913)
4 (3.3029232, 2.8658276)
5 (3.0570211, 2.7868025)
6 (2.8285668, 2.6766706)
7 (2.7017505, 2.558306)
8 (2.6464465, 2.5052142)
9 (2.6011212, 2.6455219)
10 (2.5368218, 2.5493386)
11 (2.4417045, 2.270588)
12 (2.4761324, 2.4144309)
13 (2.4376059, 2.084959)
14 (2.3785481, 2.4059052)
15 (2.3337333, 1.980797)
16 (2.279416, 2.2035859)
17 (2.265589, 1.9688361)
18 (2.2018619, 2.124069)
19 (2.2165895, 2.4075263)
20 (2.2173326, 2.1571362)
21 (2.1687315, 2.3965766)
22 (2.1633942, 2.459287)
23 (2.144189, 2.4964976)
24 (2.0788395, 2.4067836)
25 (2.0929406, 2.112423)
26 (2.0702775, 2.0118032)
27 (2.005149, 2.2715619)
28 (1.9768362, 2.2246041)
29 (1.9888457, 1.8950909)
30 (1.9543633, 1.8806186)
31 (1.9185491, 1.5953488)
32 (1.9360341, 2.1984751)
33 (1.9791597, 1.9131256)


In [None]:
def sample_from_model(seq_in):
    seq_var = Variable(torch.Tensor(seq_in).type(dtype))
    out = model(seq_var)
    probs = nn.functional.softmax(out, dim=1).data.cpu().numpy()[0]
    chosen = np.random.choice(np.arange(probs.shape[0]), p=probs)
#     chosen = probs.argmax()
    return int(chosen)

In [None]:
def generate(start_seq, seq_len=1000):
    model.hidden = model.init_hidden()
    sys.stdout.write(start_seq)
    seq = [[char_to_onehot[t] for t in start_seq]]
    
    for _ in range(seq_len):
        next_int = sample_from_model(seq)
        next_token = int_to_char[next_int]
        sys.stdout.write(next_token)
        seq = [seq[0][1:] + [char_to_onehot[int_to_char[next_int]]]]

In [None]:
xD = 'ROMEO:'

In [None]:
model.hidden = model.init_hidden()
model.eval()
generate(xD)