# Recurrent Neural Networks - word model

In [None]:
from os import listdir
from os.path import isfile, join
import string
import re
import numpy as np
import matplotlib.pyplot as plt
import sys
import torch
import torch.nn as nn
from torch.autograd import Variable
# Tensor type used throughout the notebook: CUDA floats when a GPU is
# available, plain CPU floats otherwise.
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
dtype

In this example, we'll play with a model that works with whole words. We'll try to learn from, and generate, songs by Bob Dylan!

In [None]:
# Create the target directories for the dataset.
!mkdir data
!mkdir data/lyrics
# Download the lyrics archive to data/lyrics.zip.
!wget -O data/lyrics.zip http://ujeb.se/fdGfO
# Extract it into data/lyrics; `yes` feeds "y" to any prompt unzip shows.
!yes |unzip data/lyrics.zip -d data/lyrics
# List the extracted files.
!ls data/lyrics

In [None]:
# Folder that holds the lyrics files.
lyrics_path = 'data/lyrics'
# Names of the regular files in that folder (sub-directories are skipped).
files = list(filter(lambda name: isfile(join(lyrics_path, name)), listdir(lyrics_path)))
len(files)

In [None]:
def song_from_file(file, lyrics_path=lyrics_path):
    """Read one lyrics file and return its contents as a list of lines.

    Args:
        file: file name, relative to ``lyrics_path``.
        lyrics_path: directory containing the file.

    Returns:
        The lines of the file, each keeping its trailing newline.
    """
    song_path = join(lyrics_path, file)
    with open(song_path) as handle:
        lines = handle.readlines()
    return lines

In order to show the model more explicitly where songs start and end, and where lines break, we'll introduce a couple of 'special' tokens:

In [None]:
# Special marker tokens: song start, song end, out-of-vocabulary word, line break.
start_t, end_t, unk_t, newline_t = '<START>', '<END>', '<UNK>', '<NEWL>'

The below function will split the lyrics into text tokens:

In [None]:
def song_to_tokens(song_lines):
    """Turn the lines of one song into a flat list of tokens.

    Every character that is neither a word character nor whitespace is
    removed, and the remaining words are lower-cased. The result starts
    with ``start_t``, has ``newline_t`` after the words of each input line,
    and ends with ``end_t``.

    Args:
        song_lines: iterable of strings, one per lyric line.

    Returns:
        List of string tokens.
    """
    tokens = [start_t]
    for raw_line in song_lines:
        cleaned = re.sub(r'[^\w\s]', '', raw_line)
        tokens += [word.lower() for word in cleaned.split()]
        tokens.append(newline_t)
    tokens.append(end_t)
    return tokens

In [None]:
# Sanity check: read the first listed file and display its token list.
s = song_from_file(files[0])
song_to_tokens(s)

In [None]:
# Read every file as raw lines, then tokenise each song.
songs = list(map(song_from_file, files))
songs_tokens = list(map(song_to_tokens, songs))
# Concatenate the per-song token lists into one flat corpus.
all_tokens = []
for t in songs_tokens:
    all_tokens += t
len(all_tokens)

All songs consist of over 140k tokens. How many distinct tokens do we have, though?

In [None]:
# Sorted list of the distinct tokens in the corpus.
sorted_tokens_all = sorted(set(all_tokens))
# Two-way lookup between a token and its position in that sorted list.
token_to_int_all = {t: i for i, t in enumerate(sorted_tokens_all)}
int_to_token_all = dict(enumerate(sorted_tokens_all))
len(token_to_int_all), len(int_to_token_all)

Whew! That's a lot! Let's count how many times each token appears in the lyrics.

In [None]:
# Start every distinct token at zero, then tally each occurrence in the corpus.
token_count_all = dict.fromkeys(sorted_tokens_all, 0)
for t in all_tokens:
    token_count_all[t] = token_count_all[t] + 1

In [None]:
# Histogram of per-token occurrence counts, limited to the 0-20 range.
counts = [*token_count_all.values()]
x_min, x_max = 0, 20
plt.hist(counts, range=(x_min, x_max))
plt.show()

Nearly half of the tokens appear just once across all of Bob Dylan's lyrics! As those tokens are so rare, we'll drop them from our vocabulary - they will be represented by the `<UNK>` token. This will make things much easier for the network, as it will have 5k fewer classes to choose from.

In [None]:
# A token must occur more than this many times to stay in the vocabulary.
vocab_threshold = 1

# Index 0 is the unknown-word token; the kept tokens follow in sorted order.
sorted_tokens = [unk_t]
sorted_tokens.extend(t for t in sorted_tokens_all if token_count_all[t] > vocab_threshold)

# Two-way lookup between a vocabulary token and its integer id.
token_to_int = {t: i for i, t in enumerate(sorted_tokens)}
int_to_token = dict(enumerate(sorted_tokens))

print(sorted_tokens[0])
# Vocabulary sizes, followed by whether each special token made it in.
(len(token_to_int),
 len(int_to_token),
 start_t in sorted_tokens,
 end_t in sorted_tokens,
 unk_t in sorted_tokens,
 newline_t in sorted_tokens)


In [None]:
# Re-count occurrences over the reduced vocabulary: any token that is not a
# vocabulary entry is tallied under the unknown-word token instead.
token_count = dict((t, 0) for t in sorted_tokens)
for t in all_tokens:
    try:
        token_count[t] += 1
    except KeyError:
        # Only a missing key means "out of vocabulary"; any other error
        # should surface rather than be silently counted as <UNK>.
        token_count[unk_t] += 1

In [None]:
# Same histogram as before, now over the reduced vocabulary.
counts = [*token_count.values()]
x_min, x_max = 0, 20
plt.hist(counts, range=(x_min, x_max))
plt.show()

Now for every OOV (out-of-vocabulary) word in the data, the `<UNK>` token will be assigned to it.

In [None]:
# Number of consecutive tokens the model sees before predicting the next one.
sequence_length = 30
X_l = []
Y_l = []

for song in songs_tokens:
    # Encode the song once; out-of-vocabulary tokens map to the <UNK> id.
    song_ids = [token_to_int.get(token, token_to_int[unk_t]) for token in song]
    # Slide a window over the song: the window is the input and the token
    # right after it is the target. Windows never cross song boundaries.
    for i in range(len(song_ids) - sequence_length):
        X_l.append(song_ids[i: i + sequence_length])
        Y_l.append(song_ids[i + sequence_length])

len(X_l)

Just like before, let's wrap the data into np.arrays and then PyTorch Variables:

In [None]:
# Stack the windows into an (n_samples, sequence_length) array with matching targets.
X = np.array(X_l).reshape(-1, sequence_length)
Y = np.array(Y_l)
data_size = X.shape[0]

# Shuffle the sample indices with a fixed seed so the split is reproducible.
idx = np.arange(data_size)
np.random.seed(0)
np.random.shuffle(idx)

# The first 90% of the shuffled indices are used for training; the rest are held out.
train_size = int(data_size * 0.9)
train_idx, test_idx = idx[:train_size], idx[train_size:]

X_train, Y_train = X[train_idx], Y[train_idx]
X_test, Y_test = X[test_idx], Y[test_idx]

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape,

In [None]:
# Wrap each split as a Variable of integer (long) token ids; the data is
# first converted to `dtype` and then cast to long.
X_train_var = Variable(torch.Tensor(X_train).type(dtype).long())
Y_train_var = Variable(torch.Tensor(Y_train).type(dtype).long())

X_test_var = Variable(torch.Tensor(X_test).type(dtype).long())
Y_test_var = Variable(torch.Tensor(Y_test).type(dtype).long())

X_train_var.size()

Now we can define the model. It will be very similar to the previously used character model. The only major change is the word embedding layer at the top, which gradually learns vector representations of input tokens.

In [None]:
class WordNN(nn.Module):
    """Word-level model: embedding -> stacked LSTM -> dropout -> linear scores.

    Args:
        embed_dim: size of each token embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of entries in the embedding table.
        targets_dim: number of scores produced for each input sequence.
        lstm_layers_no: number of stacked LSTM layers.
    """

    def __init__(self, embed_dim, hidden_dim, vocab_size, targets_dim, lstm_layers_no=3):
        super(WordNN, self).__init__()
        self.lstm_layers_no = lstm_layers_no
        self.hidden_dim = hidden_dim
        self.embed_layer = nn.Embedding(vocab_size, embed_dim)
        self.lstm_layer = nn.LSTM(embed_dim, hidden_dim, lstm_layers_no, dropout=0.3)
        self.dropout_layer = nn.Dropout(0.3)
        self.vec2token = nn.Linear(hidden_dim, targets_dim)
        # init_hidden() stores AND returns the state, so this assignment
        # keeps the fresh zero state instead of overwriting it with None.
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=1):
        """Reset the LSTM state to zeros for ``batch_size`` parallel sequences.

        The state is stored on ``self.hidden`` and also returned.

        Returns:
            Tuple ``(h_0, c_0)``, each of shape
            ``(lstm_layers_no, batch_size, hidden_dim)``.
        """
        state_shape = (self.lstm_layers_no, batch_size, self.hidden_dim)
        self.hidden = (Variable(torch.zeros(*state_shape).type(dtype)),
                       Variable(torch.zeros(*state_shape).type(dtype)))
        return self.hidden

    def forward(self, sequence):
        """Score the next token for each sequence in the batch.

        Args:
            sequence: integer token ids laid out as ``(batch, seq_len)``.

        Returns:
            Unnormalised scores of shape ``(batch, targets_dim)``.

        Note:
            The LSTM state left on ``self.hidden`` is reused by the next
            call; call ``init_hidden`` to start from zeros again.
        """
        embeddings = self.embed_layer(sequence)
        # The LSTM is not batch-first: move the sequence axis to the front.
        lstm_input = embeddings.permute(1, 0, 2)
        _, self.hidden = self.lstm_layer(lstm_input, self.hidden)

        # Final hidden state of the top LSTM layer summarises each sequence.
        top_layer_state = self.hidden[0][self.lstm_layers_no - 1]
        tags = self.vec2token(self.dropout_layer(top_layer_state))
        return tags


In [None]:
# Model hyper-parameters.
embed_dim, hidden_dim = 64, 1024

# Input vocabulary and output classes are the same token set, so both sizes match.
model = WordNN(
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    vocab_size=len(token_to_int),
    targets_dim=len(int_to_token),
).type(dtype)
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
model

In [None]:
model.train()

batch_size = 32
test_batch_size = 32
epochs_no = 100
# An "epoch" here is a fixed number of randomly drawn mini-batches,
# not a full pass over the training set.
batches_per_epoch = 100

# One (mean train loss, test loss) pair per epoch.
loss_history = []
for epoch in range(epochs_no):
    model.train()
    train_losses_l = []
    for i in range(batches_per_epoch):

        model.zero_grad()
        # Fresh zero state sized for this batch; also cuts any link to the
        # previous batch's computation.
        model.init_hidden(batch_size)

        # Draw a random mini-batch (with replacement) from the training set.
        # A separate name keeps the earlier split arrays `idx`/`test_idx` intact.
        batch_idx = torch.Tensor(np.random.randint(X_train_var.size()[0], size=batch_size)).type(dtype).long()
        sequence_in = X_train_var[batch_idx]

        targets = Y_train_var[batch_idx]
        tag_scores = model(sequence_in)
        loss = loss_fun(tag_scores, targets)
        loss.backward()
        optimizer.step()

        # .sum() gives a scalar whether the loss is a one-element array or
        # zero-dimensional; it matches how the test loss is read below.
        train_losses_l.append(loss.data.cpu().numpy().sum())

    # Evaluate on one random held-out mini-batch with dropout disabled.
    model.eval()
    model.init_hidden(test_batch_size)

    test_batch_idx = torch.Tensor(np.random.randint(X_test_var.size()[0], size=test_batch_size)).type(dtype).long()
    test_sequence_in = X_test_var[test_batch_idx]
    test_targets = Y_test_var[test_batch_idx]
    test_tag_scores = model(test_sequence_in)
    test_loss = loss_fun(test_tag_scores, test_targets).data.cpu().numpy().sum()
    train_losses = np.array(train_losses_l)

    loss_history.append((train_losses.mean(), test_loss))

    print(epoch, loss_history[-1])

In [None]:
def sample_from_model(seq_in):
    """Sample the id of the next token from the model's predicted distribution.

    Args:
        seq_in: nested list of token ids, one inner list per sequence.
            Only the prediction for the first sequence is used.

    Returns:
        The sampled token id as a plain ``int``.
    """
    batch = Variable(torch.Tensor(seq_in).type(dtype).long())
    scores = model(batch)
    # Softmax over the class axis, then keep the first row of the batch.
    probs = nn.functional.softmax(scores, dim=1).data.cpu().numpy()[0]
    # Draw at random in proportion to the probabilities rather than
    # always taking the single most likely token.
    token_ids = np.arange(probs.shape[0])
    return int(np.random.choice(token_ids, p=probs))
    

In [None]:
def generate(start_seq, max_tokens=None):
    """Sample lyrics from the model and stream them to stdout.

    The seed is fed to the model once to prime its LSTM state. After that
    only the newly sampled token is fed at each step, because the model
    keeps its hidden state between calls.

    Args:
        start_seq: non-empty list of seed tokens (strings). Tokens outside
            the vocabulary are replaced by the unknown-word token.
        max_tokens: optional cap on the number of sampled tokens. ``None``
            (the default) keeps sampling until the end token is drawn.

    Raises:
        ValueError: if ``start_seq`` is empty.
    """
    if not start_seq:
        raise ValueError('start_seq must contain at least one token')

    model.init_hidden()
    sys.stdout.write(' '.join(start_seq))

    # Fall back to the id of <UNK>, not the token string itself.
    unk_id = token_to_int[unk_t]
    # Batch of one sequence.
    seq = [[token_to_int.get(t, unk_id) for t in start_seq]]
    generated = 0

    while max_tokens is None or generated < max_tokens:
        next_int = sample_from_model(seq)
        next_token = int_to_token[next_int]
        generated += 1

        # Line-break tokens are printed as real newlines.
        sys.stdout.write(' ' + ('\n' if next_token == newline_t else next_token))
        if next_token == end_t:
            break

        # Earlier tokens are already reflected in the model's hidden state,
        # so re-feeding them would process them twice.
        seq = [[next_int]]

In [None]:
start_sequence = [ '<START>']
model.eval()
generate(start_sequence)
