In [1]:
import os
import re
import string
import math
import random
import numpy as np
from collections import Counter
import pickle
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data.dataloader import DataLoader

seed = 1999

In [2]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [3]:
# check torch is using GPU acceleration
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())

True
True


In [4]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

Using mps device


In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
remove_punct = True

punctuation_dict = {
    "...": "||elipsis||",
    ".": "||period||",
    ",": "||comma||",
    "\"": "||quotation||",
    ";": "||semicolon||",
    "!": "||exclamation||",
    "?": "||question||",
    "(": "||lparathensis||",
    ")": "||rparathensis||",
    "[": "||lsquare||",
    "]": "||rsquare||",
    "-": "||dash||",
    "'": "",
    "\n": "",
    "$": "||dollar||"
}

def clean_text(text, remove_punct):
    text = text.lower().replace("'", "").replace("\n", " ")
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' '+punctuation+' ')
#     if remove_punct:
#         for punctuation in string.punctuation:
#             text = text.replace(punctuation, '')
#     else:
#         for punct in punctuation_dict.keys():
#             text = text.replace(punct, " "+punctuation_dict[punct]+" ")
    text = ''.join([char for char in text
                    if char in string.ascii_lowercase+string.punctuation+' '])
    text = re.sub('  +', ' ', text)
    return text

def read_script(filename, remove_punct=True):
    with open(filename, 'r') as f:
        contents = f.read()
    return clean_text(contents, remove_punct)

In [7]:
dirname = "GoT"
scripts = [read_script(os.path.join(dirname, script),
                       remove_punct=remove_punct)
           for script in sorted(os.listdir(dirname))]
corpus = ' '.join(scripts)
tokens = corpus.split(" ")

In [8]:
corpus[:200]

'prologue the comets tail spread across the dawn , a red slash that bled above the crags of dragonstone like a wound in the pink and purple sky . the maester stood on the windswept balcony outside his '

In [9]:
tokens[:10]

['prologue',
 'the',
 'comets',
 'tail',
 'spread',
 'across',
 'the',
 'dawn',
 ',',
 'a']

In [10]:
def create_lookup_tables(text):
    Couted_text= Counter(text)
    sorted_vocab = sorted(Couted_text,
                          key=Couted_text.get,
                          reverse=True)
    
    # create int_to_vocab dictionaries
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
    
    return (vocab_to_int, int_to_vocab)

vocab_to_int, int_to_vocab = create_lookup_tables(tokens)

In [11]:
print(len(vocab_to_int))
print(len(int_to_vocab))

24114
24114


In [12]:
# with open(f'{dirname}_token_emb.pkl', 'r') as f:
#     token_embeddings = pickle.load(f)

In [13]:
use_pretrained = True
cbow = False

if use_pretrained:
    print("using pretrained word embeddings from spaCy...")
    import spacy
    nlp = spacy.load('en_core_web_md')
    docs = list(nlp.pipe(tokens, n_process=6))
    token_embeddings = torch.tensor([doc.vector for doc in docs])
else:
    print("training word2vec...")
    import gensim
    word2vec = gensim.models.Word2Vec(sentences=[tokens],
                                      vector_size=300,
                                      window=5,
                                      workers=6,
                                      sg=1,
                                      min_count=1,
                                      seed=seed)
    word2vec.train([tokens],
                   total_examples=word2vec.corpus_count,
                   epochs=50)
    token_embeddings = torch.tensor([word2vec.wv[token] for token in tokens])

using pretrained word embeddings from spaCy...


  token_embeddings = torch.tensor([doc.vector for doc in docs])


In [14]:
with open(f'{dirname}_token_emb.pkl', 'wb') as f:
    pickle.dump(token_embeddings, f)

In [15]:
if not use_pretrained:
    print(len(word2vec.wv.key_to_index))

In [16]:
token_embeddings.shape

torch.Size([2084563, 300])

In [17]:
tokens[:12]

['prologue',
 'the',
 'comets',
 'tail',
 'spread',
 'across',
 'the',
 'dawn',
 ',',
 'a',
 'red',
 'slash']

In [18]:
token_embeddings[:12,]

tensor([[ 3.3344,  1.6953,  4.4473,  ..., -5.4445, -4.2309, -4.4592],
        [-5.1043,  2.3496,  3.2472,  ..., -7.6875, -2.5128,  0.6934],
        [-3.3126,  1.2278, -6.2548,  ..., -3.9095, -2.8027, -0.2607],
        ...,
        [-9.3629,  9.2761, -7.2708,  ...,  2.6801, -6.8160,  3.5737],
        [-6.9878,  1.1615, -8.0692,  ..., -0.3003, -3.9543, -2.6215],
        [-2.4895,  1.0770, -4.3209,  ..., -1.6725,  3.4914, -1.5722]])

In [19]:
def split_dataset(x_data,
                  y_data,
                  train_size = 0.8,
                  batch_size = 64,
                  seed = 42):
    # first split data into train set, test/valid set
    train_index, test_index = train_test_split(range(len(y_data)),
                                               test_size=(1-train_size),
                                               shuffle=True,
                                               random_state=seed)
    
    x_train = x_data[train_index]
    y_train = y_data[train_index]
    x_test = x_data[test_index]
    y_test = y_data[test_index]
    
    train = TensorDataset(x_train, y_train)
    test = TensorDataset(x_test, y_test)
    train_loader = DataLoader(dataset=train,
                              batch_size=batch_size,
                              shuffle=False)
    test_loader = DataLoader(dataset=test,
                             batch_size=batch_size,
                             shuffle=False)

    return train_loader, test_loader

## Fixed-window model

In [20]:
def create_dataset_for_Bengio(window_size, tokens, token_embeddings, vocab_to_int):
    X = []
    y = []
    for i in range(window_size-1, len(tokens)):
        # obtain word embeddings for the previous (window_size-1) words
        X.append(token_embeddings[i-(window_size-1):i,].flatten())
        # obtain token_id for the current word
        y.append(vocab_to_int[tokens[i]])
    return torch.stack(X), torch.tensor(y)

In [21]:
window_size = 3
train_size = 0.8
batch_size = 64
X, y = create_dataset_for_Bengio(window_size, tokens, token_embeddings, vocab_to_int)
print(X.shape)
print(y.shape)
train_loader, test_loader = split_dataset(x_data=X,
                                          y_data=y,
                                          train_size=train_size,
                                          batch_size=batch_size,
                                          seed=seed)

torch.Size([2084561, 600])
torch.Size([2084561])


In [22]:
class Bengio_et_al_word2vec(nn.Module):
    def __init__(self, n, embedding_dim, hidden_dim, vocab_size):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear((n-1)*embedding_dim, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, vocab_size)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [23]:
embedding_dim = token_embeddings.shape[1]
hidden_dim = int((window_size-1)*embedding_dim/2)
set_seed(seed)
model = Bengio_et_al_word2vec(n=window_size,
                              embedding_dim=embedding_dim,
                              hidden_dim=hidden_dim,
                              vocab_size=len(vocab_to_int)).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

print(model)

Bengio_et_al_word2vec(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=600, out_features=300, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=300, out_features=24114, bias=True)
  )
)


In [24]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        
        # Zero gradients
        optimizer.zero_grad()

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()

        if batch % 10000 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"perplexity: {math.exp(loss)}")
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_perp, test_loss, correct = 0, 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = loss_fn(pred, y).item()
            test_loss += loss
            test_perp += math.exp(loss)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    test_perp /= num_batches
    correct /= size
    print(f"Test:\n - Accuracy: {(100*correct):>0.1f}\n"
          f"- Avg loss: {test_loss:>8f}\n",
          f"- Avg perplexity: {test_perp}\n")
    
def train_and_test(model, epochs, train_loader, test_loader, loss_fn, optimizer):
    for t in range(0, epochs+1, 500000):
        print(f"Epoch {t}\n-------------------------------")
        train(train_loader, model, loss_fn, optimizer)
        test(test_loader, model, loss_fn)
    print("Done!")

In [25]:
epochs = 1000000
train_and_test(model=model,
               epochs=epochs,
               train_loader=train_loader,
               test_loader=test_loader,
               loss_fn=loss_fn,
               optimizer=optimizer)

Epoch 0
-------------------------------
perplexity: 25382.982776132405
loss: 10.141834  [   64/1667648]
perplexity: 216.67187519881273
loss: 5.378384  [640064/1667648]
perplexity: 160.79597078654174
loss: 5.080136  [1280064/1667648]
Test:
 - Accuracy: 14.4%
- Avg loss: 5.627925
 - Avg perplexity: 330.31095512090286

Epoch 500000
-------------------------------
perplexity: 133.00092244079445
loss: 4.890356  [   64/1667648]
perplexity: 170.73113150408014
loss: 5.140090  [640064/1667648]
perplexity: 154.4577088973685
loss: 5.039920  [1280064/1667648]
Test:
 - Accuracy: 14.3%
- Avg loss: 5.770628
 - Avg perplexity: 430.07917302528324

Epoch 1000000
-------------------------------
perplexity: 134.1252610186134
loss: 4.898774  [   64/1667648]
perplexity: 226.3375768967899
loss: 5.422028  [640064/1667648]
perplexity: 121.6579892700085
loss: 4.801214  [1280064/1667648]
Test:
 - Accuracy: 14.2%
- Avg loss: 5.872339
 - Avg perplexity: 550.3235868151647

Done!


In [26]:
def generate_text(input, model, steps, window_size, seed=42, T=1):
    set_seed(seed)
    device = "cpu"
    model = model.to(device)
    model.eval()
    text = clean_text(input, remove_punct)
    print(f"initial input text: {text}")
    text_tokens = text.split(' ')
    if len(text_tokens) < window_size-1:
        text_tokens = ['' for i in range(window_size-1)] + text_tokens
    docs = list(nlp.pipe(text_tokens))
    token_embeddings = torch.tensor([doc.vector for doc in docs])
    emb_size = token_embeddings.shape[1]
    input_vec = token_embeddings[token_embeddings.shape[0]-(window_size-1):,].flatten().to(device)
    softmax = nn.Softmax(dim=0)
    with torch.no_grad():
        for i in range(steps):
            pred = model(input_vec)
            prob = torch.pow(softmax(pred), 1/T)
            prob = prob/prob.sum()
            word = int_to_vocab[random.choices(list(range(len(prob))), weights=prob)[0]]
            print("input tokens: "
                  f"{text_tokens[token_embeddings.shape[0]-(window_size-1)+i:]}"
                  " || next word:",
                  f"{word}")
            text_tokens.append(word)
            new_word_embedding = torch.tensor(nlp(word).vector).to(device)
            input_vec = torch.concatenate((input_vec[emb_size:], new_word_embedding), 0)
    return ' '.join(text_tokens)

Try different values of $T$ remembering that:
- $T$ small corresponds to just picking the most likely word
- $T$ large corresponds to random choice over the vocabulary

In [27]:
input_sequence = "meanwhile in winterfell, jon and catelyn walk to"
# input_sequence = "morning tony, can you take meadow to"

In [28]:
generate_text(input_sequence,
              model=model,
              steps=50,
              window_size=window_size,
              seed=seed,
              T=1)

initial input text: meanwhile in winterfell , jon and catelyn walk to
input tokens: ['walk', 'to'] || next word: to
input tokens: ['to', 'to'] || next word: his
input tokens: ['to', 'his'] || next word: teeth
input tokens: ['his', 'teeth'] || next word: about
input tokens: ['teeth', 'about'] || next word: no
input tokens: ['about', 'no'] || next word: notice
input tokens: ['no', 'notice'] || next word: .
input tokens: ['notice', '.'] || next word: those
input tokens: ['.', 'those'] || next word: secret
input tokens: ['those', 'secret'] || next word: whispers
input tokens: ['secret', 'whispers'] || next word: .
input tokens: ['whispers', '.'] || next word: as
input tokens: ['.', 'as'] || next word: well
input tokens: ['as', 'well'] || next word: .
input tokens: ['well', '.'] || next word: you
input tokens: ['.', 'you'] || next word: fell
input tokens: ['you', 'fell'] || next word: had
input tokens: ['fell', 'had'] || next word: done
input tokens: ['had', 'done'] || next word: or
input t

'meanwhile in winterfell , jon and catelyn walk to to his teeth about no notice . those secret whispers . as well . you fell had done or else . your lord father , lord of casterly and the solar behind him . once it is hurting so was too long . victarion took men know about day ,'

Increasing window size:

In [29]:
model_dict = {3: model}
for window_size in [4,5,6]:
    X, y = create_dataset_for_Bengio(window_size,
                                     tokens,
                                     token_embeddings,
                                     vocab_to_int)
    train_loader, test_loader = split_dataset(x_data=X,
                                              y_data=y,
                                              train_size=train_size,
                                              batch_size=batch_size,
                                              seed=seed)
    embedding_dim = token_embeddings.shape[1]
    hidden_dim = int((window_size-1)*embedding_dim/2)
    set_seed(seed)
    model_dict[window_size] = Bengio_et_al_word2vec(n=window_size,
                                                    embedding_dim=embedding_dim,
                                                    hidden_dim=hidden_dim,
                                                    vocab_size=len(vocab_to_int)).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model_dict[window_size].parameters(), lr=1e-3)
    print(f"##### window_size: {window_size}")
    print(model_dict[window_size])
    
    train_and_test(model=model_dict[window_size],
                   epochs=epochs,
                   train_loader=train_loader,
                   test_loader=test_loader,
                   loss_fn=loss_fn,
                   optimizer=optimizer)

##### window_size: 4
Bengio_et_al_word2vec(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=900, out_features=450, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=450, out_features=24114, bias=True)
  )
)
Epoch 0
-------------------------------
perplexity: 26702.04267301553
loss: 10.192495  [   64/1667648]
perplexity: 321.5304559221259
loss: 5.773092  [640064/1667648]
perplexity: 159.68819202574832
loss: 5.073223  [1280064/1667648]
Test:
 - Accuracy: 14.1%
- Avg loss: 5.682492
 - Avg perplexity: 390.1127882027954

Epoch 500000
-------------------------------
perplexity: 176.68868855985428
loss: 5.174389  [   64/1667648]
perplexity: 352.7773414545179
loss: 5.865837  [640064/1667648]
perplexity: 168.97748996173638
loss: 5.129766  [1280064/1667648]
Test:
 - Accuracy: 14.2%
- Avg loss: 5.789943
 - Avg perplexity: 629.3706449895562

Epoch 1000000
-------------------------------
perplexity: 207.7656002683925
loss: 5.336411  [   64/1667648]
perplexity: 311.43626951131

In [30]:
generate_text(input_sequence,
              model=model_dict[4],
              steps=50,
              window_size=4,
              seed=seed,
              T=1)

initial input text: meanwhile in winterfell , jon and catelyn walk to
input tokens: ['catelyn', 'walk', 'to'] || next word: the
input tokens: ['walk', 'to', 'the'] || next word: king
input tokens: ['to', 'the', 'king'] || next word: of
input tokens: ['the', 'king', 'of'] || next word: wisdom
input tokens: ['king', 'of', 'wisdom'] || next word: men
input tokens: ['of', 'wisdom', 'men'] || next word: wife
input tokens: ['wisdom', 'men', 'wife'] || next word: ,
input tokens: ['men', 'wife', ','] || next word: ned
input tokens: ['wife', ',', 'ned'] || next word: snow
input tokens: [',', 'ned', 'snow'] || next word: ,
input tokens: ['ned', 'snow', ','] || next word: the
input tokens: ['snow', ',', 'the'] || next word: truth
input tokens: [',', 'the', 'truth'] || next word: door
input tokens: ['the', 'truth', 'door'] || next word: .
input tokens: ['truth', 'door', '.'] || next word: i
input tokens: ['door', '.', 'i'] || next word: shall
input tokens: ['.', 'i', 'shall'] || next word: have
in

'meanwhile in winterfell , jon and catelyn walk to the king of wisdom men wife , ned snow , the truth door . i shall have called beneath long , lord and pushed her father , send the time they brothels south . arya did not presume loud , but it was stabbed with walls was chained as a'

In [31]:
generate_text(input_sequence,
              model=model_dict[5],
              steps=50,
              window_size=5,
              seed=seed,
              T=1)

initial input text: meanwhile in winterfell , jon and catelyn walk to
input tokens: ['and', 'catelyn', 'walk', 'to'] || next word: his
input tokens: ['catelyn', 'walk', 'to', 'his'] || next word: and
input tokens: ['walk', 'to', 'his', 'and'] || next word: such
input tokens: ['to', 'his', 'and', 'such'] || next word: matters
input tokens: ['his', 'and', 'such', 'matters'] || next word: rose
input tokens: ['and', 'such', 'matters', 'rose'] || next word: south
input tokens: ['such', 'matters', 'rose', 'south'] || next word: .
input tokens: ['matters', 'rose', 'south', '.'] || next word: these
input tokens: ['rose', 'south', '.', 'these'] || next word: muzzle
input tokens: ['south', '.', 'these', 'muzzle'] || next word: came
input tokens: ['.', 'these', 'muzzle', 'came'] || next word: .
input tokens: ['these', 'muzzle', 'came', '.'] || next word: she
input tokens: ['muzzle', 'came', '.', 'she'] || next word: meant
input tokens: ['came', '.', 'she', 'meant'] || next word: ,
input tokens: [

'meanwhile in winterfell , jon and catelyn walk to his and such matters rose south . these muzzle came . she meant , i fear to give out for the dagger . we would make to remain , the damp own breath . look to give free leagues . my lady . somehow frey felt on its stone .'

In [32]:
generate_text(input_sequence,
              model=model_dict[6],
              steps=50,
              window_size=6,
              seed=seed,
              T=1)

initial input text: meanwhile in winterfell , jon and catelyn walk to
input tokens: ['jon', 'and', 'catelyn', 'walk', 'to'] || next word: his
input tokens: ['and', 'catelyn', 'walk', 'to', 'his'] || next word: father
input tokens: ['catelyn', 'walk', 'to', 'his', 'father'] || next word: and
input tokens: ['walk', 'to', 'his', 'father', 'and'] || next word: buried
input tokens: ['to', 'his', 'father', 'and', 'buried'] || next word: off
input tokens: ['his', 'father', 'and', 'buried', 'off'] || next word: onto
input tokens: ['father', 'and', 'buried', 'off', 'onto'] || next word: the
input tokens: ['and', 'buried', 'off', 'onto', 'the'] || next word: fever
input tokens: ['buried', 'off', 'onto', 'the', 'fever'] || next word: shouted
input tokens: ['off', 'onto', 'the', 'fever', 'shouted'] || next word: him
input tokens: ['onto', 'the', 'fever', 'shouted', 'him'] || next word: .
input tokens: ['the', 'fever', 'shouted', 'him', '.'] || next word: she
input tokens: ['fever', 'shouted', 'him

'meanwhile in winterfell , jon and catelyn walk to his father and buried off onto the fever shouted him . she died , i dont do was cursed here . ser jaime thought the people was smooth . the pardon are angry . night of me seemed another claim in the way right that big who sniffed on the'

## Letting the model train the embeddings

In [33]:
class Bengio_et_al(nn.Module):
    def __init__(self, n, embedding_dim, hidden_dim, vocab_size):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Embedding(vocab_size, embedding_dim),
            nn.Flatten(),
            nn.Linear((n-1)*embedding_dim, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, vocab_size)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

## Recurrent Neural Network

In [34]:
class RNN_word2vec(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.rnn = nn.RNN(embedding_dim,
                          hidden_dim,
                          num_layers,
                          nonlinearity='relu',
                          batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, embeddings, hidden=None):
        outputs, hidden = self.rnn(embeddings, hidden)
        logits = self.linear(outputs)
        return logits, hidden
    
    def init_zero_hidden(self, batch_size=1):
        if batch_size==0:
            return torch.zeros(self.num_layers,
                               self.hidden_dim,
                               requires_grad=False).to(device)
        else:
            return torch.zeros(self.num_layers,
                               batch_size,
                               self.hidden_dim,
                               requires_grad=False).to(device)

In [35]:
class RNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim,
                          hidden_dim,
                          num_layers,
                          nonlinearity='relu',
                          batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids, hidden=None):
        embeddings = self.embedding(input_ids)
        outputs, hidden = self.rnn(embeddings, hidden)
        logits = self.linear(outputs)
        return logits, hidden
    
    def init_zero_hidden(self, batch_size=1):
        if batch_size==0:
            return torch.zeros(self.num_layers,
                               self.hidden_dim,
                               requires_grad=False).to(device)
        else:
            return torch.zeros(self.num_layers,
                               batch_size,
                               self.hidden_dim,
                               requires_grad=False).to(device)

In [36]:
def train_RNN(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Compute prediction error for each token in sequence
        pred, _ = model(X)
        loss = 0
        for c in range(y.shape[1]):
            loss += loss_fn(pred[:,c], y[:,c])
        # divide our loss by the length of the sequence
        # to obtain average CE loss for this batch
        loss /= y.shape[1]

        # Backpropagation
        loss.backward()
        optimizer.step()

        if batch % 1000 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"perplexity: {math.exp(loss)}")
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test_RNN(dataloader, model, loss_fn):
    num_batches = len(dataloader)
    model.eval()
    test_perp, test_loss, correct = 0, 0, 0
    size = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred, _ = model(X)                
            size += y.shape[0]*y.shape[1]
            loss = 0
            for c in range(y.shape[1]):
                loss += loss_fn(pred[:,c], y[:,c]).item()
                correct += (pred[:,c].argmax(1) == y[:,c]).type(torch.float).sum().item()
                size += 1
            loss /= y.shape[1]
            test_loss += loss
            test_perp += math.exp(loss)
    test_loss /= num_batches
    test_perp /= num_batches
    correct /= size
    print(f"Test:\n - Accuracy: {(100*correct):>0.1f}\n"
          f"- Avg loss: {test_loss:>8f}\n",
          f"- Avg perplexity: {test_perp}\n")
    
def train_and_test_RNN(model, epochs, train_loader, test_loader, loss_fn, optimizer):
    for t in range(0, epochs+1, 500000):
        print(f"Epoch {t}\n-------------------------------")
        train_RNN(train_loader, model, loss_fn, optimizer)
        test_RNN(test_loader, model, loss_fn)
    print("Done!")

In [37]:
def create_dataset_for_RNN(length_of_sequences, tokens, token_embeddings, vocab_to_int):
    X = []
    y = []
    for i in range(0, len(tokens)-length_of_sequences, length_of_sequences):
        if token_embeddings is None:
            # obtain token-ids for the next i:i+length_of_sequences words
            X.append([vocab_to_int[token] for token in tokens[i:(i+length_of_sequences)]])
        else:   
            # obtain word embeddings for the next i:i+length_of_sequences words
            X.append(token_embeddings[i:(i+length_of_sequences)])
        # obtain token_ids for the next (i+1):(i+length_of_sequences+1) words
        # we're sequentially predicting with an RNN
        y.append([vocab_to_int[token]
                  for token in tokens[(i+1):(i+length_of_sequences+1)]])
    if token_embeddings is None:
        return torch.tensor(X), torch.tensor(y)
    else:
        return torch.stack(X), torch.tensor(y)

In [38]:
X,y = create_dataset_for_RNN(50, tokens, token_embeddings, vocab_to_int)
train_loader, test_loader = split_dataset(x_data=X,
                                          y_data=y,
                                          train_size=train_size,
                                          batch_size=batch_size,
                                          seed=seed)

embedding_dim = 300
hidden_dim = 200
set_seed(seed)
RNN_w2v = RNN_word2vec(embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       num_layers=1,
                       vocab_size=len(vocab_to_int)).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(RNN_w2v.parameters(), lr=1e-3)
print(RNN_w2v)

RNN_word2vec(
  (rnn): RNN(300, 200, batch_first=True)
  (linear): Linear(in_features=200, out_features=24114, bias=True)
)


In [39]:
print(X.shape)
print(y.shape)

torch.Size([41691, 50, 300])
torch.Size([41691, 50])


In [40]:
train_and_test_RNN(model=RNN_w2v,
                   epochs=epochs,
                   train_loader=train_loader,
                   test_loader=test_loader,
                   loss_fn=loss_fn,
                   optimizer=optimizer)

Epoch 0
-------------------------------
perplexity: 59205.54517157895
loss: 10.988770  [   64/33352]
Test:
 - Accuracy: 16.2%
- Avg loss: 5.098456
 - Avg perplexity: 164.160492856811

Epoch 500000
-------------------------------
perplexity: 145.9900732012472
loss: 4.983539  [   64/33352]
Test:
 - Accuracy: 17.0%
- Avg loss: 4.985477
 - Avg perplexity: 146.64926832916578

Epoch 1000000
-------------------------------
perplexity: 114.53178193429657
loss: 4.740852  [   64/33352]
Test:
 - Accuracy: 17.5%
- Avg loss: 4.979248
 - Avg perplexity: 145.8150903935469

Done!


In [41]:
X,y = create_dataset_for_RNN(50, tokens, None, vocab_to_int)
train_loader, test_loader = split_dataset(x_data=X,
                                          y_data=y,
                                          train_size=train_size,
                                          batch_size=batch_size,
                                          seed=seed)

In [42]:
print(X.shape)
print(y.shape)

torch.Size([41691, 50])
torch.Size([41691, 50])


In [43]:
X[0]

tensor([11030,     2, 13332,  1596,  1135,   187,     2,   821,     1,     4,
          111,  3378,    16,  4789,   375,     2, 12067,     6,   905,    72,
            4,  1069,    13,     2,   912,     3,  1031,   485,     0,     2,
          179,   253,    28,     2,  7013,  4535,   467,     8,  1244,     0,
           15,     9,   105,     2,   993,   124,     1,   155,    99,  2124])

In [44]:
y[0]

tensor([    2, 13332,  1596,  1135,   187,     2,   821,     1,     4,   111,
         3378,    16,  4789,   375,     2, 12067,     6,   905,    72,     4,
         1069,    13,     2,   912,     3,  1031,   485,     0,     2,   179,
          253,    28,     2,  7013,  4535,   467,     8,  1244,     0,    15,
            9,   105,     2,   993,   124,     1,   155,    99,  2124,     0])

In [45]:
set_seed(seed)
RNNLM = RNN(embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            num_layers=1,
            vocab_size=len(vocab_to_int)).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(RNNLM.parameters(), lr=1e-3)
print(RNNLM)

RNN(
  (embedding): Embedding(24114, 300)
  (rnn): RNN(300, 200, batch_first=True)
  (linear): Linear(in_features=200, out_features=24114, bias=True)
)


In [46]:
train_and_test_RNN(model=RNNLM,
                   epochs=epochs,
                   train_loader=train_loader,
                   test_loader=test_loader,
                   loss_fn=loss_fn,
                   optimizer=optimizer)

Epoch 0
-------------------------------
perplexity: 26385.820603912714
loss: 10.180582  [   64/33352]
Test:
 - Accuracy: 15.7%
- Avg loss: 5.169092
 - Avg perplexity: 176.17416807423493

Epoch 500000
-------------------------------
perplexity: 158.30894824662178
loss: 5.064548  [   64/33352]
Test:
 - Accuracy: 17.1%
- Avg loss: 4.984224
 - Avg perplexity: 146.45302120152473

Epoch 1000000
-------------------------------
perplexity: 117.72990259179983
loss: 4.768393  [   64/33352]
Test:
 - Accuracy: 17.9%
- Avg loss: 4.916634
 - Avg perplexity: 136.9206951957755

Done!


In [47]:
def generate_text_RNN(input,
                      model,
                      steps,
                      word2vec,
                      lstm=False,
                      seed=42,
                      T=1):
    set_seed(seed)
    device = "cpu"
    model = model.to(device)
    model.eval()
    text = clean_text(input, remove_punct)
    print(f"initial input text: {text}")
    text_tokens = text.split(' ')
    if word2vec:
        docs = list(nlp.pipe(text_tokens))
        X = torch.tensor([doc.vector for doc in docs]).to(device)
    else:
        X = torch.tensor([vocab_to_int[token] for token in text_tokens])
    softmax = nn.Softmax(dim=0)
    with torch.no_grad():
        hidden = torch.zeros(model.num_layers,
                             model.hidden_dim,
                             requires_grad=False).to(device)
        if lstm:
            context = torch.zeros(model.num_layers,
                                  model.hidden_dim,
                                  requires_grad=False).to(device)
        for i in range(steps):
            if lstm:
                pred, (hidden, context) = model(X, (hidden, context))
            else:
                pred, hidden = model(X, hidden)
            prob = torch.pow(softmax(pred[-1]), 1/T)
            prob = prob/prob.sum()
            word = int_to_vocab[random.choices(list(range(len(prob))), weights=prob)[0]]
            print(f"next word: {word}")
            text_tokens.append(word)
            if word2vec:
                X = torch.tensor([nlp(word).vector]).to(device)
            else:
                X = torch.tensor([vocab_to_int[word]])
    return ' '.join(text_tokens)

In [48]:
generate_text_RNN(input_sequence,
                  model=RNN_w2v,
                  steps=50,
                  word2vec=True,
                  seed=seed,
                  T=1)

initial input text: meanwhile in winterfell , jon and catelyn walk to
next word: the
next word: wall
next word: ,
next word: merchants
next word: feet
next word: dancing
next word: and
next word: thumped
next word: water
next word: with
next word: the
next word: common
next word: sisters
next word: .
next word: her
next word: mind
next word: was
next word: small
next word: light
next word: outside
next word: .
next word: they
next word: had
next word: gone
next word: to
next word: battle
next word: ,
next word: since
next word: the
next word: king
next word: sat
next word: up
next word: before
next word: the
next word: cave
next word: and
next word: pulled
next word: hams
next word: through
next word: the
next word: doors
next word: .
next word: the
next word: deserter
next word: rose
next word: wide
next word: at
next word: bay
next word: as
next word: a


'meanwhile in winterfell , jon and catelyn walk to the wall , merchants feet dancing and thumped water with the common sisters . her mind was small light outside . they had gone to battle , since the king sat up before the cave and pulled hams through the doors . the deserter rose wide at bay as a'

In [49]:
generate_text_RNN(input_sequence,
                  model=RNNLM,
                  steps=50,
                  word2vec=False,
                  seed=seed,
                  T=1)

initial input text: meanwhile in winterfell , jon and catelyn walk to
next word: the
next word: gods
next word: of
next word: bushy
next word: cruel
next word: consumed
next word: ,
next word: drinking
next word: insisted
next word: that
next word: the
next word: free
next word: folk
next word: had
next word: only
next word: lifted
next word: his
next word: teeth
next word: ;
next word: instead
next word: the
next word: twins
next word: ,
next word: an
next word: eye
next word: see
next word: to
next word: cross
next word: the
next word: way
next word: down
next word: to
next word: lord
next word: snow
next word: ,
next word: that
next word: night
next word: soil
next word: every
next word: time
next word: to
next word: know
next word: .
next word: shouldnt
next word: be
next word: poured
next word: off
next word: such
next word: coming
next word: .


'meanwhile in winterfell , jon and catelyn walk to the gods of bushy cruel consumed , drinking insisted that the free folk had only lifted his teeth ; instead the twins , an eye see to cross the way down to lord snow , that night soil every time to know . shouldnt be poured off such coming .'

## LSTM

In [50]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids, hidden_and_context=None):
        embeddings = self.embedding(input_ids)
        outputs, hidden_and_context = self.lstm(embeddings, hidden_and_context)
        logits = self.linear(outputs)
        return logits, hidden_and_context
    
    def init_zero_hidden(self, batch_size=1):
        if batch_size==0:
            return torch.zeros(self.num_layers,
                               self.hidden_dim,
                               requires_grad=False).to(device)
        else:
            return torch.zeros(self.num_layers,
                               batch_size,
                               self.hidden_dim,
                               requires_grad=False).to(device)

In [51]:
set_seed(seed)
LSTMLM = LSTM(embedding_dim=embedding_dim,
              hidden_dim=hidden_dim,
              num_layers=1,
              vocab_size=len(vocab_to_int)).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(LSTMLM.parameters(), lr=1e-3)
print(LSTMLM)

LSTM(
  (embedding): Embedding(24114, 300)
  (lstm): LSTM(300, 200, batch_first=True)
  (linear): Linear(in_features=200, out_features=24114, bias=True)
)


In [52]:
train_and_test_RNN(model=LSTMLM,
                   epochs=epochs,
                   train_loader=train_loader,
                   test_loader=test_loader,
                   loss_fn=loss_fn,
                   optimizer=optimizer)

Epoch 0
-------------------------------
perplexity: 24458.742656332503
loss: 10.104743  [   64/33352]
Test:
 - Accuracy: 14.0%
- Avg loss: 5.487979
 - Avg perplexity: 242.37346972074675

Epoch 500000
-------------------------------
perplexity: 257.6838623382885
loss: 5.551733  [   64/33352]
Test:
 - Accuracy: 15.7%
- Avg loss: 5.218495
 - Avg perplexity: 185.10562069915218

Epoch 1000000
-------------------------------
perplexity: 186.11408468509555
loss: 5.226360  [   64/33352]
Test:
 - Accuracy: 16.6%
- Avg loss: 5.076339
 - Avg perplexity: 160.5536762701723

Done!


In [53]:
generate_text_RNN(input_sequence,
                  model=LSTMLM,
                  steps=50,
                  word2vec=False,
                  lstm=True,
                  seed=seed,
                  T=1)

initial input text: meanwhile in winterfell , jon and catelyn walk to
next word: the
next word: wall
next word: ,
next word: awkward
next word: two
next word: afloat
next word: .
next word: perhaps
next word: still
next word: raised
next word: the
next word: blade
next word: at
next word: the
next word: word
next word: of
next word: his
next word: lips
next word: as
next word: shed
next word: a
next word: whole
next word: -
next word: faced
next word: and
next word: aegon
next word: ,
next word: sniffed
next word: ,
next word: the
next word: lantern
next word: left
next word: himself
next word: .
next word: jaime
next word: thought
next word: of
next word: garnet
next word: screaming
next word: .
next word: is
next word: it
next word: .
next word: penny
next word: frey
next word: remained
next word: from
next word: throw
next word: into
next word: the


'meanwhile in winterfell , jon and catelyn walk to the wall , awkward two afloat . perhaps still raised the blade at the word of his lips as shed a whole - faced and aegon , sniffed , the lantern left himself . jaime thought of garnet screaming . is it . penny frey remained from throw into the'