In [None]:
#This cell sets up the model.
import re
import json
import sys
import collections
import os
import torch
import torch.nn as nn
from datetime import datetime


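# Single-direction recurrent language model (LSTM by default, plain tanh RNN otherwise).
# The input embedding and the output projection are separate weights (no tying).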
class Emb_RNNLM(nn.Module):
    """Word-level language model: embedding -> recurrent layer -> linear projection.

    `params` is a dict that must provide 'vocab_size', 'd_emb', 'num_layers'
    and 'd_hid'. When `use_LSTM` is true the recurrent layer is an nn.LSTM,
    otherwise an nn.RNN (default tanh nonlinearity).
    """

    def __init__(self, params, use_LSTM=True):
        super().__init__()
        self.vocab_size, self.d_emb = params['vocab_size'], params['d_emb']
        self.n_layers, self.d_hid = params['num_layers'], params['d_hid']
        self.use_LSTM = use_LSTM

        # Submodules are created in the order embeddings -> i2R -> R2o so that
        # seeded initialisation and state_dict keys stay stable.
        self.embeddings = nn.Embedding(self.vocab_size, self.d_emb)
        if use_LSTM:
            print('Using LSTM model')
        recurrent_cls = nn.LSTM if use_LSTM else nn.RNN
        # Input-to-recurrent layer.
        self.i2R = recurrent_cls(
            self.d_emb, self.d_hid, num_layers=self.n_layers, batch_first=True
        )
        # Recurrent-to-output layer: one score per vocabulary entry.
        self.R2o = nn.Linear(self.d_hid, self.vocab_size)

    def forward(self, train_datum):
        """Return per-position vocabulary scores for one index sequence.

        The embedded sequence gets a leading dimension of size 1 before it is
        fed to the recurrent layer, so a 1-D tensor of T indices produces an
        output of shape (1, T, vocab_size). Only the projected output sequence
        is returned; the final hidden (and, for LSTM, cell) state is discarded.
        """
        embs = self.embeddings(train_datum).unsqueeze(0)
        # Both nn.LSTM and nn.RNN return (output, state); only output is used.
        output, _ = self.i2R(embs)
        return self.R2o(output)


In [None]:
import re
import json
import sys
import collections
import os
import random
import torch
import torch.nn as nn

# Read each book, split it into lower-cased sentences wrapped in <s> ... <e>,
# build a word -> index vocabulary, and create one untrained model per book.
#
# After this cell `words`, `wd2ix`, `total_words`, `sentences_as_indices` and
# `params` describe the LAST book processed; `sentences` and `models` are keyed
# by book title.
verbose = False
sentences = collections.defaultdict(list)
models = {}
book_titles = ['Dickens4novels.txt', 'JaneAustenComplete.txt']
input('Make sure that the two book title files are uploaded to Colab for this session \nand then press Enter to continue.')

# Tokens that close a sentence. The character filter below keeps only letters,
# '.', '’' and spaces, so in practice only '.' can survive as a token.
SENTENCE_END_TOKENS = {'.', '!', '?', ':', ';'}

for book_title in book_titles:
    if not os.path.isfile(book_title):
        print('No file found with  name', book_title)
        # Raise instead of exit(): exit() tears down the notebook kernel.
        raise FileNotFoundError(book_title)

    # `words` maps index -> word and `wd2ix` word -> index. They are kept in
    # step so every word gets exactly one index and lookups are O(1).
    words = ['<s>', '<e>']
    wd2ix = {'<s>': 0, '<e>': 1}

    print('Processing file', book_title)
    # Explicit encoding: the text contains curly quotes and em-dashes.
    with open(book_title, 'r', encoding='utf-8') as f0:
        sentence_buffer = ['<s>']
        for i, line in enumerate(f0):
            if i % 1000 == 0:
                print('Processed', i, 'lines.')
            line = line.rstrip()
            if len(line) < 1:
                continue
            # Skip lines starting with three capitals (headings) or '['.
            if re.search(r'^[A-Z][A-Z][A-Z]', line):
                continue
            if line[0] == '[':
                continue
            # Detach punctuation from words, then drop everything except
            # letters, '.', '’' and spaces.
            line = re.sub(r'([\.,;:!\?”])', r' \1', line)
            line = re.sub(r'(“)', r'\1 ', line)
            line = re.sub(r'[_‘]', '', line)
            line = re.sub('—', ' ', line)
            line = re.sub(r'[^a-zA-Z\.’ ]', '', line)

            # NOTE(review): `buffer_empty` is reset at the start of every line
            # and after every sentence end, so the first token of each line
            # (and the first token after each sentence end) is never added to
            # the sentence. Behaviour preserved as-is; confirm it is intended.
            buffer_empty = True
            for wd in line.split():
                if not buffer_empty:
                    token = wd.lower()
                    sentence_buffer.append(token)
                    # Compare the lower-cased form that is actually stored.
                    # Testing the raw token appended a duplicate entry for
                    # every capitalised occurrence and inflated the vocabulary.
                    if token not in wd2ix:
                        wd2ix[token] = len(words)
                        words.append(token)
                buffer_empty = False
                if wd in SENTENCE_END_TOKENS:
                    sentences[book_title].append(sentence_buffer + ['<e>'])
                    sentence_buffer = ['<s>']
                    buffer_empty = True

    total_words = len(words)
    print('total words', total_words)
    if verbose:
        for word in words[:100]:
            print(word)

    # One 1-D LongTensor of vocabulary indices per sentence.
    sentences_as_indices = [
        torch.LongTensor([wd2ix[w] for w in sent])
        for sent in sentences[book_title]
    ]

    params = {'vocab_size': total_words, 'd_emb': 128, 'num_layers': 1, 'd_hid': 128, 'lr': 0.0003, 'epochs': 5}

    models[book_title] = Emb_RNNLM(params)



Make sure that the two book title files are uploaded to Colab for this session 
and then press Enter to continue.
Processing file Dickens4novels.txt
Processed 0 lines.
Processed 1000 lines.
Processed 2000 lines.
Processed 3000 lines.
Processed 4000 lines.
Processed 5000 lines.
Processed 6000 lines.
Processed 7000 lines.
Processed 8000 lines.
Processed 9000 lines.
Processed 10000 lines.
Processed 11000 lines.
Processed 12000 lines.
Processed 13000 lines.
Processed 14000 lines.
Processed 15000 lines.
Processed 16000 lines.
Processed 17000 lines.
Processed 18000 lines.
Processed 19000 lines.
Processed 20000 lines.
Processed 21000 lines.
Processed 22000 lines.
Processed 23000 lines.
Processed 24000 lines.
Processed 25000 lines.
Processed 26000 lines.
Processed 27000 lines.
Processed 28000 lines.
Processed 29000 lines.
Processed 30000 lines.
Processed 31000 lines.
Processed 32000 lines.
Processed 33000 lines.
Processed 34000 lines.
Processed 35000 lines.
Processed 36000 lines.
Processed 370

In [None]:
# Sanity check of the most recently processed book. A notebook cell only
# echoes its last expression, so the two scalars are printed explicitly.
print('total_words:', total_words)
print("params['vocab_size']:", params['vocab_size'])
sentences_as_indices[0]


tensor([    0, 46034, 11226, 46152, 10992, 45936, 61115, 60148, 61436, 60998,
        47198, 58296, 60856, 52873,    15, 46355, 42788,    18,  5500, 37723,
           21, 61002, 61791,   832, 59665, 59993,    27,    28, 60856, 54628,
           30, 43719, 61945,    33, 61115, 60998,    34, 61180, 59665, 52873,
           36, 53733,    38,    39,    40, 61945,    41, 58248,    43, 61791,
           44,    45, 46152, 61791,    46,    47, 59665, 37723,    48,    49,
           50, 61710,    52,    53,    54,    55,    39, 44570, 61945, 43805,
        59738, 59993,    59,    60, 61791,    61,    62,    63, 46152, 61791,
        50529, 45970, 61945, 59665, 61457, 56398,    68,    69, 53733,    70,
        59993, 59664,    72, 52873,    15, 46122, 61492, 54628, 45197,    76,
        42788,    77,    78,     1])

In [None]:
# This cell trains the models. It can be skipped if training has already been
# done and there is a checkpoint.
for book_title in book_titles:
    model = models[book_title]
    # Reshape with the model's own vocabulary size. `params` only holds the
    # size of the last book processed, which caused
    # "shape '[-1, N]' is invalid" for every other model.
    vocab_size = model.vocab_size
    if vocab_size != params['vocab_size']:
        # `sentences_as_indices` was built with the vocabulary described by
        # `params`, so its indices do not belong to this model's vocabulary.
        print('Warning: model for', book_title, 'has vocab size', vocab_size,
              'but the index data in scope was built for vocab size',
              params['vocab_size'])

    # Index 0 is '<s>', which is never a prediction target.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimiser = torch.optim.Adam(model.parameters(), lr=params['lr'])

    for epoch in range(params['epochs']):
        ep_loss = 0.0
        n_trained = 0
        random.shuffle(sentences_as_indices)
        for j, train_datum in enumerate(sentences_as_indices):
            # Skip very short sentences.
            if len(train_datum) < 4:
                continue
            # preds: (1, T, vocab). Drop the last position, which has no next
            # word, and flatten to (T-1, vocab).
            preds = model(train_datum)
            preds = preds[:, :-1, :].contiguous().view(-1, vocab_size)
            # Targets are the same sentence shifted left by one: (T-1,).
            targets = torch.unsqueeze(train_datum, 0)[:, 1:].contiguous().view(-1)

            loss = criterion(preds, targets)
            if torch.isnan(loss):
                print(train_datum)
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            ep_loss += loss.item()
            n_trained += 1
            if j > 0 and j % 1000 == 0:
                print('processed', j, 'training examples')
        # Average over the sentences actually trained on.
        print('epoch', epoch, 'epoch loss', ep_loss / max(n_trained, 1))



ps torch.Size([1, 17, 69233])
vs 62028
ts torch.Size([1, 18])


RuntimeError: shape '[-1, 62028]' is invalid for input of size 1107728

In [None]:
import random
import numpy as np
import torch
import torch.nn as nn
import os
# Load a trained checkpoint and generate one sentence from a prompt.
softmax = nn.Softmax(dim=-1)

# NOTE(review): `save_path` is not defined in this cell, and the model is looked
# up under this title; both must already exist in the session. Confirm where
# they are set.
book_title = 'GreatExpectations_nll.txt'
if os.path.exists(save_path):
    checkpoint = torch.load(save_path)
    print('Loading checkpoint')
    models[book_title].load_state_dict(checkpoint['net_state_dict'])
    #optimiser.load_state_dict(checkpoint['optimiser_state_dict'])
    models[book_title].eval()
else:
    print('No checkpoint found')
    # Raise instead of exit(): exit() tears down the notebook kernel.
    raise FileNotFoundError(save_path)

# True: sample the next word from the predicted distribution.
# False: always take the highest-scoring word (greedy decoding).
choose_randomly = True

# The original code generated one word and then up to 40 more.
MAX_NEW_WORDS = 41


def _pick_next_word(model, context_words):
    """Return the next word predicted by `model` after `context_words`."""
    indices = torch.LongTensor([wd2ix[wd] for wd in context_words])
    # Scores for the position after the last context word.
    last_scores = model(indices)[0, -1, :].detach()
    if choose_randomly:
        # Softmax gives the same sampling distribution as exp(scores), since
        # random.choices normalises its weights, but it cannot overflow to inf.
        weights = softmax(last_scores).numpy().tolist()
        return random.choices(words, weights)[0]
    return words[np.argmax(last_scores.numpy())]


with torch.no_grad():
    # Prompt. Others tried: ['there'], ['since'], ['when'], ['<s>'].
    accumulated_words = ['even', 'if']
    for _ in range(MAX_NEW_WORDS):
        chosen_wd = _pick_next_word(models[book_title], accumulated_words)
        # Stop at the end-of-sentence marker, including on the first word.
        if chosen_wd == '<e>':
            break
        accumulated_words.append(chosen_wd)
    for wd in accumulated_words:
        print(wd, end=' ')
    print()



Loading checkpoint
even if i had derived from the forge that else had had happened out it . 


Results:
 by that little knowledge i shall be sure of myself but for hold for that he would retort yourself of what he could hardly only fraud she was done and i looked down at the dressingtable how in my choice occasion

there was a long breath in the same place and the wind on the wall of the river .

even if i had been in my eyes and the whole of the house and the two was going to bed .

In [None]:
#Run TSNE on the embeddings and save the resulting coordinates as JSON.
import json
import torch
from google.colab import files
# Opens the Colab upload dialog; presumably tsne.py is uploaded here so that
# the import below succeeds -- confirm.
files.upload()
from tsne import Hbeta, x2p, pca, tsne
with torch.no_grad():
    # detach() is required: the embedding weight is a parameter that requires
    # grad, and .numpy() refuses such tensors even inside no_grad().
    embeddings = models[book_title].embeddings.weight.detach().numpy()
    print(embeddings.shape)

# NOTE(review): the arguments are presumably (output dims, PCA dims,
# perplexity); confirm against the uploaded tsne.py.
Y = tsne(embeddings, 2, 50, 30.0)
# Was `y.tolist()`, which is a NameError; the result is bound to `Y`.
Y_as_list = Y.tolist()
with open('drive/MyDrive/Colab Notebooks/Y.json', 'w') as f4:
    json.dump(Y_as_list, f4)


KeyboardInterrupt: ignored

In [None]:
import re
import json
import random
import collections
import os

# This notebook uses trigrams to create random artificial sentences
# based on the text in Charles Dickens' novel Great Expectations.

# trigram_dict[w1][w2] is the list of vocabulary indices of every word seen
# right after the pair (w1, w2). Repeats are kept, so random.choice samples
# in proportion to frequency.
trigram_dict = collections.defaultdict(lambda: collections.defaultdict(list))
# `vocab` maps index -> word and `wd2ix` word -> index. They are built together
# so membership tests are O(1) instead of scanning the list.
vocab = ['<s>', '<e>']
wd2ix = {'<s>': 0, '<e>': 1}
sentences = []
# Explicit encoding: the text contains '’' and '—'.
with open('drive/MyDrive/Colab Notebooks/GreatExpectations_nll.txt', 'r', encoding='utf-8') as f0:
    sentence_buffer = []
    for line in f0:
        line = line.rstrip()
        # Skip lines that start with three capital letters (headings).
        if re.search(r'^[A-Z][A-Z][A-Z]', line):
            continue
        # Every sentence-final mark becomes a standalone '.' token.
        line = line.replace(".", " .")
        line = line.replace("!", " .")
        line = line.replace(";", " .")
        line = line.replace("?", " .")
        line = line.replace("—", " ")
        line = re.sub(r'[^a-zA-Z\.’ ]', '', line)
        for wd in line.split():
            wd = wd.lower()
            sentence_buffer.append(wd)
            # '.' is the only terminator left after the replacements above.
            if wd == '.':
                sentences.append(['<s>'] + sentence_buffer + ['<e>'])
                sentence_buffer = []
            if wd not in wd2ix:
                wd2ix[wd] = len(vocab)
                vocab.append(wd)

# Inverse mapping with string keys, as JSON object keys are always strings.
ix2wd = {str(ix): wd for (wd, ix) in wd2ix.items()}

print('There are', len(vocab), 'words.')
for s in sentences:
    # Slide a window of three consecutive tokens over the sentence.
    for first, second, third in zip(s, s[1:], s[2:]):
        trigram_dict[first][second].append(wd2ix[third])

with open('drive/MyDrive/trigram_dict.json', 'w') as f1:
    json.dump(trigram_dict, f1)
with open('drive/MyDrive/wd2ix.json', 'w') as f2:
    json.dump(wd2ix, f2)
with open('drive/MyDrive/ix2wd.json', 'w') as f3:
    json.dump(ix2wd, f3)

# Generate one sentence: start from '<s>' plus a random word that was seen in
# sentence-initial position, then keep sampling a third word given the last two.
current_wd = '<s>'
next_wd = random.choice(list(trigram_dict['<s>'])) #Some random key of the dictionary that is the value of key '<s>'
print(next_wd, end=' ')
while True:
    if next_wd == '<e>':
        print()
        break
    pred_wd = ix2wd[str(random.choice(trigram_dict[current_wd][next_wd]))]
    if pred_wd == '<e>':
        print()
        break
    print(pred_wd, end=' ')
    current_wd = next_wd
    next_wd = pred_wd

#One run yielded: assistance was sent for me near the door .