<a href="https://colab.research.google.com/github/bdostert/colab-neural-networks/blob/main/RNN_for_words_letter_by_letter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch #PyTorch is a Python module that can create neural networks and automatically do backpropagation for training a network.
import torch.nn as nn #Torch.nn is a submodule of torch that can create various types of networks and functions that operate on them.


# single-direction RNN, optionally tied embeddings
class Emb_RNN(nn.Module):
    """Character-level recurrent model that scores the next character at every step.

    Each input character index is embedded, fed through a single recurrent
    cell (``nn.LSTMCell`` or ``nn.RNNCell``), and projected by a linear layer
    to one score per character in the vocabulary.

    Args:
        params: dict providing ``'num_chs'`` (vocabulary size), ``'d_emb'``
            (embedding dimension) and ``'d_hid'`` (hidden dimension).
        use_LSTM: when true the recurrent cell is an ``nn.LSTMCell``,
            otherwise an ``nn.RNNCell``.
    """

    def __init__(self, params, use_LSTM=False):
        super().__init__()
        vocab_size = params['num_chs']
        self.d_embs = params['d_emb']  # dimension of embeddings
        self.d_hid = params['d_hid']  # dimension of hidden layer
        # One embedding vector per character in the vocabulary.
        self.embeddings = nn.Embedding(vocab_size, self.d_embs)
        self.use_LSTM = use_LSTM
        # Input-to-recurrent cell.
        cell_type = nn.LSTMCell if use_LSTM else nn.RNNCell
        self.i2R = cell_type(self.d_embs, self.d_hid)
        # Recurrent-to-output projection.
        self.R2o = nn.Linear(self.d_hid, vocab_size)
        # When both dimensions agree the output projection shares the
        # embedding matrix instead of owning a separate weight.
        if self.d_embs == self.d_hid:
            self.R2o.weight = self.embeddings.weight

    def forward(self, ch_indices):
        """Return the per-step character scores for one sequence of indices.

        Args:
            ch_indices: iterable of scalar index tensors, one per character.

        Returns:
            The per-step outputs of the linear layer stacked along dim 1.
        """
        state = None  # the cells start from a zero state when given None
        scores = []
        for ch_ix in ch_indices:
            # Add a leading batch dimension of size one.
            step_input = self.embeddings(ch_ix).unsqueeze(0)
            state = self.i2R(step_input, state)
            # The LSTM cell returns (hidden, context); the plain cell only hidden.
            hidden = state[0] if self.use_LSTM else state
            scores.append(self.R2o(hidden))
        return torch.stack(scores, dim=1)






In [None]:
import torch
import torch.nn as nn
import numpy as np
import re
import sys
import collections
import os
import random

# Run-time switches.
verbose = False
play = True  # enables the interactive guessing game inside train()

# Training schedule.
num_epochs = 3

# Hyperparameters can be changed here.
d_emb = 64
n_layers = 1
d_hid = 64
lr = 0.003
use_LSTM = True

# Short tag for the chosen recurrent cell; it becomes part of the checkpoint file name.
model_type = 'lstm' if use_LSTM else 'rnn'


def _play_round(pred_ixs, target_ixs, decode):
    """Play the two-player guessing game on one word and return the points scored.

    For every prefix of the target (from one character up to all but the last
    target character) both players type their guess for the prefix plus the
    next character, and the model's guess is the same true prefix plus the
    character it predicted for that position.

    Args:
        pred_ixs: list of predicted character indices, aligned with target_ixs.
        target_ixs: list of true character indices.
        decode: callable turning a list of indices into a string.

    Returns:
        Tuple ``(player1_points, player2_points, model_points)`` for this round.
    """
    your_score = 0
    your_score2 = 0
    model_score = 0
    for k in range(2, len(target_ixs)):
        prefix = decode(target_ixs[:k - 1])
        answer = decode(target_ixs[:k])
        # pred_ixs[j] is the prediction for target_ixs[j], so the character
        # that follows `prefix` (target_ixs[k - 1]) is predicted at index k - 1.
        model_guess = prefix + decode(pred_ixs[k - 1:k])
        print('Player 1 guess the continuation of', prefix, 'and press Enter')
        guess = input()
        print('Player 1 guessed', guess, 'The model guessed', model_guess)
        print('Player 2 guess the continuation of', prefix, 'and press Enter')
        guess2 = input()
        print('Player 2 guessed', guess2, 'The model guessed', model_guess)
        print('The actual target is', answer)
        if guess == answer:
            your_score += 1
        if guess2 == answer:
            your_score2 += 1
        if model_guess == answer:
            model_score += 1
        print('Player 1\'s score:', your_score, '\nPlayer 2\'s score:', your_score2, '\nModel score:', model_score)
    return your_score, your_score2, model_score


def train(net, words, params, play_game=None, index_to_char=None):
    """Train `net` to predict each next character of every word, one word per step.

    A checkpoint at ``params['save_path']`` is loaded first if it exists, and a
    new checkpoint (model and optimiser state) is written after every epoch.

    Args:
        net: module whose call on one word returns scores shaped
            ``(1, len(word), num_chars)``.
        words: list of 1-D integer tensors of character indices.
        params: dict with ``'save_path'``; ``'lr'`` and ``'epochs'`` are used
            when present, otherwise the module-level ``lr`` / ``num_epochs``.
        play_game: whether to run the interactive guessing game every 1000
            words; ``None`` falls back to the module-level ``play`` flag.
        index_to_char: mapping from ``str(index)`` to character used for
            display; ``None`` falls back to the module-level ``ix2ch``.

    Raises:
        SystemExit: if the loss becomes NaN (diagnostics are printed first).
    """
    # Fallbacks to module-level names are resolved lazily so that explicit
    # arguments never require those globals to exist.
    learning_rate = params['lr'] if 'lr' in params else lr
    epochs = params['epochs'] if 'epochs' in params else num_epochs
    interactive = play if play_game is None else play_game

    def decode(indices):
        table = ix2ch if index_to_char is None else index_to_char
        return ''.join(table[str(c)] for c in indices)

    criterion = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(net.parameters(), lr=learning_rate)
    if os.path.exists(params['save_path']):
        checkpoint = torch.load(params['save_path'])
        print('Loading checkpoint')
        net.load_state_dict(checkpoint['net_state_dict'])
        optimiser.load_state_dict(checkpoint['optimiser_state_dict'])
    net.train()  # always train in training mode, with or without a checkpoint

    for epoch in range(epochs):
        ep_loss = 0.
        num_tested = 0
        num_correct = 0
        your_epoch_score = 0
        your_epoch_score2 = 0
        model_epoch_score = 0
        # Visit the words in a fresh random order every epoch.
        for counter, i in enumerate(torch.randperm(len(words)).tolist()):
            word = words[i]
            if len(word) < 2:
                continue  # no next character to predict
            pred = net(word)
            # Drop the last step: position j predicts the character at j + 1.
            pred = pred[:, :-1, :].contiguous().view(-1, pred.size(-1))
            target = word[1:].contiguous().view(-1).long()

            pred_ixs = pred.detach().argmax(dim=1).tolist()
            target_ixs = target.tolist()
            num_tested += len(target_ixs)
            num_correct += sum(p == t for p, t in zip(pred_ixs, target_ixs))

            if counter % 1000 == 0:
                if interactive and counter != 0:
                    print('Player 1\'s score so far for this epoch', your_epoch_score)
                    print('Player 2\'s score so far for this epoch', your_epoch_score2)
                    print('The model\'s score so far for this epoch', model_epoch_score)
                    p1, p2, model_pts = _play_round(pred_ixs, target_ixs, decode)
                    your_epoch_score += p1
                    your_epoch_score2 += p2
                    model_epoch_score += model_pts
                    print()
                else:
                    print('Trained on', counter, 'words so far in this epoch')

            loss = criterion(pred, target)
            if torch.isnan(loss):
                # Show what produced the NaN, including the decoded word.
                print(pred, target, word, decode(word.tolist()))
                raise SystemExit('Training stopped: the loss became NaN.')
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            ep_loss += loss.item()
        accuracy = round(num_correct / num_tested, 4) if num_tested else 0.0
        print('Epoch', epoch, 'Accuracy', accuracy, 'Loss', ep_loss)
        print('Saving checkpoint')
        torch.save({'net_state_dict': net.state_dict(),  'optimiser_state_dict': optimiser.state_dict()}, params['save_path'])


In [None]:
import json
# Driver cell: for every word file, read the words, build the character
# vocabulary, save the index mappings as JSON, then build and train a model.
words = collections.defaultdict(list)  # file name -> list of '#'-delimited words
words_as_indices = {}  # file name -> list of LongTensors of character indices
models = {} #Put the models in a dictionary in case we train on multiple models for multiple datasets.
path = '' #No directory needs to be specified if the files are uploaded here.
#word_files = ['english']
word_files = ['ukwords.txt']
input('Make sure that the data files are uploaded to Colab for this session \nand then press Enter to continue.')
for word_file in word_files:
    chars = []  # vocabulary in order of first appearance; the position is the index
    seen_chars = set()  # membership test for the characters already in `chars`
    print(path+word_file)
    if not os.path.isfile(path+word_file):
        print('No file found with  name', word_file)
        raise SystemExit(1)

    print('Processing file', word_file)
    with open(path+word_file, 'r') as f0:
        for i, line in enumerate(f0):
            if i % 1000 == 0:
                print('Processed', i, 'lines.')
            # Lowercase the whole word once so that the stored word and the
            # vocabulary always agree; '#' marks the start and end of a word.
            line = '#' + line.rstrip().lower() + '#'
            if len(line) < 4:
                continue  # skip empty and one-character words
            words[word_file].append(line)
            for ch in line:
                if ch not in seen_chars:
                    seen_chars.add(ch)
                    chars.append(ch)

    total_chars = len(chars)
    print('total chars', total_chars)
    # Dictionaries for converting characters to indices and vice versa.
    # ix2ch is keyed by the index as a string, which is how train() looks it up.
    ch2ix = {char: i for i, char in enumerate(chars)}
    ix2ch = {str(i): char for i, char in enumerate(chars)}
    with open(word_file+'.ch2ix.json', 'w') as f1:
        json.dump(ch2ix, f1)
    with open(word_file+'.ix2ch.json', 'w') as f2:
        json.dump(ix2ch, f2)
    words_as_indices[word_file] = [
        torch.LongTensor([ch2ix[c] for c in word])
        for word in words[word_file]
    ]

    # The checkpoint file name encodes the data file and every hyperparameter,
    # so different configurations never load each other's weights.
    params = {'num_chs': total_chars,
              'd_emb': d_emb,
              'num_layers': n_layers,
              'd_hid': d_hid,
              'lr': lr,
              'epochs': num_epochs,
              'save_path': word_file+'.'+model_type+'.d_emb'+str(d_emb)+'.n_layers'+str(n_layers)+'.d_hid'+str(d_hid)+'.lr'+str(lr)+'.pth'}

    models[word_file] = Emb_RNN(params, use_LSTM)
    train(models[word_file], words_as_indices[word_file], params)


Make sure that the data files are uploaded to Colab for this session 
and then press Enter to continue.
ukwords.txt
Processing file ukwords.txt
Processed 0 lines.
Processed 1000 lines.
Processed 2000 lines.
Processed 3000 lines.
Processed 4000 lines.
Processed 5000 lines.
Processed 6000 lines.
Processed 7000 lines.
Processed 8000 lines.
Processed 9000 lines.
Processed 10000 lines.
Processed 11000 lines.
Processed 12000 lines.
total chars 27
Loading checkpoint
Trained on 0 words so far in this epoch
Player 1's score so far for this epoch 0
Player 2's score so far for this epoch 0
The model's score so far for this epoch 0
Player 1 guess the continuation of s and press Enter
st
Player 1 guessed st The model guessed sr
Player 2 guess the continuation of s and press Enter
sa
Player 2 guessed sa The model guessed sr
The actual target is st
Player 1's score: 1 
Player 2's score: 0 
Model score: 0
Player 1 guess the continuation of st and press Enter
str
Player 1 guessed str The model guessed 

KeyboardInterrupt: Interrupted by user