In [1]:
%matplotlib inline

# Extending Character Level Recurrent Networks for Sequence Generation

We extend the approach outlined in `char_rnn_one_file_code_gen.ipynb` to improve efficiency and to accommodate other recurrent layer types. Notably, we introduce:

* Training will use multiple files instead of a single file
* Validation sets will be introduced to avoid overfitting on our training data
* Mini-batches for training speedup

Our outlined task is still the same.

**Given a sequence of characters, predict the next likely character in the sequence.**

In [2]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import random
from json import load
from math import floor
from time import time
from statistics import mean

## Preparing the Data

We will be training on a set of 180 preprocessed Python files (and validating on a set of 20 other files) arbitrarily sampled from [GitHub BigQuery Python Extracts](https://bigquery.cloud.google.com/table/fh-bigquery:github_extracts.contents_py_201802snap?pli=1).

We limit the characters that our neural network can produce to a subset of standard ASCII.
* `ORD 0*, 2*, 3*, 9, 10, 32-126` (`*` marks the special characters)
  * `ORD 0` for padding (special, used in batch_size > 1 with variable length sequences)
  * `ORD 2` for start of text (special, predicted sequence start)
  * `ORD 3` for end of text (special, sequence prediction ends)
  * `ORD 9` horizontal tab "\t"
  * `ORD 10` NL line feed, new line "\n"
  * `ORD 32-126` Space, Punctuation, Digits, English Letters

NOTE: The full dataset contains files written using non standard characters. For the models in this notebook, we ensure that all Python files within our dataset are composed only of ASCII characters that we accept.

In [3]:
# Possible characters for the neural network, given as Unicode code points
VALID_UNICODE_IDS = (0, 2, 3, 9, 10) + tuple(range(32, 127))
for uid in VALID_UNICODE_IDS:
    if uid <= 32:
        # Control characters and the space are invisible: show their repr instead
        print("{}: {}".format(uid, repr(chr(uid))))
        continue
    print(chr(uid), end="")
print()

# Special Characters
PAD = chr(0)
FILE_START = chr(2)
FILE_END = chr(3)

CHARACTERS = set(chr(uid) for uid in VALID_UNICODE_IDS)
# Build the index <-> character tables from the ordered tuple, not from the
# set: set iteration order for strings changes between interpreter runs (hash
# randomisation), which would silently remap every one-hot index and make a
# trained model unusable after a kernel restart.
INT2CHAR = {idx: chr(uid) for idx, uid in enumerate(VALID_UNICODE_IDS)}
CHAR2INT = {char: idx for idx, char in INT2CHAR.items()}

# Load the training and validation corpora from their JSON files.
with open("./data/train.json", "r") as f:
    training_data = load(f)
with open("./data/validate.json", "r") as f:
    validation_data = load(f)

print("~" * 25)
print("Num Training Files: {}".format(len(training_data)))
print("Num Validation Files: {}".format(len(validation_data)))

# SANITY_CHECK, MEMORIZE ONE FILE
# NOTE: this replaces the corpora loaded above with a single file, wrapped in
# FILE_START / FILE_END markers and used for BOTH training and validation.
# Every later cell therefore runs on that one file, and the validation
# metrics measure memorisation rather than generalisation.
with open("./data/test.py", "r") as f:
    text = f.read()
training_data = ((FILE_START, ) + tuple(text) + (FILE_END,),)
validation_data = ((FILE_START, ) + tuple(text) + (FILE_END,),)
print("SANITY Training Files: {}".format(len(training_data)))
print("SANITY Validation Files: {}".format(len(validation_data)))

0: '\x00'
2: '\x02'
3: '\x03'
9: '\t'
10: '\n'
32: ' '
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
~~~~~~~~~~~~~~~~~~~~~~~~~
Num Training Files: 180
Num Validation Files: 20
SANITY Training Files: 1
SANITY Validation Files: 1


Let's randomly take a look at what is contained within our training data.

In [4]:
# Pick one training file at random and print its characters as text.
train_sample = random.choice(training_data)
print("".join(train_sample))

"""Predict Test"""
import sys
from os import getcwd

def main():
    sys.stdout.write(getcwd())
    for i in range(0, 10):
        print("{} : Boop".format(i), i)
    return False

if __name__ == "__main__":
    main()



# Utilize CUDA & GPU

Training neural networks can be slow. Utilize GPUs if they are available to us.

In [5]:
# Detect CUDA once and derive the matching device from the result.
HAS_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if HAS_CUDA else "cpu")
if not HAS_CUDA:
    print("CUDA is not available.")
else:
    print("CUDA is available")
    print(" • Number of CUDA devices: {}".format(torch.cuda.device_count()))
    print(" • Current Device Name: {}".format(torch.cuda.get_device_name(device)))
    print(" • Device CUDA Capability: {}".format(torch.cuda.get_device_capability(device)))

# force GPU off.
# Whatever was detected above is overridden here: the rest of the notebook
# runs on the CPU.
HAS_CUDA = False
device = torch.device("cpu")

CUDA is available
 • Number of CUDA devices: 1
 • Current Device Name: TITAN Xp
 • Device CUDA Capability: (6, 1)


## Batching Sliding Window Algorithm

We extend from the previously created sliding window algorithm, but rather than predicting the next character, we output the next sequence Y as one character shifted over:

* Given some iterable, we want a generator that yields X, Y pairs for evaluation.
* We want a sequence of a given context length as X, and the sequence one character shifted over as Y.

We want our sliding window algorithm to yield these pairs in batches, for more efficient computation. Iterable is no longer a single file, but an iterator over multiple files.

In [6]:
def batch_sliding_window_generator(files_iterable, batch_size, max_window_size=None, gen_forever=True):
    """Sliding window generator for batching files of ASCII characters.

    One batch is produced per file in ``files_iterable``. Each of the
    ``batch_size`` items of a batch is a randomly positioned window of that
    file; the target is the same window shifted one position to the right,
    so ``target[k]`` is the element that follows ``input[k]`` in the file.

    Args:
        files_iterable: Collection of files, each an indexable sequence
            (e.g. a tuple of characters) supporting ``len`` and slicing.
        batch_size: Number of windows per yielded batch.
        max_window_size: Upper bound on the window length. ``None`` uses the
            longest window a file allows, i.e. ``len(file) - 1``.
        gen_forever: When true, keep cycling over the files; otherwise stop
            after a single pass.

    Yields:
        ``(inp_batch, target_batch)``: two lists holding ``batch_size``
        sequences each. All sequences of one batch have the same length.
    """
    while True:
        yielded_any = False
        for iterable in files_iterable:
            # A window needs at least one input element plus its successor;
            # shorter files cannot produce a training pair.
            longest_window = len(iterable) - 1
            if longest_window < 1:
                continue

            if max_window_size is None:
                window_size = longest_window
            else:
                window_size = min(max_window_size, longest_window)

            # random.randint is inclusive on BOTH ends. The chunk spans
            # window_size + 1 elements, so the last valid start index is
            # len - window_size - 1; starting one later would silently
            # truncate the slice and produce ragged batches.
            last_start = len(iterable) - window_size - 1

            inp_batch = []
            target_batch = []
            # each batch should consist of a single file
            for _ in range(batch_size):
                start_index = random.randint(0, last_start)
                chunk = iterable[start_index:start_index + window_size + 1]
                inp_batch.append(chunk[:-1])
                target_batch.append(chunk[1:])

            yielded_any = True
            yield inp_batch, target_batch
        # Stop after one pass when asked to, and also when a whole pass
        # produced nothing (otherwise the loop would spin without yielding).
        if not gen_forever or not yielded_any:
            break

Here are the first few generator outputs.
Note the provided batch sizes and window sizes.

In [7]:
# Demo settings: two windows per batch, each at most 15 characters long
batch_size = 2
max_window_size = 15

# gen_forever is left at its default, so next() can be called any number of times
gen = batch_sliding_window_generator(training_data, batch_size, max_window_size)
for i in range(3):
    inp, target = next(gen)
    print("Batch {}".format(i))
    print("Input:")
    print(*inp, sep="\n")
    print("Target: ")
    print(*target, sep="\n")
    print("~" * 25)

Batch 0
Input:
('r', 'i', 't', 'e', '(', 'g', 'e', 't', 'c', 'w', 'd', '(', ')', ')', '\n')
('o', 'r', 't', ' ', 's', 'y', 's', '\n', 'f', 'r', 'o', 'm', ' ', 'o', 's')
Target: 
('i', 't', 'e', '(', 'g', 'e', 't', 'c', 'w', 'd', '(', ')', ')', '\n', ' ')
('r', 't', ' ', 's', 'y', 's', '\n', 'f', 'r', 'o', 'm', ' ', 'o', 's', ' ')
~~~~~~~~~~~~~~~~~~~~~~~~~
Batch 1
Input:
('o', 'r', 'm', 'a', 't', '(', 'i', ')', ',', ' ', 'i', ')', '\n', ' ', ' ')
('r', 'a', 'n', 'g', 'e', '(', '0', ',', ' ', '1', '0', ')', ':', '\n', ' ')
Target: 
('r', 'm', 'a', 't', '(', 'i', ')', ',', ' ', 'i', ')', '\n', ' ', ' ', ' ')
('a', 'n', 'g', 'e', '(', '0', ',', ' ', '1', '0', ')', ':', '\n', ' ', ' ')
~~~~~~~~~~~~~~~~~~~~~~~~~
Batch 2
Input:
('\n', 'f', 'r', 'o', 'm', ' ', 'o', 's', ' ', 'i', 'm', 'p', 'o', 'r', 't')
('_', '"', ':', '\n', ' ', ' ', ' ', ' ', 'm', 'a', 'i', 'n', '(', ')', '\n')
Target: 
('f', 'r', 'o', 'm', ' ', 'o', 's', ' ', 'i', 'm', 'p', 'o', 'r', 't', ' ')
('"', ':', '\n', ' ', ' ', ' 

## Batch Characters to Tensors

Our batch of inputs and outputs must be converted into Tensors for training.

We follow the default matrix convention used by the PyTorch community.

> Tensor’s data will be of size `T x B x *`, where `T` is the length of the longest sequence and `B` is the batch size. 

To make a batch training example, we join a bunch of our sequences of one-hot characters into a matrix of size `(window_size, batch_size, num_chars)`.

In [8]:
def char_to_tensor(char, num_chars=len(CHARACTERS)):
    """One-hot encode a single character as a tensor of size (1, 1, num_chars)."""
    one_hot = torch.zeros(1, 1, num_chars, device=device)
    one_hot[0, 0, CHAR2INT[char]] = 1
    return one_hot

def charseq_to_tensor(charseq, num_chars=len(CHARACTERS)):
    """One-hot encode one character sequence as a (len(charseq), 1, num_chars) tensor."""
    one_hot = torch.zeros(len(charseq), 1, num_chars, device=device)
    for position, symbol in enumerate(charseq):
        one_hot[position, 0, CHAR2INT[symbol]] = 1
    return one_hot

def charseqs_to_tensor(charseqs, num_chars=len(CHARACTERS)):
    """One-hot encode a batch of equally long character sequences.

    Returns a tensor of size (window_size, batch_size, num_chars); an
    AssertionError is raised when the sequences differ in length.
    """
    batch_size = len(charseqs)
    lengths = [len(seq) for seq in charseqs]
    window_size = max(lengths)
    assert all(length == window_size for length in lengths)

    one_hot = torch.zeros(window_size, batch_size, num_chars, device=device)

    for batch_pos, seq in enumerate(charseqs):
        for time_pos, symbol in enumerate(seq):
            one_hot[time_pos, batch_pos, CHAR2INT[symbol]] = 1
    return one_hot

The tensor inputs and outputs for the generator function we defined earlier are shown here.

In [None]:
# Draw one batch and convert both the inputs and the targets to one-hot tensors
gen = batch_sliding_window_generator(training_data, batch_size, max_window_size)

inp_val, target_val = next(gen)
inp_tensor = charseqs_to_tensor(inp_val)
target_tensor = charseqs_to_tensor(target_val)

print("batch_size: {}, max_window_size: {}".format(batch_size, max_window_size))
print("~" * 25)
# Character sequences followed by the size of their tensor encoding
print(*inp_val, sep="\n")
print(inp_tensor.size())
#_, idxs = inp_tensor.topk(1)
print(*target_val, sep="\n")
print(target_tensor.size())

batch_size: 2, max_window_size: 15
~~~~~~~~~~~~~~~~~~~~~~~~~
('i', 'n', 't', '(', '"', '{', '}', ' ', ':', ' ', 'B', 'o', 'o', 'p', '"')
('e', 't', 'u', 'r', 'n', ' ', 'F', 'a', 'l', 's', 'e', '\n', '\n', 'i', 'f')
torch.Size([15, 2, 100])
('n', 't', '(', '"', '{', '}', ' ', ':', ' ', 'B', 'o', 'o', 'p', '"', '.')
('t', 'u', 'r', 'n', ' ', 'F', 'a', 'l', 's', 'e', '\n', '\n', 'i', 'f', ' ')
torch.Size([15, 2, 100])


## Creating the Network

We are going to create our recurrent neural network. Rather than using our own pure recurrent neural network defined in the last notebook, we will be using the recurrent layers provided by PyTorch.

* **RNN**: [Recurrent Neural Network](https://pytorch.org/docs/stable/nn.html#rnn)
* **GRU**: [Gated Recurrent Unit](https://pytorch.org/docs/stable/nn.html#gru)
* **LSTM**: [Long Short-Term Memory](https://pytorch.org/docs/stable/nn.html#lstm)

Our model is composed of the following:
0. Recurrent Layer (RNN/GRU/LSTM)
0. Linear Layer
0. Softmax Activation

In [None]:
class CharRNN(nn.Module):
    """Character-level model: recurrent layer -> linear decoder -> log-softmax.

    Args:
        input_size: Number of features per input step (size of the one-hot
            character encoding).
        output_size: Number of output classes.
        hidden_size: Width of the recurrent hidden state.
        recurrent_type: ``"RNN"``, ``"LSTM"`` or ``"GRU"`` (case-insensitive).
        recurrent_layers: Number of stacked recurrent layers.
        recurrent_dropout: Dropout passed to the recurrent layer.

    Raises:
        ValueError: If ``recurrent_type`` is not one of the supported types.
    """

    # Supported recurrent layer classes, keyed by their upper-cased name.
    _RECURRENT_LAYERS = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}

    def __init__(self, input_size, output_size,
                 hidden_size=128, recurrent_type="LSTM", recurrent_layers=1, recurrent_dropout=0):
        super(CharRNN, self).__init__()

        self.recurrent_type = recurrent_type.upper()
        self.hidden_size = hidden_size
        self.recurrent_layers = recurrent_layers
        self.recurrent_dropout = recurrent_dropout
        # Misspelled name kept as an alias so code reading the old attribute still works.
        self.recurrent_dropoout = recurrent_dropout

        if self.recurrent_type not in self._RECURRENT_LAYERS:
            # Raising a plain string is itself a TypeError in Python 3, which
            # would hide the real problem; raise a proper exception instead.
            raise ValueError("Invalid recurrent layer type: {}".format(recurrent_type))

        self.rnn = self._RECURRENT_LAYERS[self.recurrent_type](
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=recurrent_layers,
            dropout=recurrent_dropout,
        )
        self.decoder = nn.Linear(hidden_size, output_size)
        # Inputs are (seq_len, batch, features), so classes live on dim 2.
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, inp_val, hidden):
        """Run the input through the model.

        Args:
            inp_val: Tensor of size (seq_len, batch_size, input_size).
            hidden: Hidden state as returned by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(output, hidden)``: log-probabilities of size
            (seq_len, batch_size, output_size) and the updated hidden state.
        """
        for_decoder, hidden = self.rnn(inp_val, hidden)
        for_softmax = self.decoder(for_decoder)
        output = self.softmax(for_softmax)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed hidden state for ``batch_size`` sequences.

        The state is created with the device and dtype of the model's own
        parameters, so it stays valid after the model has been moved.
        An LSTM gets a ``(hidden, cell)`` tuple; RNN and GRU get one tensor.
        """
        reference = next(self.parameters())
        shape = (self.recurrent_layers, batch_size, self.hidden_size)
        if self.recurrent_type == "LSTM":
            return (reference.new_zeros(shape), reference.new_zeros(shape))
        return reference.new_zeros(shape)

Initialize this network with values appropriate for the character prediction task.

In [None]:
# Input and output sizes both equal the number of valid characters
n_chars = len(CHARACTERS)
batch_size = 4

# Remaining constructor arguments keep their defaults
char_rnn = CharRNN(n_chars, n_chars)
if HAS_CUDA:
    char_rnn.cuda(device)

Before each batch, the RNN's hidden state should be initialized.

In [None]:
# Push a single batch through the untrained network to check the tensor sizes
max_window_size = 20
gen = batch_sliding_window_generator(
    training_data, batch_size, max_window_size, gen_forever=False)
inp_charseq, target_charseq = next(gen)
inp_val = charseqs_to_tensor(inp_charseq)

# Fresh zeroed hidden state sized for this batch
hidden = char_rnn.init_hidden(batch_size)

output, hidden = char_rnn(inp_val, hidden)
print("Input:", inp_val.size())
print("Output:", output.size())

Input: torch.Size([20, 4, 100])
Output: torch.Size([20, 4, 100])


## Preparing to Train our Model

We will create a helper function to convert the network predicted output back to a human readable character.

In [None]:
def readable_from_output(output):
    """Turn network output into the most likely characters.

    For every batch element, returns a list with one ``(character, index)``
    pair per sequence position, using the highest scoring class.
    """
    best_idx = output.topk(1)[1]
    n_steps, n_seqs, _ = best_idx.size()
    return [
        [
            (INT2CHAR[idx], idx)
            for idx in (best_idx[step][seq].item() for step in range(n_steps))
        ]
        for seq in range(n_seqs)
    ]

def pretty_print_output(batches_chars):
    """Print each batch element's characters joined into a single line."""
    for number, pairs in enumerate(batches_chars):
        joined = "".join(symbol for symbol, _ in pairs)
        print("Batch {}:".format(number), joined)

# Decode the output of the forward pass above; the network is still untrained here
batches_chars = readable_from_output(output)
pretty_print_output(batches_chars)

Batch 0: ]]]]]]]]]]]]]]]]]]]]
Batch 1: ]]]]]]]]]]]]]]]]]]]]
Batch 2: ]]]]]]]]]]]]]]]]]]]]
Batch 3: ]]]]]]]]]]]]]]]]]]]]


Show the network some examples, have it make predictions, and then inform the network when the predictions are correct.
We have a classification problem, so Negative Log Likelihood loss is appropriate.

For classifying input into `C` number of classes, the following loss functions are useful:

* **Cross Entropy Loss**: [CrossEntropyLoss docs](https://pytorch.org/docs/stable/nn.html#crossentropyloss)
* **Negative Log Likelihood Loss**: [NLLLoss docs](https://pytorch.org/docs/stable/nn.html#nllloss)

In [None]:
# criterion = nn.CrossEntropyLoss()
# NLLLoss expects log-probabilities, which CharRNN's final LogSoftmax layer provides.
criterion = nn.NLLLoss()

Each training loop will:
0. Create input and target tensors
0. Initialize a zeroed hidden state
0. Read each character in and keep the hidden state for the next character
0. Compare the output at every step to the target character and accumulate the loss
0. Back-propagate
0. Return the output and loss

In [None]:
def train(target_tensor, input_tensor, optimizer):
    """Run one optimisation step of ``char_rnn`` on a batch of windows.

    Args:
        target_tensor: One-hot targets, size (window_size, batch_size, num_chars).
        input_tensor: One-hot inputs of the same size.
        optimizer: Optimizer to step once after back-propagation.

    Returns:
        ``(output, loss)``: the network output of the final time step and
        the loss averaged over the time steps of the window.
    """
    window_size, batch_size, _ = input_tensor.size()

    # Set model gradients to zero
    char_rnn.zero_grad()
    loss = 0

    hidden = char_rnn.init_hidden(batch_size)
    # Class index of every target character: (window_size, batch_size, 1)
    _, target_indices = target_tensor.topk(1)
    for i in range(window_size):
        output, hidden = char_rnn(input_tensor.narrow(0, i, 1), hidden)
        # output is (1, batch_size, num_chars). Reshape to one row per batch
        # element so the loss works for any batch size, not only 1.
        loss += criterion(output.view(batch_size, -1), target_indices[i].view(-1))

    loss.backward()
    optimizer.step()

    return output, loss.item() / window_size

### Evaluation
We want to also evaluate the effectiveness of our model on a validation set, without updating the model.

This is done in the following function.

In [None]:
def evaluate(target_tensor, input_tensor):
    """Score ``char_rnn`` on a batch of windows without updating the model.

    Args:
        target_tensor: One-hot targets, size (window_size, batch_size, num_chars).
        input_tensor: One-hot inputs of the same size.

    Returns:
        ``(accuracy, loss)``: the fraction of characters predicted correctly
        over all time steps and batch elements, and the loss averaged over
        the time steps of the window.
    """
    window_size, batch_size, _ = input_tensor.size()

    # Class index of every target character: (window_size, batch_size, 1)
    _, target_indices = target_tensor.topk(1)

    loss = 0
    n_correct = 0

    # Evaluate in eval mode (disables dropout) and restore the previous mode after.
    was_training = char_rnn.training
    char_rnn.eval()
    try:
        # No gradients are needed here; skipping them avoids building a graph.
        with torch.no_grad():
            hidden = char_rnn.init_hidden(batch_size)
            for i in range(window_size):
                output, hidden = char_rnn(input_tensor.narrow(0, i, 1), hidden)
                # One row of log-probabilities per batch element
                step_output = output.view(batch_size, -1)
                step_target = target_indices[i].view(-1)
                _, predict_character = step_output.topk(1)

                n_correct += (predict_character.view(-1) == step_target).sum().item()
                loss += criterion(step_output, step_target)
    finally:
        char_rnn.train(was_training)

    accuracy = n_correct / (window_size * batch_size)
    return accuracy, loss.item() / window_size

## Training our Model

Let's train our model now.

In [None]:
# Initialize required variables
n_chars = len(CHARACTERS)
# One window per batch
batch_size = 1
# Start from a freshly initialised model, replacing the one created earlier
char_rnn = CharRNN(n_chars, n_chars, recurrent_type="LSTM")
if HAS_CUDA:
    char_rnn.cuda(device)
learning_rate = 0.0005
# The optimizer is bound to this model instance's parameters
char_rnn_optimizer = torch.optim.Adam(char_rnn.parameters(), lr=learning_rate)

# Helper function
def time_since(since):
    """Format the time elapsed since ``since`` (a ``time()`` value) as ``"<M>m <SS>s"``."""
    elapsed = int(time() - since)
    minutes, seconds = divmod(elapsed, 60)
    return "{}m {:0>2d}s".format(minutes, seconds)

# Keep track of all the training that is done.
# These live outside the training cell so that re-running the training cell
# continues the existing history instead of resetting it.
train_epoch_losses = []
eval_epoch_losses = []
# Number of training batches processed so far
batch_iter_idx = 0

In [None]:
# Variables which can change in-between training runs
n_epochs = 1500
# Print the training loss every this many training batches
print_every_train = 443
# Print the epoch summary every this many epochs
print_every_epoch = 100
# None lets the batch generator use each whole file as a single window
max_window_size = None

In [None]:
start = time()

for epoch_idx in range(n_epochs):
    # One pass over the training files; the generator stops after the last file.
    gen = batch_sliding_window_generator(
        training_data, batch_size, max_window_size, gen_forever=False)
    epoch_train_losses = []
    for input_char_seq, target_char_seq in gen:
        input_tensor = charseqs_to_tensor(input_char_seq)
        target_tensor = charseqs_to_tensor(target_char_seq)

        pred_output, train_loss = train(target_tensor, input_tensor, char_rnn_optimizer)
        epoch_train_losses.append(train_loss)

        if batch_iter_idx % print_every_train == 0:
            print("Epoch {:1d} {:.1f}%: iter {:5d} ({}) | Train Loss: {:.5f}".format(
                len(train_epoch_losses), epoch_idx/n_epochs * 100, batch_iter_idx,
                time_since(start), train_loss))
        batch_iter_idx += 1

    # Add the average loss over this epoch's batches to the list of losses.
    # Recording only the last batch's loss would make the curve depend on
    # whichever file happened to come last.
    mean_train_loss = mean(epoch_train_losses)
    train_epoch_losses.append(mean_train_loss)

    # Run our model on our validation data set to determine validation loss and accuracy
    gen = batch_sliding_window_generator(
        validation_data, batch_size, max_window_size, gen_forever=False)

    eval_accuracies = []
    eval_losses = []
    # Validation never back-propagates, so gradient tracking is switched off.
    with torch.no_grad():
        for input_char_seq, target_char_seq in gen:
            input_tensor = charseqs_to_tensor(input_char_seq)
            target_tensor = charseqs_to_tensor(target_char_seq)
            eval_accuracy, eval_loss = evaluate(target_tensor, input_tensor)
            eval_accuracies.append(eval_accuracy)
            eval_losses.append(eval_loss)

    mean_eval_loss = mean(eval_losses)
    mean_eval_acc = mean(eval_accuracies)

    eval_epoch_losses.append(mean_eval_loss)

    if len(train_epoch_losses) % print_every_epoch == 0:
        print("> Epoch {:1d} {:.1f}%: ({}) | Train Loss: {:.5f}, Mean Eval Loss: {:.5f}, Mean Acc: {:.2f}".format(
            len(train_epoch_losses), epoch_idx/n_epochs * 100,
            time_since(start), mean_train_loss, mean_eval_loss, mean_eval_acc))

Epoch 0 0.0%: iter     0 (0m 00s) | Train Loss: 4.59795
> Epoch 100 6.6%: (0m 17s) | Train Loss: 3.14074, Mean Eval Loss: 3.14207, Mean Acc: 0.19
> Epoch 200 13.3%: (0m 34s) | Train Loss: 2.87004, Mean Eval Loss: 2.86263, Mean Acc: 0.20
> Epoch 300 19.9%: (0m 51s) | Train Loss: 2.74334, Mean Eval Loss: 2.72821, Mean Acc: 0.23
> Epoch 400 26.6%: (1m 08s) | Train Loss: 2.30188, Mean Eval Loss: 2.29751, Mean Acc: 0.28
Epoch 443 29.5%: iter   443 (1m 15s) | Train Loss: 2.12387
> Epoch 500 33.3%: (1m 25s) | Train Loss: 1.89270, Mean Eval Loss: 1.89095, Mean Acc: 0.40
> Epoch 600 39.9%: (1m 42s) | Train Loss: 1.47570, Mean Eval Loss: 1.47778, Mean Acc: 0.57
> Epoch 700 46.6%: (1m 59s) | Train Loss: 1.11539, Mean Eval Loss: 1.12013, Mean Acc: 0.74
> Epoch 800 53.3%: (2m 16s) | Train Loss: 0.83196, Mean Eval Loss: 0.83565, Mean Acc: 0.89
Epoch 886 59.1%: iter   886 (2m 31s) | Train Loss: 1.57955
> Epoch 900 59.9%: (2m 33s) | Train Loss: 1.49931, Mean Eval Loss: 1.48600, Mean Acc: 0.63
> Epoch 

## Plotting the Training Process
Plotting the historical loss from `train_epoch_losses` shows how well the network is learning.

In [None]:
# Training loss recorded once per epoch, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.set_title("Batching Char-RNN Training Loss over Epoch")
ax.set_xlabel("Epoch")
ax.set_ylabel("Negative Log Likelihood")
ax.plot(train_epoch_losses)

In [None]:
# Mean validation loss recorded once per epoch, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.set_title("Batching Char-RNN Evaluation Loss over Epoch")
ax.set_xlabel("Epoch")
ax.set_ylabel("Negative Log Likelihood")
ax.plot(eval_epoch_losses)

## Using User Specified Input
Let's test this ourselves by supplying our own Python source code.

In [None]:
def generate(rnn, prime_str=FILE_START, print_output=True, max_gen_len=1000, temperature=None):
    """Generate text from ``rnn``, one character at a time.

    Args:
        rnn: Model providing ``init_hidden(batch_size)`` and returning
            ``(output, hidden)`` when called with ``(input, hidden)``.
        prime_str: Text used to build up the hidden state before generating.
            ``FILE_START`` is prepended when it is missing.
        print_output: When true, print the priming text and every generated
            character as it is produced.
        max_gen_len: Maximum number of characters to generate.
        temperature: ``None`` always picks the most likely character;
            otherwise characters are sampled from the output distribution
            after dividing the log-probabilities by this value.

    Returns:
        List of characters: the priming sequence followed by the generated
        characters, ending with ``FILE_END`` if the model produced it.

    Raises:
        ValueError: If ``temperature`` is given but not positive.
    """
    if temperature is not None and temperature <= 0:
        # Dividing by a non-positive temperature yields inf/NaN weights.
        raise ValueError("temperature must be positive, got {}".format(temperature))

    if not prime_str.startswith(FILE_START):
        prime_str = FILE_START + prime_str
    input_seq = list(prime_str)

    # Generate in eval mode and restore the previous mode afterwards.
    was_training = rnn.training
    rnn.eval()
    try:
        # Generation never back-propagates. Without no_grad the autograd graph
        # would keep growing through the hidden state on every step.
        with torch.no_grad():
            hidden = rnn.init_hidden(1)
            # use priming sequence to construct the hidden state
            input_tensor = charseq_to_tensor(input_seq)
            for i in range(len(input_seq)):
                output, hidden = rnn(input_tensor.narrow(0, i, 1), hidden)

            if print_output:
                print("".join(input_seq))
                print("~~~~Predict~~~~")
                print("".join(input_seq))

            # predict until max_len or FILE_END character is reached
            predicted = input_seq[:]
            for _ in range(max_gen_len):
                if temperature is not None:
                    # Sample from the network as a multinomial distribution
                    output_dist = output.view(-1).div(temperature).exp()
                    top_i = torch.multinomial(output_dist, 1)[0]
                    char = INT2CHAR[top_i.item()]
                else:
                    _, pred_char_idx = output.topk(1)
                    char = INT2CHAR[pred_char_idx.item()]
                if print_output:
                    print(char, end="")
                predicted.append(char)
                if char == FILE_END:
                    return predicted
                # Feed the chosen character back in as the next input
                input_tensor = char_to_tensor(char)
                output, hidden = rnn(input_tensor, hidden)
    finally:
        rnn.train(was_training)

    if print_output:
        print("~max_gen_len reached~")
    return predicted

In [None]:
# Prime with the FILE_START marker only and always pick the most likely character.
# Note: `output` now holds the returned list of characters, not a network tensor.
output = generate(char_rnn)

What happens when you use this with your own text?

In [None]:
# Prime the network with custom text; generate() prepends FILE_START itself
output = generate(char_rnn, "ale")