In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

## Load data

In [2]:
with open('data/anna.txt', 'r') as f:
    text = f.read()

In [3]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

## Tokenization

We create a couple dictionaries to convert the characters to and from integers. Encoding the characters as integers makes it easier to use as input in the network.

In [4]:
# encode the text and map each character to an integer and vice versa

# we create 2 dictionaries
# int2char
# char2int
chars = tuple(set(text))
#print(chars)
int2char = dict(enumerate(chars))
#print(int2char)
char2int = {ch: ii for ii, ch in int2char.items()}
#print(char2int)

# encode the text
encoded = np.array([char2int[ch] for ch in text])

In [5]:
encoded[:100]

array([46, 37, 77,  8, 57, 56, 25, 59,  7, 60, 60, 60, 21, 77,  8,  8, 62,
       59, 63, 77, 19, 54,  2, 54, 56, 27, 59, 77, 25, 56, 59, 77,  2,  2,
       59, 77,  2, 54, 58, 56, 71, 59, 56, 13, 56, 25, 62, 59, 45, 75, 37,
       77,  8,  8, 62, 59, 63, 77, 19, 54,  2, 62, 59, 54, 27, 59, 45, 75,
       37, 77,  8,  8, 62, 59, 54, 75, 59, 54, 57, 27, 59, 29, 47, 75, 60,
       47, 77, 62, 49, 60, 60, 39, 13, 56, 25, 62, 57, 37, 54, 75])

## Pre-processing the data

But, our LSTM expects an input that is one-hot encoded.

In [6]:
def one_hot_encode(arr, n_labels):
    
    # initialize the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    #print(one_hot)
    # finally reshape it to get backthe original way
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [7]:
# check that the function works as expected
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 10)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]]


## Making training mini-batches

In [8]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    ## TODO: Get the number of batches we can make
    batch_size_total = batch_size * seq_length
    n_batches = len(arr) // batch_size_total
    #print(n_batches)
    ## TODO: Keep only enough characters to make full batches
    arr = arr[:n_batches*batch_size_total]
    #print(len(arr))
    ## TODO: Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    #print(arr.shape)

    ## TODO: Iterate over the batches using a window of size seq_length
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        #print(x)
        # The targets, shifted by one
        y = np.zeros_like(x)
        #print(y)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

### Test the implementation

In [9]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [10]:
x, y = next(batches)
# printing out the first 10 items in a sequence
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[37 77  8  8 62 59 63 77 19 54]
 [78  3 37 67 59 19 56 25 61 62]
 [75 33 59 57 29 47 77 25 33 27]
 [59 47 37 54 61 37 59 75 29 75]
 [23  2 56 59 47 54 57 37 59 27]
 [56 59 57 29 59 37 54 19 49 59]
 [27 59 25 29 29 19 67 60 63 29]
 [56 59 57 77 58 56 75 59 45  8]]

y
 [[77  8  8 62 59 63 77 19 54  2]
 [ 3 37 67 59 19 56 25 61 62 64]
 [33 59 57 29 47 77 25 33 27 59]
 [47 37 54 61 37 59 75 29 75 56]
 [ 2 56 59 47 54 57 37 59 27 45]
 [59 57 29 59 37 54 19 49 59 21]
 [59 25 29 29 19 67 60 63 29 25]
 [59 57 77 58 56 75 59 45  8 59]]


# Define network

## Model Structure

In `__init__`:
- Create and store the dictonaries
- Define an LSTN layer that takes as params: an input size(number of characters), a hidden layer size `n_hidden`, a number of layers `n_layers`, a dropout prob, and a batch_first boolean
- define a dropout layer,
- define FC with params: input size `n_hidden` and output size(number of characters)
- Finally initialize the weights

`input_size` is the number of characters this cell expects to see as sequential input, and `n_hidden` is the number of units in the hidden layers in the cell.

Finally, in the `forward` function, we can stack up the LSTM cells into layers using `.view`. With this, you pass in a list of cells and it will send the output of one cell into the next cell.

In [11]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

No GPU available, training on CPU; consider making n_epochs very small.


In [12]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2, 
                 drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # dict
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        # layers of model
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.fc = nn.Linear(n_hidden, len(self.chars))
        
    def forward(self, x, hidden):
        
        # get the outpus and the new hidden state from the lstm
        out, hidden = self.lstm(x, hidden)
        
        out = self.dropout(out)
        
        # reshape
        out = out.contiguous().view(-1, self.n_hidden)
        
        out = self.fc(out)
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

# Time to Train

A couple of details about training: 
>* Within the batch loop, we detach the hidden state from its history; this time setting it equal to a new *tuple* variable because an LSTM has a hidden state that is a tuple of the hidden and cell states.
* We use [`clip_grad_norm_`](https://pytorch.org/docs/stable/_modules/torch/nn/utils/clip_grad.html) to help prevent exploding gradients.

In [13]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, 
          lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if(train_on_gpu):
        net.cuda()
        
    counter = 0
    n_chars = len(net.chars)
    
    for e in range(epochs):
        # initilaze hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            # one hot encode our data and make them torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            h = tuple([each.data for each in h])

            # zero accumulated grads
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)

                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])

                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())

                    val_losses.append(val_loss.item())

                net.train() # reset to train mode after iterationg through validation data

                print("Epoch: {}/{}...".format(e+1, epochs),
                        "Step: {}...".format(counter),
                        "Loss: {:.4f}...".format(loss.item()),
                        "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [14]:
## TODO: set your model hyperparameters
# define and print the net
n_hidden=32
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 32, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=32, out_features=83, bias=True)
)


In [15]:
batch_size = 64
seq_length = 50
n_epochs =  3# start small if you are just testing initial behavior

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

Epoch: 1/3... Step: 10... Loss: 4.3389... Val Loss: 4.3298
Epoch: 1/3... Step: 20... Loss: 4.1698... Val Loss: 4.1358
Epoch: 1/3... Step: 30... Loss: 3.7976... Val Loss: 3.7027
Epoch: 1/3... Step: 40... Loss: 3.5007... Val Loss: 3.3857
Epoch: 1/3... Step: 50... Loss: 3.3485... Val Loss: 3.2254
Epoch: 1/3... Step: 60... Loss: 3.2835... Val Loss: 3.1712
Epoch: 1/3... Step: 70... Loss: 3.2909... Val Loss: 3.1530
Epoch: 1/3... Step: 80... Loss: 3.2285... Val Loss: 3.1444
Epoch: 1/3... Step: 90... Loss: 3.2568... Val Loss: 3.1402
Epoch: 1/3... Step: 100... Loss: 3.2261... Val Loss: 3.1374
Epoch: 1/3... Step: 110... Loss: 3.2291... Val Loss: 3.1352
Epoch: 1/3... Step: 120... Loss: 3.2116... Val Loss: 3.1332
Epoch: 1/3... Step: 130... Loss: 3.1998... Val Loss: 3.1322
Epoch: 1/3... Step: 140... Loss: 3.2135... Val Loss: 3.1303
Epoch: 1/3... Step: 150... Loss: 3.2184... Val Loss: 3.1289
Epoch: 1/3... Step: 160... Loss: 3.1864... Val Loss: 3.1282
Epoch: 1/3... Step: 170... Loss: 3.1768... Val Lo

Epoch: 3/3... Step: 1380... Loss: 2.5839... Val Loss: 2.5083
Epoch: 3/3... Step: 1390... Loss: 2.6429... Val Loss: 2.5065
Epoch: 3/3... Step: 1400... Loss: 2.6308... Val Loss: 2.5059
Epoch: 3/3... Step: 1410... Loss: 2.5499... Val Loss: 2.5033
Epoch: 3/3... Step: 1420... Loss: 2.5558... Val Loss: 2.5015
Epoch: 3/3... Step: 1430... Loss: 2.5848... Val Loss: 2.5019
Epoch: 3/3... Step: 1440... Loss: 2.5964... Val Loss: 2.4997
Epoch: 3/3... Step: 1450... Loss: 2.6019... Val Loss: 2.4980
Epoch: 3/3... Step: 1460... Loss: 2.5903... Val Loss: 2.4967
Epoch: 3/3... Step: 1470... Loss: 2.6042... Val Loss: 2.4950
Epoch: 3/3... Step: 1480... Loss: 2.6425... Val Loss: 2.4931
Epoch: 3/3... Step: 1490... Loss: 2.6329... Val Loss: 2.4945
Epoch: 3/3... Step: 1500... Loss: 2.5550... Val Loss: 2.4909
Epoch: 3/3... Step: 1510... Loss: 2.5854... Val Loss: 2.4891
Epoch: 3/3... Step: 1520... Loss: 2.5944... Val Loss: 2.4867
Epoch: 3/3... Step: 1530... Loss: 2.5323... Val Loss: 2.4853
Epoch: 3/3... Step: 1540

## Checkpoint

In [16]:
# change the name, for saving multiple files
model_name = 'rnn_x_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

# Predict next character

In [21]:
def predict(net, char, h=None, top_k=None):
    '''
    Given a character, predict the next character
    Returns the predicted charater and hidden state
    '''
    # tensor inputs
    x = np.array([[net.char2int[char]]])
    x = one_hot_encode(x, len(net.chars))
    inputs = torch.from_numpy(x)
    
    if(train_on_gpu):
        inputs = inputs.cuda()
        
    # detach hidden state from history
    h = tuple([each.data for each in h])
    # get the output of the model
    out, h = net(inputs, h)
    
    # get the character prob
    p = F.softmax(out, dim=1).data
    
    if(train_on_gpu):
        p = p.cpu()
        
    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().squeeze()
    
    # select the likely next char with some element of randomness
    p = p.numpy().squeeze()
    char = np.random.choice(top_ch, p=p/p.sum())
    
    return net.int2char[char], h

### Priming and generating text 

Typically you'll want to prime the network so you can build up a hidden state. Otherwise the network will start out generating characters at random. In general the first bunch of characters will be a little rough since it hasn't built up a long history of characters to predict from.

In [22]:
def sample(net, size, prime='The', top_k=None):
        
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

### Loading a checkpoint

In [23]:
# Here we have loaded in a model that trained over 20 epochs `rnn_20_epoch.net`
with open('rnn_x_epoch.net', 'rb') as f:
    checkpoint = torch.load(f)
    
loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [24]:
# Sample using a loaded model
print(sample(loaded, 2000, top_k=5, prime="And Levin said"))

And Levin said her ween tan tatet ot te siters.
 Ir wont tar to waris winn herestes on tas ans at annd terand whe and honsses hhildet te ta to ate to, torered wan at tires the hild he wilotit har orere aten ass tind
his and
she an wosed he andise he watit asd ho her, an whind as wond hes tin she wase hhite thated ther wesann weto at thhe her an wat ans tat,, ad adetes or tores toet, on hind hit han ass hit ass weronde ar ant tat os arter ans the had he sirenn to sant toet
ar os and to her ho saters
to hin sot astes thos ad toe arese ther ad wosr ot an wint tile whos ande adesann wete went wer hit agilr to site hhe sennn as han tins
hos whint
hit, the she sent ond tora to ta thhar at ther hee sered to tire an adeted
tite te weed hont an sons witatad hhes te wan teet, hils he sotins ore wint to hire, het at to waris ad wene to tite are thit sasses ad har to hhe what ant thor
tasers his to ad, hin wilt
wond ol to hitite se toe tat thar os te sotit at ol,e. "Ied wes ot weend this hhas ote 