# character-wise RNN

![Overview](https://github.com/udacity/deep-learning/raw/78c91a5607ecfdc29b762e45c082d7ca5047c8a1/intro-to-rnns/assets/charseq.jpeg)

In [1]:

import time
from collections import namedtuple
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)

## Loading data

In [2]:
!curl http://www.gutenberg.org/files/2591/2591-0.txt --create-dirs -o .pytorch/trialReport/trial.txt
with open('.pytorch/trialReport/trial.txt') as r:
    reports = r.read()
reports[:100]

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0  547k    0   346    0     0    311      0  0:30:01  0:00:01  0:30:00   312
 11  547k   11 64594    0     0  44486      0  0:00:12  0:00:01  0:00:11 44516
100  547k  100  547k    0     0   269k      0  0:00:02  0:00:02 --:--:--  269k


'ï»؟The Project Gutenberg EBook of Grimmsâ€™ Fairy Tales, by The Brothers Grimm\n\nThis eBook is for th'

In [3]:
import re
words = re.split(r'\W+', reports)
words = [word.lower() for word in words]
print(words[:100])

['ï', 'the', 'project', 'gutenberg', 'ebook', 'of', 'grimmsâ', 'fairy', 'tales', 'by', 'the', 'brothers', 'grimm', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 're', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www', 'gutenberg', 'org', 'title', 'grimmsâ', 'fairy', 'tales', 'author', 'the', 'brothers', 'grimm', 'translator', 'edgar', 'taylor', 'and', 'marian', 'edwardes', 'posting', 'date', 'december', '14', '2008', 'ebook', '2591', 'release', 'date', 'april', '2001', 'last', 'updated', 'november', '7', '2016', 'language', 'english', 'character', 'set', 'encoding', 'utf', '8', 'start', 'of', 'this']


In [4]:
reports = ' '.join(words)

## Tokenization

In [5]:
vocab = tuple(set(reports))
print('vocabs:\n', vocab)
int_to_vocab = dict(enumerate(vocab))
print('int to vocab:\n', int_to_vocab)
vocab_to_int = {v: i for i,v in int_to_vocab.items()}
encoded = np.array([vocab_to_int[v] for v in reports], dtype=np.int32)

vocabs:
 ('p', '1', 'r', 'i', 'ک', '0', 'ï', 'g', 'q', '7', 'w', 'u', 'n', 'a', '4', 'd', '5', '9', '3', 't', 'k', ' ', 'c', 'x', 'j', 's', 'y', 'œ', 'b', 'e', 'h', 'l', 'z', '_', '8', '2', 'm', 'â', '6', 'v', 'f', 'o')
int to vocab:
 {0: 'p', 1: '1', 2: 'r', 3: 'i', 4: 'ک', 5: '0', 6: 'ï', 7: 'g', 8: 'q', 9: '7', 10: 'w', 11: 'u', 12: 'n', 13: 'a', 14: '4', 15: 'd', 16: '5', 17: '9', 18: '3', 19: 't', 20: 'k', 21: ' ', 22: 'c', 23: 'x', 24: 'j', 25: 's', 26: 'y', 27: 'œ', 28: 'b', 29: 'e', 30: 'h', 31: 'l', 32: 'z', 33: '_', 34: '8', 35: '2', 36: 'm', 37: 'â', 38: '6', 39: 'v', 40: 'f', 41: 'o'}


In [6]:
encoded[:100]

array([ 6, 21, 19, 30, 29, 21,  0,  2, 41, 24, 29, 22, 19, 21,  7, 11, 19,
       29, 12, 28, 29,  2,  7, 21, 29, 28, 41, 41, 20, 21, 41, 40, 21,  7,
        2,  3, 36, 36, 25, 37, 21, 40, 13,  3,  2, 26, 21, 19, 13, 31, 29,
       25, 21, 28, 26, 21, 19, 30, 29, 21, 28,  2, 41, 19, 30, 29,  2, 25,
       21,  7,  2,  3, 36, 36, 21, 19, 30,  3, 25, 21, 29, 28, 41, 41, 20,
       21,  3, 25, 21, 40, 41,  2, 21, 19, 30, 29, 21, 11, 25, 29])

## One-hot Encoding

In [7]:
def one_hot_encode(arr, n_labels):

    return np.eye(n_labels,n_labels,  dtype=np.float32)[arr]
# check that the function works as expected
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


## Batching

```
                M steps ( seq length )
               xxx                 xxx
               x                     x
               x                     x                                Starting sequence:
               x                     x                                [1 2 3 4 5 6 7 8 9 10 11 12]
               x                     x
N batch size   x                     x                                Batch size = 2
(No. of steps) x                     x                                [1 2 3 4 5 6]
               x                     x                                [7 8 9 10 11 12]
               x                     x
               x                     x                                Seq length = 3
               x                     x
               x                     x                                  ┌─────┐
               x                     x                                [ │1 2 3│ 4 5 6]
               x                     x                                [ │7 8 9│ 10 11 12]
               x                     x                                  └─────┘
               xxx                 xxx

            xxxxxxxxxxxxxx   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
                         xxxxx
                          xx

                          k= No. of batches = total chars/ N.M
```

In [8]:
def create_batches(arr, batch_size, seq_length):
    batch_size_total = batch_size * seq_length
    n_batches = len(arr) // batch_size_total
    arr = arr[:n_batches*batch_size_total]
    arr = arr.reshape((batch_size,-1))

    for n in range(0, arr.shape[1], seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [9]:
batches = create_batches(encoded, 8, 50)
x, y= next(batches)
print('x:\n', x[:10, :10])
print('\ny:\n', y[:10, :10])

x:
 [[ 6 21 19 30 29 21  0  2 41 24]
 [11 19 21 19 30 29 21 29 31 15]
 [21 19 30 29 21 10 13 26 21 22]
 [21 15 41 21 26 41 11  2 21 19]
 [30 26 21 10 30 13 19 21 25 30]
 [21 37 21 30 13 12 25 21 10 29]
 [29  2 21  3 12 25 19 29 13 15]
 [21 28 11 19 21 25 30 11 15 15]]

y:
 [[21 19 30 29 21  0  2 41 24 29]
 [19 21 19 30 29 21 29 31 15 29]
 [19 30 29 21 10 13 26 21 22 41]
 [15 41 21 26 41 11  2 21 19 13]
 [26 21 10 30 13 19 21 25 30 41]
 [37 21 30 13 12 25 21 10 29 12]
 [ 2 21  3 12 25 19 29 13 15 21]
 [28 11 19 21 25 30 11 15 15 29]]


## Defining model

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [11]:
class textgenRNN(nn.Module):
    def __init__(self,n_output, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.n_output = n_output

        self.lstm = nn.LSTM(n_output, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout1 = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(n_hidden, n_output)

    def forward(self, x, hidden):
        out, hid = self.lstm(x, hidden)
        out = self.dropout1(out)

        out = out.contiguous().view(-1, self.n_hidden)

        out = self.fc1(out)
        return out, hid

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        hidden = (torch.zeros(self.n_layers, batch_size, self.n_hidden).to(device),
                  torch.zeros(self.n_layers, batch_size, self.n_hidden).to(device))

        return hidden

In [12]:
from torchsummary import summary
n_hidden=512
n_layers=2
model = textgenRNN(len(vocab), n_hidden=n_hidden, n_layers=n_layers)
model.to(device)
print(model)

textgenRNN(
  (lstm): LSTM(42, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=512, out_features=42, bias=True)
)


In [13]:
import os
if os.path.isdir('checkpoint') and os.path.isfile('./checkpoint/trialckpt.t7'):
    model.load_state_dict(torch.load('./checkpoint/trialckpt.t7', map_location=torch.device(device)))

## Training

In [14]:
from sklearn.model_selection import train_test_split
epochs=100
batch_size=32
seq_length=64
lr=0.01
clip=5
test_portion=0.1

# change model mode to train
model.train()

optim = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# create training and validation data
data, test_data = train_test_split(encoded, test_size=test_portion, shuffle=False, random_state=0)

In [None]:
counter = 0
n_vocab = len(vocab)
for e in range(epochs):
    # initialize hidden state
    h = model.init_hidden(batch_size)

    for x, y in create_batches(data, batch_size, seq_length):
        counter += 1

        # One-hot encode our data and make them Torch tensors
        x = one_hot_encode(x, n_vocab)
        inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

        inputs, targets = inputs.to(device), targets.to(device)

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        model.zero_grad()

        # get the output from the model
        output, h = model(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output, targets.view(batch_size*seq_length).long())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim.step()

        # loss stats
        if counter % 100 == 0:
            # Get validation loss
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for x, y in create_batches(test_data, batch_size, seq_length):
                # One-hot encode our data and make them Torch tensors
                x = one_hot_encode(x, n_vocab)
                x, y = torch.from_numpy(x), torch.from_numpy(y)

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, targets = x, y
                inputs, targets = inputs.to(device), targets.to(device)

                output, val_h = model(inputs, val_h)
                val_loss = criterion(output, targets.view(batch_size*seq_length).long())

                val_losses.append(val_loss.item())

            print('==> Saving model ...')

            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(model.state_dict(), './checkpoint/trialckpt.t7')

            model.train() # reset to train mode after iterationg through validation data

            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(np.mean(val_losses)))

==> Saving model ...
Epoch: 1/100... Step: 100... Loss: 2.8470... Val Loss: 2.8846
==> Saving model ...
Epoch: 1/100... Step: 200... Loss: 2.5865... Val Loss: 2.6741
==> Saving model ...
Epoch: 2/100... Step: 300... Loss: 2.4895... Val Loss: 2.5240
==> Saving model ...
Epoch: 2/100... Step: 400... Loss: 2.3406... Val Loss: 2.4048
==> Saving model ...
Epoch: 3/100... Step: 500... Loss: 2.2576... Val Loss: 2.3761
==> Saving model ...
Epoch: 3/100... Step: 600... Loss: 2.2109... Val Loss: 2.2806
==> Saving model ...
Epoch: 4/100... Step: 700... Loss: 2.1588... Val Loss: 2.2261
==> Saving model ...
Epoch: 4/100... Step: 800... Loss: 2.0656... Val Loss: 2.1785


# Predicting next character

In [None]:
def predict(model, inputs, h=None, top_k= None):
    x = np.array([[vocab_to_int[inputs]]])
    x = one_hot_encode(x, len(vocab))
    inp = torch.from_numpy(x).to(device)

    # get access to hidden state
    h = tuple([i.data for i in h])
    # pass inputs and hidden state to model
    out, h = model(inp, h)

    # get prob of the char
    p = F.softmax(out, dim=1).data
    p = p.to('cpu')

    if top_k is None:
        top_ch = np.arange(len(vocab))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().squeeze()

    p = p.numpy().squeeze()
    v = np.random.choice(top_ch, p=p/p.sum())

    return int_to_vocab[v], h


# Sample text

In [None]:
def sample(model, size, prime='the ', top_k=None):

    model.to(device)

    model.eval() # eval mode

    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = model.init_hidden(1)
    for ch in prime:
        char, h = predict(model, ch, h, top_k=top_k)

    chars.append(char)

    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(model, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)
print(sample(model, 2000, prime='the ', top_k=26))