# character-wise RNN

![Overview](https://github.com/udacity/deep-learning/raw/78c91a5607ecfdc29b762e45c082d7ca5047c8a1/intro-to-rnns/assets/charseq.jpeg)

In [28]:

import time
from collections import namedtuple
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)

## Loading data

In [29]:
!curl https://raw.githubusercontent.com/udacity/deep-learning/master/intro-to-rnns/anna.txt --create-dirs -o .pytorch/trialReport/trial.txt
with open('.pytorch/trialReport/trial.txt') as r:
    reports = r.read()
reports[:100]

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 1978k  100 1978k    0     0  7696k      0 --:--:-- --:--:-- --:--:-- 7696k


'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

## Tokenization

In [30]:
vocab = tuple(set(reports))
print('vocabs:\n', vocab)
int_to_vocab = dict(enumerate(vocab))
print('int to vocab:\n', int_to_vocab)
vocab_to_int = {v: i for i,v in int_to_vocab.items()}
encoded = np.array([vocab_to_int[v] for v in reports], dtype=np.int32)

vocabs:
 ('X', 'y', 'D', 'p', '?', '4', 'z', 'R', 'I', 'i', 'v', '9', 'w', 'T', 'F', '8', 'J', 'Y', 'K', 'S', 'W', '5', '1', 'M', 'U', 'a', 'k', ':', '3', 'H', 'g', 'o', 'u', 'm', '\n', '!', ' ', '(', ')', 'r', 'd', 'C', 'e', 'N', 'E', 'c', 'L', 'P', 'b', '_', '*', 'A', 's', '&', '-', '0', 'V', '2', 'l', 'O', '.', '`', '7', 'f', '/', "'", 'j', 'x', 'G', '%', 'Q', ',', ';', 'Z', 'n', 'q', 'h', 't', '"', '@', '6', 'B', '$')
int to vocab:
 {0: 'X', 1: 'y', 2: 'D', 3: 'p', 4: '?', 5: '4', 6: 'z', 7: 'R', 8: 'I', 9: 'i', 10: 'v', 11: '9', 12: 'w', 13: 'T', 14: 'F', 15: '8', 16: 'J', 17: 'Y', 18: 'K', 19: 'S', 20: 'W', 21: '5', 22: '1', 23: 'M', 24: 'U', 25: 'a', 26: 'k', 27: ':', 28: '3', 29: 'H', 30: 'g', 31: 'o', 32: 'u', 33: 'm', 34: '\n', 35: '!', 36: ' ', 37: '(', 38: ')', 39: 'r', 40: 'd', 41: 'C', 42: 'e', 43: 'N', 44: 'E', 45: 'c', 46: 'L', 47: 'P', 48: 'b', 49: '_', 50: '*', 51: 'A', 52: 's', 53: '&', 54: '-', 55: '0', 56: 'V', 57: '2', 58: 'l', 59: 'O', 60: '.', 61: '`', 62: '7', 

In [31]:
encoded[:100]

array([41, 76, 25,  3, 77, 42, 39, 36, 22, 34, 34, 34, 29, 25,  3,  3,  1,
       36, 63, 25, 33,  9, 58,  9, 42, 52, 36, 25, 39, 42, 36, 25, 58, 58,
       36, 25, 58,  9, 26, 42, 72, 36, 42, 10, 42, 39,  1, 36, 32, 74, 76,
       25,  3,  3,  1, 36, 63, 25, 33,  9, 58,  1, 36,  9, 52, 36, 32, 74,
       76, 25,  3,  3,  1, 36,  9, 74, 36,  9, 77, 52, 36, 31, 12, 74, 34,
       12, 25,  1, 60, 34, 34, 44, 10, 42, 39,  1, 77, 76,  9, 74],
      dtype=int32)

## One-hot Encoding

In [32]:
def one_hot_encode(arr, n_labels):

    return np.eye(n_labels,n_labels,  dtype=np.float32)[arr]
# check that the function works as expected
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


## Batching

                M steps ( seq length )
               xxx                 xxx
               x                     x
               x                     x                                Starting sequence:
               x                     x                                [1 2 3 4 5 6 7 8 9 10 11 12]
               x                     x
N batch size   x                     x                                Batch size = 2
(No. of steps) x                     x                                [1 2 3 4 5 6]
               x                     x                                [7 8 9 10 11 12]
               x                     x
               x                     x                                Seq length = 3
               x                     x
               x                     x                                  ┌─────┐
               x                     x                                [ │1 2 3│ 4 5 6]
               x                     x                                [ │7 8 9│ 10 11 12]
               x                     x                                  └─────┘
               xxx                 xxx

            xxxxxxxxxxxxxx   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
                         xxxxx
                          xx

                          k= No. of batches = total chars/ N.M

In [33]:
def create_batches(arr, batch_size, seq_length):
    batch_size_total = batch_size * seq_length
    n_batches = len(arr) // batch_size_total
    arr = arr[:n_batches*batch_size_total]
    arr = arr.reshape((batch_size,-1))

    for n in range(0, arr.shape[1], seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [34]:
batches = create_batches(encoded, 8, 50)
x, y= next(batches)
print('x:\n', x[:10, :10])
print('\ny:\n', y[:10, :10])

x:
 [[41 76 25  3 77 42 39 36 22 34]
 [52 31 74 36 77 76 25 77 36 25]
 [42 74 40 36 31 39 36 25 36 63]
 [52 36 77 76 42 36 45 76  9 42]
 [36 52 25 12 36 76 42 39 36 77]
 [45 32 52 52  9 31 74 36 25 74]
 [36 51 74 74 25 36 76 25 40 36]
 [59 48 58 31 74 52 26  1 60 36]]

y:
 [[76 25  3 77 42 39 36 22 34 34]
 [31 74 36 77 76 25 77 36 25 77]
 [74 40 36 31 39 36 25 36 63 31]
 [36 77 76 42 36 45 76  9 42 63]
 [52 25 12 36 76 42 39 36 77 42]
 [32 52 52  9 31 74 36 25 74 40]
 [51 74 74 25 36 76 25 40 36 52]
 [48 58 31 74 52 26  1 60 36 78]]


## Defining model

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [36]:
class textgenRNN(nn.Module):
    def __init__(self,n_output, n_hidden=256, n_layers=2, drop_prob=0.3, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.n_output = n_output

        self.lstm = nn.LSTM(n_output, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout1 = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(n_hidden, n_output)

    def forward(self, x, hidden):
        out, hid = self.lstm(x, hidden)
        out = self.dropout1(out)

        out = out.contiguous().view(-1, self.n_hidden)

        out = self.fc1(out)
        return out, hid

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))

        return hidden

In [37]:
from torchsummary import summary
n_hidden=512
n_layers=10
model = textgenRNN(len(vocab), n_hidden=n_hidden, n_layers=n_layers)
model.to(device)
print(model)

textgenRNN(
  (lstm): LSTM(83, 512, num_layers=10, batch_first=True, dropout=0.3)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=512, out_features=83, bias=True)
)


In [38]:
import os
if os.path.isdir('checkpoint') and os.path.isfile('./checkpoint/trialckpt.t7'):
    model.load_state_dict(torch.load('./checkpoint/trialckpt.t7', map_location=torch.device(device)))

## Training

In [39]:
from sklearn.model_selection import train_test_split
epochs=20
batch_size=128
seq_length=100
lr=0.030
clip=5
test_portion=0.1

# change model mode to train
model.train()

optim = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# create training and validation data
data, test_data = train_test_split(encoded, test_size=test_portion, shuffle=False, random_state=0)

In [40]:
counter = 0
n_vocab = len(vocab)
for e in range(epochs):
    # initialize hidden state
    h = model.init_hidden(batch_size)

    for x, y in create_batches(data, batch_size, seq_length):
        counter += 1

        # One-hot encode our data and make them Torch tensors
        x = one_hot_encode(x, n_vocab)
        inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

        inputs, targets = inputs.to(device), targets.to(device)

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        model.zero_grad()

        # get the output from the model
        output, h = model(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output, targets.view(batch_size*seq_length).long())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim.step()

        # loss stats
        if counter % 100 == 0:
            # Get validation loss
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for x, y in create_batches(test_data, batch_size, seq_length):
                # One-hot encode our data and make them Torch tensors
                x = one_hot_encode(x, n_vocab)
                x, y = torch.from_numpy(x), torch.from_numpy(y)

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, targets = x, y
                inputs, targets = inputs.to(device), targets.to(device)

                output, val_h = model(inputs, val_h)
                val_loss = criterion(output, targets.view(batch_size*seq_length).long())

                val_losses.append(val_loss.item())

            print('==> Saving model ...')

            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(model.state_dict(), './checkpoint/trialckpt.t7')

            model.train() # reset to train mode after iterationg through validation data

            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(np.mean(val_losses)))

==> Saving model ...
Epoch: 1/20... Step: 100... Loss: 3.1196... Val Loss: 3.1285
==> Saving model ...
Epoch: 2/20... Step: 200... Loss: 3.1090... Val Loss: 3.1296
==> Saving model ...
Epoch: 3/20... Step: 300... Loss: 3.1288... Val Loss: 3.1281
==> Saving model ...
Epoch: 3/20... Step: 400... Loss: 3.1156... Val Loss: 3.1244
==> Saving model ...
Epoch: 4/20... Step: 500... Loss: 3.1048... Val Loss: 3.1301
==> Saving model ...
Epoch: 5/20... Step: 600... Loss: 3.0841... Val Loss: 3.1279
==> Saving model ...
Epoch: 6/20... Step: 700... Loss: 3.1013... Val Loss: 3.1278
==> Saving model ...
Epoch: 6/20... Step: 800... Loss: 3.1136... Val Loss: 3.1302
==> Saving model ...
Epoch: 7/20... Step: 900... Loss: 3.0893... Val Loss: 3.1302
==> Saving model ...
Epoch: 8/20... Step: 1000... Loss: 3.1272... Val Loss: 3.1289
==> Saving model ...
Epoch: 8/20... Step: 1100... Loss: 3.1170... Val Loss: 3.1281
==> Saving model ...
Epoch: 9/20... Step: 1200... Loss: 3.0981... Val Loss: 3.1275
==> Saving mo

# Predicting next character

In [41]:
def predict(model, inputs, h=None, top_k= None):
    x = np.array([[vocab_to_int[inputs]]])
    x = one_hot_encode(x, len(vocab))
    inp = torch.from_numpy(x).to(device)

    # get access to hidden state
    h = tuple([i.data for i in h])
    # pass inputs and hidden state to model
    out, h = model(inp, h)

    # get prob of the char
    p = F.softmax(out, dim=1).data
    p = p.to('cpu')

    if top_k is None:
        top_ch = np.arange(len(vocab))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().squeeze()

    p = p.numpy().squeeze()
    v = np.random.choice(top_ch, p=p/p.sum())

    return int_to_vocab[v], h


# Sample text

In [44]:
def sample(model, size, prime='The ', top_k=None):

    model.to(device)

    model.eval() # eval mode

    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = model.init_hidden(1)
    for ch in prime:
        char, h = predict(model, ch, h, top_k=top_k)

    chars.append(char)

    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(model, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)
print(sample(model, 2000, prime='The ', top_k=5))

The      etea   to   o ta  oooeaet et at o a a o  oe e ttoee oteet   tetoe otota t e  a  t eo eteatoae     oeto  te eae at  ota  t   oe etet oetaea  t    otoooeeat e a tt aoa t ote etataateea   o tt etatt aeo e  ote aa  eaat  ae    eat    oaetaot  a ot eete oea   ta  teeoaetoe tta ae  teae oetote taeaetoe a tee t aoee ea e   ato  a  ttoa  e   e a aae  o  eta      a  aoeo e     a eooo e  eoeao tta e  too e      a a o e oa ao a e t eto otet t a tea  oet aat  at eae t  ee eeoeeoa tt o tett to ota    oot oaete    aoe a e  tttaaoeo te a tae o aao oaee to  e  e eeaa a e  oeotaeo  ee e a e  ot too ttoeeaoe oe eae taotaoetoo oetoe e taoet e o at  a   atee te t oeeoeaooaettta  ao to eteo e to  totae tttea    tt eeeoeo eo et o ae  et  ot  a  a  eea   ae te t  e  a  e  o  e  a te to ota  t ooee   aa a   aa t  ee oe to teao  t  aa   eoto oa te a  eeoaoaet   t oo  o   o  tet ao oaao t attte a  eota a ee  o   ot oaaa t e  eea  ototoe ot oa   ta e  ta a  aet   a o o    eoeae et te oe e a ae otetooo o