# character-wise RNN

![Overview](https://github.com/udacity/deep-learning/raw/78c91a5607ecfdc29b762e45c082d7ca5047c8a1/intro-to-rnns/assets/charseq.jpeg)

In [65]:

import time
from collections import namedtuple
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)

## Loading data

In [66]:
!curl https://raw.githubusercontent.com/udacity/deep-learning/master/intro-to-rnns/anna.txt > .pytorch/trialReport/trial.txt
with open('.pytorch/trialReport/trial.txt') as r:
    reports = r.read()
reports[:100]

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 1978k  100 1978k    0     0  14.2M      0 --:--:-- --:--:-- --:--:-- 14.2M


'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

## Tokenization

In [67]:
vocab = sorted(set(reports))
print('vocabs:\n', vocab)
int_to_vocab = dict(enumerate(vocab))
print('int to vocab:\n', int_to_vocab)
vocab_to_int = {v: i for i,v in int_to_vocab.items()}
encoded = np.array([vocab_to_int[v] for v in reports], dtype=np.int32)

vocabs:
 ['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
int to vocab:
 {0: '\n', 1: ' ', 2: '!', 3: '"', 4: '$', 5: '%', 6: '&', 7: "'", 8: '(', 9: ')', 10: '*', 11: ',', 12: '-', 13: '.', 14: '/', 15: '0', 16: '1', 17: '2', 18: '3', 19: '4', 20: '5', 21: '6', 22: '7', 23: '8', 24: '9', 25: ':', 26: ';', 27: '?', 28: '@', 29: 'A', 30: 'B', 31: 'C', 32: 'D', 33: 'E', 34: 'F', 35: 'G', 36: 'H', 37: 'I', 38: 'J', 39: 'K', 40: 'L', 41: 'M', 42: 'N', 43: 'O', 44: 'P', 45: 'Q', 46: 'R', 47: 'S', 48: 'T', 49: 'U', 50: 'V', 51: 'W', 52: 'X', 53: 'Y', 54: 'Z', 55: '_', 56: '`', 57: 'a', 58: 'b', 59: 'c', 60: 'd', 61: 'e', 62: 'f', 

In [68]:
encoded[:100]

array([31, 64, 57, 72, 76, 61, 74,  1, 16,  0,  0,  0, 36, 57, 72, 72, 81,
        1, 62, 57, 69, 65, 68, 65, 61, 75,  1, 57, 74, 61,  1, 57, 68, 68,
        1, 57, 68, 65, 67, 61, 26,  1, 61, 78, 61, 74, 81,  1, 77, 70, 64,
       57, 72, 72, 81,  1, 62, 57, 69, 65, 68, 81,  1, 65, 75,  1, 77, 70,
       64, 57, 72, 72, 81,  1, 65, 70,  1, 65, 76, 75,  1, 71, 79, 70,  0,
       79, 57, 81, 13,  0,  0, 33, 78, 61, 74, 81, 76, 64, 65, 70],
      dtype=int32)

## One-hot Encoding

In [69]:
def one_hot_encode(arr, n_labels):

    return np.eye(n_labels,n_labels,  dtype=np.float32)[arr]
# check that the function works as expected
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


## Batching

                M steps ( seq length )
               xxx                 xxx
               x                     x
               x                     x                                Starting sequence:
               x                     x                                [1 2 3 4 5 6 7 8 9 10 11 12]
               x                     x
N batch size   x                     x                                Batch size = 2
(No. of steps) x                     x                                [1 2 3 4 5 6]
               x                     x                                [7 8 9 10 11 12]
               x                     x
               x                     x                                Seq length = 3
               x                     x
               x                     x                                  ┌─────┐
               x                     x                                [ │1 2 3│ 4 5 6]
               x                     x                                [ │7 8 9│ 10 11 12]
               x                     x                                  └─────┘
               xxx                 xxx

            xxxxxxxxxxxxxx   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
                         xxxxx
                          xx

                          k= No. of batches = total chars/ N.M

In [70]:
def create_batches(arr, batch_size, seq_length):
    batch_size_total = batch_size * seq_length
    n_batches = len(arr) // batch_size_total
    arr = arr[:n_batches*batch_size_total]
    arr = arr.reshape((batch_size,-1))

    for n in range(0, arr.shape[1], seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [71]:
batches = create_batches(encoded, 8, 50)
x, y= next(batches)
print('x:\n', x[:10, :10])
print('\ny:\n', y[:10, :10])

x:
 [[31 64 57 72 76 61 74  1 16  0]
 [75 71 70  1 76 64 57 76  1 57]
 [61 70 60  1 71 74  1 57  1 62]
 [75  1 76 64 61  1 59 64 65 61]
 [ 1 75 57 79  1 64 61 74  1 76]
 [59 77 75 75 65 71 70  1 57 70]
 [ 1 29 70 70 57  1 64 57 60  1]
 [43 58 68 71 70 75 67 81 13  1]]

y:
 [[64 57 72 76 61 74  1 16  0  0]
 [71 70  1 76 64 57 76  1 57 76]
 [70 60  1 71 74  1 57  1 62 71]
 [ 1 76 64 61  1 59 64 65 61 62]
 [75 57 79  1 64 61 74  1 76 61]
 [77 75 75 65 71 70  1 57 70 60]
 [29 70 70 57  1 64 57 60  1 75]
 [58 68 71 70 75 67 81 13  1  3]]


## Defining model

In [72]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [73]:
class textgenRNN(nn.Module):
    def __init__(self,n_output, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.n_output = n_output

        self.lstm = nn.LSTM(n_output, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout1 = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(n_hidden, n_output)

    def forward(self, x, hidden):
        out, hid = self.lstm(x, hidden)
        out = self.dropout1(out)

        out = out.contiguous().view(-1, self.n_hidden)

        out = self.fc1(out)
        return out, hid

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))

        return hidden

In [74]:
from torchsummary import summary
n_hidden=512
n_layers=3
model = textgenRNN(len(vocab), n_hidden=n_hidden, n_layers=n_layers)
model.to(device)
print(model)

textgenRNN(
  (lstm): LSTM(83, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=512, out_features=83, bias=True)
)


In [75]:
import os
if os.path.isdir('checkpoint') and os.path.isfile('./checkpoint/trialckpt.t7'):
    model.load_state_dict(torch.load('./checkpoint/trialckpt.t7', map_location=torch.device(device)))

## Training

In [80]:
from sklearn.model_selection import train_test_split
epochs=100
batch_size=125
seq_length=100
lr=0.05
clip=5
test_portion=0.2

# change model mode to train
model.train()

optim = torch.optim.Adagrad(model.parameters(), lr=lr, lr_decay=0.0001)
criterion = nn.CrossEntropyLoss()

# create training and validation data
data, test_data = train_test_split(encoded, test_size=test_portion)

textgenRNN(
  (lstm): LSTM(83, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=512, out_features=83, bias=True)
)

In [81]:
counter = 0
n_vocab = len(vocab)
for e in range(epochs):
    # initialize hidden state
    h = model.init_hidden(batch_size)

    for x, y in create_batches(data, batch_size, seq_length):
        counter += 1

        # One-hot encode our data and make them Torch tensors
        x = one_hot_encode(x, n_vocab)
        inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

        inputs, targets = inputs.to(device), targets.to(device)

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        model.zero_grad()

        # get the output from the model
        output, h = model(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output, targets.view(batch_size*seq_length).long())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim.step()

        # loss stats
        if counter % 100 == 0:
            # Get validation loss
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for x, y in create_batches(test_data, batch_size, seq_length):
                # One-hot encode our data and make them Torch tensors
                x = one_hot_encode(x, n_vocab)
                x, y = torch.from_numpy(x), torch.from_numpy(y)

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, targets = x, y
                inputs, targets = inputs.to(device), targets.to(device)

                output, val_h = model(inputs, val_h)
                val_loss = criterion(output, targets.view(batch_size*seq_length).long())

                val_losses.append(val_loss.item())

            print('==> Saving model ...')

            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(model.state_dict(), './checkpoint/trialckpt.t7')

            model.train() # reset to train mode after iterationg through validation data

            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(np.mean(val_losses)))

==> Saving model ...
Epoch: 1/100... Step: 100... Loss: 3.1150... Val Loss: 3.1057
==> Saving model ...
Epoch: 2/100... Step: 200... Loss: 3.1270... Val Loss: 3.1066
==> Saving model ...
Epoch: 3/100... Step: 300... Loss: 3.1287... Val Loss: 3.1058
==> Saving model ...
Epoch: 4/100... Step: 400... Loss: 3.1007... Val Loss: 3.1050
==> Saving model ...
Epoch: 4/100... Step: 500... Loss: 3.0965... Val Loss: 3.1052
==> Saving model ...
Epoch: 5/100... Step: 600... Loss: 3.1261... Val Loss: 3.1053
==> Saving model ...
Epoch: 6/100... Step: 700... Loss: 3.1104... Val Loss: 3.1051
==> Saving model ...
Epoch: 7/100... Step: 800... Loss: 3.1101... Val Loss: 3.1051
==> Saving model ...
Epoch: 8/100... Step: 900... Loss: 3.1076... Val Loss: 3.1052
==> Saving model ...
Epoch: 8/100... Step: 1000... Loss: 3.1009... Val Loss: 3.1051
==> Saving model ...
Epoch: 9/100... Step: 1100... Loss: 3.0918... Val Loss: 3.1051
==> Saving model ...
Epoch: 10/100... Step: 1200... Loss: 3.0976... Val Loss: 3.1052


KeyboardInterrupt: ignored

# Predicting next character

In [82]:
def predict(model, inputs, h=None, top_k= None):
    x = np.array([[vocab_to_int[inputs]]])
    x = one_hot_encode(x, len(vocab))
    inp = torch.from_numpy(x).to(device)

    # get access to hidden state
    h = tuple([i.data for i in h])
    # pass inputs and hidden state to model
    out, h = model(inp, h)

    # get prob of the char
    p = F.softmax(out, dim=1).data
    p = p.to('cpu')

    if top_k is None:
        top_ch = np.arange(len(vocab))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().squeeze()

    p = p.numpy().squeeze()
    v = np.random.choice(top_ch, p=p/p.sum())

    return int_to_vocab[v], h


# Sample text

In [83]:
def sample(model, size, prime='The ', top_k=None):

    model.to(device)

    model.eval() # eval mode

    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = model.init_hidden(1)
    for ch in prime:
        char, h = predict(model, ch, h, top_k=top_k)

    chars.append(char)

    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(model, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)
print(sample(model, 1000, prime='The ', top_k=26))

The rnfyd,leshs, ,t,oiay  esklt erleot,welttoiftiekeehatmirik,aapg  t whnouhmegonecmmhebgernefytarndais ho eeriahmn atecs euhfoetpleg.n asertweih

t li   ee tihontmlgserl ai ymr mrtloiopdrm lo r aanfg  oast 
nmyeeatant s t toav  
edr itarrnvvpehowai c ny edy
rieiy lyh
 ,eir npdastdstke
aoa sult
  ntgto eewts w r oaenlh.o  ho eha .iiiv  hft  iaht nw
 uinndeoc.bayecetpaa teriwofeeofyhit suvgnelfrprhhakh .td  uutyyott oe losn y a teaeeth oigkuhcsstvhni  hesanus h     fftn nguhg,mlyyliru,t  tu a  e oeelamt heel
idaismnta nip
 s,ri 
face woeikoieteste ei
to upac yk 
lonhnmin
hh i,ttnshtmi i la h
dchna ik ua ogas slr hed ecrenoewohe tceeonatiethditae s a itw  s iynew irm
wnsss    sofstrcnegwso enth eaansae nto.a niret,s skhhtfuatnut e   tlhdo ry f,hrcom,myssa   o k wttterili,ts dtthev toel tyosf l ua 
drmwa ea.doh mcrld ewadsntlracetd c h glodhwtirp
w h st

sil yhfnanh uy lao nrat fsatie
rn
se d paic
helnwere r,tpn
gdoutdtta,cyo ,inniatrgeitkhmh e  dawdn  nnlt, nn o.one o  aa psovecelrdeelw 