# character-wise RNN

![Overview](https://github.com/udacity/deep-learning/raw/78c91a5607ecfdc29b762e45c082d7ca5047c8a1/intro-to-rnns/assets/charseq.jpeg)

In [3]:

import time
from collections import namedtuple
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

## Loading data

In [14]:
# Read the whole report corpus into memory as a single string.
# The encoding is pinned explicitly: without it, open() falls back to the
# locale's preferred encoding, so the same file can decode into different
# characters (and therefore a different vocabulary) from machine to machine.
# NOTE(review): UTF-8 is assumed here -- confirm against the actual file.
with open('.pytorch/trialReport/trial.txt', encoding='utf-8') as r:
    reports = r.read()
# Preview the first 100 characters (shown as the notebook cell output).
reports[:100]

'35. ABDOMEN AND PELVIC SONOGRAPHY:\nLiver is normal in size and with normal parenchymal echogenicity '

## Tokenization

In [17]:
# Vocabulary: every distinct character of the corpus, sorted so that the
# character <-> id assignment is deterministic.
vocab = sorted(set(reports))
print('vocabs:\n', vocab)

# id -> character lookup; an id is the character's position in `vocab`.
int_to_vocab = dict(zip(range(len(vocab)), vocab))
print('int to vocab:\n', int_to_vocab)

# character -> id lookup, the inverse of `int_to_vocab`.
vocab_to_int = {ch: idx for idx, ch in enumerate(vocab)}

# Encode the whole corpus as a 1-D int32 array of character ids.
encoded = np.fromiter(
    (vocab_to_int[ch] for ch in reports),
    dtype=np.int32,
    count=len(reports),
)

vocabs:
 ['\n', ' ', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'أ', '—']
int to vocab:
 {0: '\n', 1: ' ', 2: '&', 3: "'", 4: '(', 5: ')', 6: '*', 7: ',', 8: '-', 9: '.', 10: '/', 11: '0', 12: '1', 13: '2', 14: '3', 15: '4', 16: '5', 17: '6', 18: '7', 19: '8', 20: '9', 21: ':', 22: ';', 23: '=', 24: 'A', 25: 'B', 26: 'C', 27: 'D', 28: 'E', 29: 'F', 30: 'G', 31: 'H', 32: 'I', 33: 'K', 34: 'L', 35: 'M', 36: 'N', 37: 'O', 38: 'P', 39: 'Q', 40: 'R', 41: 'S', 42: 'T', 43: 'U', 44: 'V', 45: 'W', 46: 'Y', 47: '\\', 48: 'a', 49: 'b', 50: 'c', 51: 'd', 52: 'e', 53: 'f', 54: 'g', 55: 'h', 56: 'i', 57: 'j', 58: 'k', 59: 'l', 60: 'm', 61: 'n', 62: 'o', 63: 'p', 64: 'q', 65: 'r', 66: 's

In [18]:
# Peek at the first 100 character ids of the encoded corpus.
encoded[:100]

array([14, 16,  9,  1, 24, 25, 27, 37, 35, 28, 36,  1, 24, 36, 27,  1, 38,
       28, 34, 44, 32, 26,  1, 41, 37, 36, 37, 30, 40, 24, 38, 31, 46, 21,
        0, 34, 56, 69, 52, 65,  1, 56, 66,  1, 61, 62, 65, 60, 48, 59,  1,
       56, 61,  1, 66, 56, 73, 52,  1, 48, 61, 51,  1, 70, 56, 67, 55,  1,
       61, 62, 65, 60, 48, 59,  1, 63, 48, 65, 52, 61, 50, 55, 72, 60, 48,
       59,  1, 52, 50, 55, 62, 54, 52, 61, 56, 50, 56, 67, 72,  1])

## One-hot Encoding

In [35]:
def one_hot_encode(arr, n_labels):
    """One-hot encode integer labels.

    Every integer in ``arr`` is replaced by the matching row of an
    ``n_labels`` x ``n_labels`` identity matrix, so the result has the
    shape of ``arr`` plus one trailing axis of length ``n_labels``.

    Args:
        arr: Integer label(s) used to index the rows of the identity matrix.
        n_labels: Number of distinct labels, i.e. the length of each
            one-hot vector.

    Returns:
        A float32 NumPy array holding one one-hot row per entry of ``arr``.
    """
    # Row i of the identity matrix is exactly the one-hot vector for label i.
    identity = np.eye(n_labels, dtype=np.float32)
    return identity[arr]
# Sanity check: encode one short sequence of labels (out of 8 possible
# labels) and print the resulting one-hot rows.
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(arr=test_seq, n_labels=8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


## Batching

                M steps ( seq length )
               xxx                 xxx
               x                     x
               x                     x                                Starting sequence:
               x                     x                                [1 2 3 4 5 6 7 8 9 10 11 12]
               x                     x
N batch size   x                     x                                Batch size = 2
(No. of seqs)  x                     x                                [1 2 3 4 5 6]
               x                     x                                [7 8 9 10 11 12]
               x                     x
               x                     x                                Seq length = 3
               x                     x
               x                     x                                  ┌─────┐
               x                     x                                [ │1 2 3│ 4 5 6]
               x                     x                                [ │7 8 9│ 10 11 12]
               x                     x                                  └─────┘
               xxx                 xxx

            xxxxxxxxxxxxxx   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
                         xxxxx
                          xx

                          k = No. of batches = total chars // (N * M)

In [36]:
def create_batches(arr, batch_size, seq_length):
    """Yield (input, target) mini-batches cut from a 1-D array.

    The array is trimmed to a whole number of batches and reshaped into
    ``batch_size`` rows. Consecutive windows of ``seq_length`` columns are
    then yielded together with their targets, which are the inputs shifted
    one step to the left.

    Args:
        arr: 1-D NumPy array to split (it must support ``reshape``).
        batch_size: Number of rows (sequences) in every batch.
        seq_length: Number of columns (steps) in every batch.

    Yields:
        Tuples ``(x, y)`` of ``batch_size`` x ``seq_length`` arrays. ``x`` is
        a slice of the reshaped data; ``y`` is a new array with the same
        dtype. The last target column of the final window wraps around to
        the first column of the reshaped data.
    """
    # Keep only as many items as fill complete batches.
    items_per_batch = batch_size * seq_length
    usable = (len(arr) // items_per_batch) * items_per_batch
    grid = arr[:usable].reshape((batch_size, -1))
    width = grid.shape[1]

    for start in range(0, width, seq_length):
        stop = start + seq_length
        inputs = grid[:, start:stop]

        # Targets are the inputs shifted by one step ...
        targets = np.zeros_like(inputs)
        targets[:, :-1] = inputs[:, 1:]
        # ... and the final target is the column right after the window,
        # or column 0 when the window is the last one in the row.
        following = stop if stop < width else 0
        targets[:, -1] = grid[:, following]

        yield inputs, targets

In [38]:
# Pull the first mini-batch (8 sequences of 50 steps each) and print the
# top-left 10 x 10 corner of its inputs and targets.
batches = create_batches(encoded, batch_size=8, seq_length=50)
x, y = next(batches)
print('x:\n', x[:10, :10])
print('\ny:\n', y[:10, :10])

x:
 [[14 16  9  1 24 25 27 37 35 28]
 [48 65 67  1 62 53  1 26 25 27]
 [50 48 69 56 67 72  9  0 18 14]
 [61 62 65 60 48 59  1 56 61  1]
 [54 61  1 62 53  1 66 63 48 50]
 [67 55 52  1 48 49 51 62 60 56]
 [65 72  1 49 59 48 51 51 52 65]
 [56 50  1 50 48 69 56 67 72  9]]

y:
 [[16  9  1 24 25 27 37 35 28 36]
 [65 67  1 62 53  1 26 25 27  1]
 [48 69 56 67 72  9  0 18 14  9]
 [62 65 60 48 59  1 56 61  1 66]
 [61  1 62 53  1 66 63 48 50 52]
 [55 52  1 48 49 51 62 60 56 61]
 [72  1 49 59 48 51 51 52 65  1]
 [50  1 50 48 69 56 67 72  9  0]]
