# NLP

- Networks do not understand raw text so all text has to be encoded.
- Then it needs to be one-hot encoded

# Libraries

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Data

## Read data

In [2]:
with open('../Data/shakespeare.txt','r', encoding='utf8') as f:
    text = f.read()

In [3]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

## Encoding text

We will create a set of all characters, assign an id to each character, and build two dictionaries one with id > text, another with text > id

In [4]:
# Create a set of all unique characters in the text
all_characters = set(text)
len(all_characters)

84

In [5]:
# Create a decoder that reads the ID and returns the character.
# Assign an ID to each character and save it in a dictionary
decoder = dict(enumerate(all_characters))
decoder

{0: 'U',
 1: 'v',
 2: 'Z',
 3: 'X',
 4: '1',
 5: "'",
 6: '8',
 7: '7',
 8: '5',
 9: '9',
 10: 'S',
 11: '|',
 12: '.',
 13: 'H',
 14: ';',
 15: 'y',
 16: 'P',
 17: 'O',
 18: 'E',
 19: '-',
 20: 'f',
 21: 'd',
 22: 'V',
 23: '>',
 24: 'n',
 25: 'G',
 26: 'o',
 27: 'I',
 28: 'l',
 29: '_',
 30: '(',
 31: ')',
 32: 'J',
 33: 'L',
 34: ':',
 35: 'x',
 36: 'M',
 37: 'Q',
 38: 'k',
 39: 'g',
 40: 'r',
 41: 'N',
 42: '<',
 43: 'm',
 44: 'j',
 45: 't',
 46: 'Y',
 47: '"',
 48: '?',
 49: ',',
 50: '3',
 51: 'K',
 52: ']',
 53: 'u',
 54: 'c',
 55: 'C',
 56: 'a',
 57: '2',
 58: 'D',
 59: '4',
 60: '\n',
 61: 's',
 62: ' ',
 63: 'p',
 64: 'q',
 65: 'T',
 66: 'R',
 67: 'b',
 68: 'h',
 69: 'B',
 70: 'w',
 71: 'W',
 72: '0',
 73: '6',
 74: 'z',
 75: '!',
 76: '`',
 77: 'A',
 78: 'F',
 79: '}',
 80: '[',
 81: '&',
 82: 'e',
 83: 'i'}

In [6]:
# Create an encoder that reads the character and returns the ID.
encoder = {char:ind for ind,char in decoder.items()}
encoder

{'U': 0,
 'v': 1,
 'Z': 2,
 'X': 3,
 '1': 4,
 "'": 5,
 '8': 6,
 '7': 7,
 '5': 8,
 '9': 9,
 'S': 10,
 '|': 11,
 '.': 12,
 'H': 13,
 ';': 14,
 'y': 15,
 'P': 16,
 'O': 17,
 'E': 18,
 '-': 19,
 'f': 20,
 'd': 21,
 'V': 22,
 '>': 23,
 'n': 24,
 'G': 25,
 'o': 26,
 'I': 27,
 'l': 28,
 '_': 29,
 '(': 30,
 ')': 31,
 'J': 32,
 'L': 33,
 ':': 34,
 'x': 35,
 'M': 36,
 'Q': 37,
 'k': 38,
 'g': 39,
 'r': 40,
 'N': 41,
 '<': 42,
 'm': 43,
 'j': 44,
 't': 45,
 'Y': 46,
 '"': 47,
 '?': 48,
 ',': 49,
 '3': 50,
 'K': 51,
 ']': 52,
 'u': 53,
 'c': 54,
 'C': 55,
 'a': 56,
 '2': 57,
 'D': 58,
 '4': 59,
 '\n': 60,
 's': 61,
 ' ': 62,
 'p': 63,
 'q': 64,
 'T': 65,
 'R': 66,
 'b': 67,
 'h': 68,
 'B': 69,
 'w': 70,
 'W': 71,
 '0': 72,
 '6': 73,
 'z': 74,
 '!': 75,
 '`': 76,
 'A': 77,
 'F': 78,
 '}': 79,
 '[': 80,
 '&': 81,
 'e': 82,
 'i': 83}

In [7]:
# Encode the text as a numpy array
encoded_text = np.array([encoder[char] for char in text])
encoded_text[:100]

array([60, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,
       62, 62, 62, 62, 62,  4, 60, 62, 62, 78, 40, 26, 43, 62, 20, 56, 83,
       40, 82, 61, 45, 62, 54, 40, 82, 56, 45, 53, 40, 82, 61, 62, 70, 82,
       62, 21, 82, 61, 83, 40, 82, 62, 83, 24, 54, 40, 82, 56, 61, 82, 49,
       60, 62, 62, 65, 68, 56, 45, 62, 45, 68, 82, 40, 82, 67, 15, 62, 67,
       82, 56, 53, 45, 15,  5, 61, 62, 40, 26, 61, 82, 62, 43, 83])

## One-hot encoding

We will be creating a one-hot encoding matrix of all characters in the text.

In [8]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    Returns a one-hot encoded matrix of shape (encoded_text.size, unique_characters)
    Parameters
    ----------
    - encoded_text [np.array]: batch of encoded text
    - num_uni_chars [int]: number of unique characters in the text
    '''
    
    # Create a matrix of zeros
    one_hot = np.zeros((encoded_text.size,num_uni_chars))
    
    # Convert the matrix to Float32 to ensure Torch compatibility
    one_hot = one_hot.astype(np.float32)
    
    # One-hot encode original matrix
    one_hot[np.arange(one_hot.shape[0]),encoded_text.flatten()] = 1.0
    
    # Reshape to match the batch size. 
    one_hot = one_hot.reshape((*encoded_text.shape,num_uni_chars))
    
    return one_hot
    

In [9]:
# Sample
one_hot_encoder(np.array([0,2,2,3,1]),4)

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]], dtype=float32)

## Training batches

The training batches target data will be the data shifted by one position. Instead of providing only the next letter, the entire context will be provided. This will allow the network to learn the gramatical rules, not just the most probable letter.

In [10]:
# Sample
t = [c for c in 'Hello there']
print(f'X = {t[:-1]}')
print(f'y = {t[1:]}')

X = ['H', 'e', 'l', 'l', 'o', ' ', 't', 'h', 'e', 'r']
y = ['e', 'l', 'l', 'o', ' ', 't', 'h', 'e', 'r', 'e']


We need to create a batch generator that will reshape the data to be of shape (batches, elements per batch)

In [11]:
# Example
sample_text = np.arange(100)
print(f'Original text = {sample_text}')
print(f'\nTransformed to 5 batches = \n{sample_text.reshape(10,-1)}')

Original text = [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]

Transformed to 5 batches = 
[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]
 [30 31 32 33 34 35 36 37 38 39]
 [40 41 42 43 44 45 46 47 48 49]
 [50 51 52 53 54 55 56 57 58 59]
 [60 61 62 63 64 65 66 67 68 69]
 [70 71 72 73 74 75 76 77 78 79]
 [80 81 82 83 84 85 86 87 88 89]
 [90 91 92 93 94 95 96 97 98 99]]


In [45]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    '''
    Generator object to create training batches as requested.
    Parameters
    ----------
    - encoded_text [np.array]: encoded text to be batched
    - samp_per_batch [int]: samples per batch that will be created
    - seq_len [int]: number of characters to include in each sample
    
    Output
    -------
    X [np.array]: encoded text of length seq_len
    y [np.array]: X shifted by one position to the right
    '''
    
    # Calculate total number of characters per batch
    chars_per_batch = samp_per_batch * seq_len
    
    # Calculate the total number of batches that can be made
    num_batches = int(len(encoded_text)/chars_per_batch)
    
    # Remove extra characters that won't fit into a batch
    encoded_text = encoded_text[:num_batches*chars_per_batch]
    
    # Reshape encoded text
    encoded_text = encoded_text.reshape((samp_per_batch,-1))
    
    # Generate sequences
    for i in range(0,encoded_text.shape[1],seq_len):
        
        X = encoded_text[:,i:i+seq_len]
        y = np.zeros_like(X)
        
        # Insert in y the x values shifted by one place. X is one position smaller than y
        y[:,:-1] = X[:,1:] 
        
        # Code block to handle last row in the data
        try:
            # Insert the following value. This is different to i:i+seq_len as i:i+seq_len 
            # will not include i+seq_len item. i+seq_len will only include that item
            y[:,-1] = encoded_text[:i+seq_len] 
        except:
            # Insert the first value instead of the last value
            y[:, -1] = encoded_text[:, 0]
            
        yield X,y

In [52]:
# Test sample text
test = np.arange(1000)
test_batches = generate_batches(test)
tx, ty = next(test_batches)

tx[0], ty[0]

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,  0]))