# NLP

- Networks do not understand raw text so all text has to be encoded.
- Then it needs to be one-hot encoded

# Libraries

In [2]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Data

## Read data

In [3]:
with open('../Data/shakespeare.txt','r', encoding='utf8') as f:
    text = f.read()

In [6]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

## Encoding text

We will create a set of all characters, assign an id to each character, and build two dictionaries one with id > text, another with text > id

In [17]:
# Create a set of all unique characters in the text
all_characters = set(text)
len(all_characters)

84

In [24]:
# Create a decoder that reads the ID and returns the character.
# Assign an ID to each character and save it in a dictionary
decoder = dict(enumerate(all_characters))
decoder

{0: 'y',
 1: 'Y',
 2: '.',
 3: 't',
 4: ',',
 5: '>',
 6: '7',
 7: 'Z',
 8: '-',
 9: 'M',
 10: 'W',
 11: '0',
 12: 'P',
 13: 'G',
 14: 'd',
 15: '_',
 16: ']',
 17: "'",
 18: '1',
 19: '4',
 20: '`',
 21: 'a',
 22: 'V',
 23: 'p',
 24: 'f',
 25: 'h',
 26: 'w',
 27: 'z',
 28: '9',
 29: 'e',
 30: '?',
 31: '}',
 32: 'S',
 33: 'A',
 34: 'Q',
 35: 'u',
 36: ')',
 37: 'j',
 38: 'K',
 39: 'J',
 40: 'b',
 41: '<',
 42: 'F',
 43: 'T',
 44: 'l',
 45: 'm',
 46: 'C',
 47: 'o',
 48: ';',
 49: 'v',
 50: 'c',
 51: 'k',
 52: 'i',
 53: 'r',
 54: ':',
 55: 'X',
 56: 'n',
 57: '3',
 58: 'D',
 59: '\n',
 60: '2',
 61: '&',
 62: 'I',
 63: 'q',
 64: ' ',
 65: '5',
 66: 'N',
 67: '(',
 68: 'H',
 69: 's',
 70: 'x',
 71: 'g',
 72: '[',
 73: '6',
 74: '8',
 75: '!',
 76: 'R',
 77: 'B',
 78: 'E',
 79: '"',
 80: 'U',
 81: 'L',
 82: '|',
 83: 'O'}

In [25]:
# Create an encoder that reads the character and returns the ID.
encoder = {char:ind for ind,char in decoder.items()}
encoder

{'y': 0,
 'Y': 1,
 '.': 2,
 't': 3,
 ',': 4,
 '>': 5,
 '7': 6,
 'Z': 7,
 '-': 8,
 'M': 9,
 'W': 10,
 '0': 11,
 'P': 12,
 'G': 13,
 'd': 14,
 '_': 15,
 ']': 16,
 "'": 17,
 '1': 18,
 '4': 19,
 '`': 20,
 'a': 21,
 'V': 22,
 'p': 23,
 'f': 24,
 'h': 25,
 'w': 26,
 'z': 27,
 '9': 28,
 'e': 29,
 '?': 30,
 '}': 31,
 'S': 32,
 'A': 33,
 'Q': 34,
 'u': 35,
 ')': 36,
 'j': 37,
 'K': 38,
 'J': 39,
 'b': 40,
 '<': 41,
 'F': 42,
 'T': 43,
 'l': 44,
 'm': 45,
 'C': 46,
 'o': 47,
 ';': 48,
 'v': 49,
 'c': 50,
 'k': 51,
 'i': 52,
 'r': 53,
 ':': 54,
 'X': 55,
 'n': 56,
 '3': 57,
 'D': 58,
 '\n': 59,
 '2': 60,
 '&': 61,
 'I': 62,
 'q': 63,
 ' ': 64,
 '5': 65,
 'N': 66,
 '(': 67,
 'H': 68,
 's': 69,
 'x': 70,
 'g': 71,
 '[': 72,
 '6': 73,
 '8': 74,
 '!': 75,
 'R': 76,
 'B': 77,
 'E': 78,
 '"': 79,
 'U': 80,
 'L': 81,
 '|': 82,
 'O': 83}

In [26]:
# Encode the text as a numpy array
encoded_text = np.array([encoder[char] for char in text])
encoded_text[:100]

array([59, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
       64, 64, 64, 64, 64, 18, 59, 64, 64, 42, 53, 47, 45, 64, 24, 21, 52,
       53, 29, 69,  3, 64, 50, 53, 29, 21,  3, 35, 53, 29, 69, 64, 26, 29,
       64, 14, 29, 69, 52, 53, 29, 64, 52, 56, 50, 53, 29, 21, 69, 29,  4,
       59, 64, 64, 43, 25, 21,  3, 64,  3, 25, 29, 53, 29, 40,  0, 64, 40,
       29, 21, 35,  3,  0, 17, 69, 64, 53, 47, 69, 29, 64, 45, 52])

## One-hot encoding

We will be creating a one-hot encoding matrix of all characters in the text.

In [49]:
encoded_text.shape

(5445609,)

In [50]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    Returns a one-hot encoded matrix of shape (encoded_text.size, unique_characters)
    Parameters
    ----------
    - encoded_text [np.array]: batch of encoded text
    - num_uni_chars [int]: number of unique characters in the text
    '''
    
    # Create a matrix of zeros
    one_hot = np.zeros((encoded_text.size,num_uni_chars))
    
    # Convert the matrix to Float32 to ensure Torch compatibility
    one_hot = one_hot.astype(np.float32)
    
    # One-hot encode original matrix
    one_hot[np.arange(one_hot.shape[0]),encoded_text.flatten()] = 1.0
    
    # Reshape to match the batch size. 
    one_hot = one_hot.reshape((*encoded_text.shape,num_uni_chars))
    
    return one_hot
    