#Predicting a sequence of words using NLP

## 1) Importing the basic libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

ModuleNotFoundError: No module named 'torch'

## 2) Reading the text

In [2]:
with open('./Data/shakespeare.txt') as f:
  text = f.read()

len(text), type(text)

(5445609, str)

In [5]:
text[:100]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose mi"

In [44]:
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


## 3) Encoding and Decoding

In [7]:
# We are going for character level encoding

character_set = set(text)
print(f'{len(character_set)} \n')
print(np.array(character_set))

84 

{'t', 'N', 'D', 'L', '7', '}', '\n', 'a', '>', '2', 'J', ']', 'B', '|', 'U', 'b', 'h', 'o', 'Z', 'P', 'C', '9', 'm', "'", 'I', 'V', 'T', 'O', 'v', 'y', 'r', 'k', 'l', 'H', 'A', 'W', 'F', '.', '4', '&', 'G', 'z', ':', 'p', 'M', '0', '[', 'f', 'j', 'X', ' ', 'w', 'g', '5', 'K', 'x', 'Q', 'Y', '(', 'c', 'u', '1', ';', 'E', 'R', '<', '!', '_', '6', ',', '3', 'd', '8', '?', 's', '"', '`', 'i', ')', 'e', 'S', 'q', '-', 'n'}


In [0]:
#Creating encoder and decoder

#Decoder (num --> letter)

decoder = dict(enumerate(character_set))

In [0]:
#Encoder (letter --> num)

encoder = {each:idx for idx,each in decoder.items()}

In [10]:
#Converting the text into numerical values

encoded_text = np.array([encoder[ch] for ch in text])
encoded_text[:500]

array([ 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
       50, 50, 50, 50, 50, 61,  6, 50, 50, 36, 30, 17, 22, 50, 47,  7, 77,
       30, 79, 74,  0, 50, 59, 30, 79,  7,  0, 60, 30, 79, 74, 50, 51, 79,
       50, 71, 79, 74, 77, 30, 79, 50, 77, 83, 59, 30, 79,  7, 74, 79, 69,
        6, 50, 50, 26, 16,  7,  0, 50,  0, 16, 79, 30, 79, 15, 29, 50, 15,
       79,  7, 60,  0, 29, 23, 74, 50, 30, 17, 74, 79, 50, 22, 77, 52, 16,
        0, 50, 83, 79, 28, 79, 30, 50, 71, 77, 79, 69,  6, 50, 50, 12, 60,
        0, 50,  7, 74, 50,  0, 16, 79, 50, 30, 77, 43, 79, 30, 50, 74, 16,
       17, 60, 32, 71, 50, 15, 29, 50,  0, 77, 22, 79, 50, 71, 79, 59, 79,
        7, 74, 79, 69,  6, 50, 50, 33, 77, 74, 50,  0, 79, 83, 71, 79, 30,
       50, 16, 79, 77, 30, 50, 22, 77, 52, 16,  0, 50, 15, 79,  7, 30, 50,
       16, 77, 74, 50, 22, 79, 22, 17, 30, 29, 42,  6, 50, 50, 12, 60,  0,
       50,  0, 16, 17, 60, 50, 59, 17, 83,  0, 30,  7, 59,  0, 79, 71, 50,
        0, 17, 50,  0, 16

In [11]:
encoded_text.flatten()

array([ 6, 50, 50, ..., 63,  1,  2])

To convert a numpy array into one hot encoding : https://stackoverflow.com/questions/29831489/convert-array-of-indices-to-1-hot-encoded-numpy-array

### 3.1) Creating one-hot encoder

In [0]:
#Creating one-hot encoder for the encoded_text

def one_hot_encoder(encoded_text,num_uni_chars):
  #encoded text ---> Batch of encoded text
  #num_uni_chars ---> len(set(text))

  one_hot = np.zeros((encoded_text.size,num_uni_chars))

  one_hot = one_hot.astype(np.float32)  #for tensor

  one_hot[np.arange(one_hot.shape[0]),encoded_text.flatten()] = 1.0

  one_hot = one_hot.reshape((*encoded_text.shape,num_uni_chars))

  return one_hot

In [13]:
a = np.array([1,2,0])
print(a)
one_hot_encoder(a,3)

[1 2 0]


array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

## 4) Generating Training Batches

In [0]:
def generate_batches(encoded_text,samp_per_batch=10,seq_length=50):
  
  # X  = encoded text of len sequence length(x to x+50)  
  # y = encoded text of len sequence length(x+1 to x+51)
  # For example : if encoded_text has [0,1,2,3,4,5,6] and seq_len = 3, then X = [0,1,2] , y = [1,2,3]

  #How many characters per batch ??
  char_per_batch = samp_per_batch * seq_length

  # How many batches we can make, given the length of encoding text ??
  num_batches_avail = int(len(encoded_text)/char_per_batch)

  #Cut off the borders of the encoded text that does not fit in this range
  encoded_text = encoded_text[:char_per_batch * num_batches_avail]

  encoded_text = encoded_text.reshape((samp_per_batch,-1))

  # Go through each row in array.
  for n in range(0, encoded_text.shape[1], seq_length):
      
    # Grab feature characters
    x = encoded_text[:, n:n+seq_length]
    
    # y is the target shifted over by 1
    y = np.zeros_like(x)
    
    #
    try:
        y[:, :-1] = x[:, 1:]
        y[:, -1]  = encoded_text[:, n+seq_length]
        
    # FOR POTENTIAL INDEXING ERROR AT THE END    
    except:
        y[:, :-1] = x[:, 1:]
        y[:, -1] = encoded_text[:, 0]
        
    yield x, y

In [18]:
sample_text = np.arange(20)
sample_text

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [24]:
sam = generate_batches(sample_text,2,10)
x,y = next(sam)
print(x)
print(y)

[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]]
[[ 1  2  3  4  5  6  7  8  9  0]
 [11 12 13 14 15 16 17 18 19 10]]


## 5) Creating the LSTM model

In [0]:
class CharModel(nn.Module):
    
  def __init__(self, all_chars, num_hidden=256, num_layers=4,drop_prob=0.5,use_gpu=False):
      
      
      # SET UP ATTRIBUTES
      super().__init__()
      self.drop_prob = drop_prob
      self.num_layers = num_layers
      self.num_hidden = num_hidden
      self.use_gpu = use_gpu
      
      #CHARACTER SET, ENCODER, and DECODER
      self.all_chars = all_chars
      self.decoder = dict(enumerate(all_chars))
      self.encoder = {char: ind for ind,char in decoder.items()}
      
      
      self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)
      
      self.dropout = nn.Dropout(drop_prob)
      
      self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))
    
  
  def forward(self, x, hidden):
                
      
      lstm_output, hidden = self.lstm(x, hidden)
      
      
      drop_output = self.dropout(lstm_output)
      
      drop_output = drop_output.contiguous().view(-1, self.num_hidden)
      
      
      final_out = self.fc_linear(drop_output)
      
      
      return final_out, hidden
  
  
  def hidden_state(self, batch_size):
      
      if self.use_gpu:
          
          hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                    torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
      else:
          hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                    torch.zeros(self.num_layers,batch_size,self.num_hidden))
      
      return hidden
        

In [0]:
model = CharModel(
    all_chars=character_set,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [0]:
total_param  = []
for p in model.parameters():
    total_param.append(int(p.numel()))

In [29]:
sum(total_param)

5470292

## 6) Optimization and Loss Function

In [0]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

## 7) Training Data and Validation Data

In [31]:
train_percent = 0.9
int(len(encoded_text)*train_percent)

4901048

In [0]:
train_ind = int(len(encoded_text)*train_percent)
train_data = encoded_text[:train_ind]
val_data = encoded_text[train_ind:]
 

## 8) Training the network

In [0]:
## VARIABLES

# Epochs to train for
epochs = 50
# batch size 
batch_size = 128

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of characters in text
num_char = max(encoded_text)+1

In [34]:
# Set model to train
model.train()


# Check to see if using GPU
if model.use_gpu:
    model.cuda()

for i in range(epochs):
    
  hidden = model.hidden_state(batch_size)
    
  for x,y in generate_batches(train_data,batch_size,seq_len):
      
    tracker += 1
    
    x = one_hot_encoder(x,num_char)
    inputs = torch.from_numpy(x)
    targets = torch.from_numpy(y)
    if model.use_gpu:
        
        inputs = inputs.cuda()
        targets = targets.cuda()
        
    # Reset Hidden State
    # If we dont' reset we would backpropagate through all training history
    hidden = tuple([state.data for state in hidden])
    
    model.zero_grad()
    
    lstm_output, hidden = model.forward(inputs,hidden)
    loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
    
    loss.backward()
    
    # POSSIBLE EXPLODING GRADIENT PROBLEM!
    # LET"S CLIP JUST IN CASE
    nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
    
    optimizer.step()
    
    ### CHECK ON VALIDATION SET ######
    
    if tracker % 25 == 0:
       
      val_hidden = model.hidden_state(batch_size)
      val_losses = []
      model.eval()
      
      for x,y in generate_batches(val_data,batch_size,seq_len):
  
        x = one_hot_encoder(x,num_char)
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)

        if model.use_gpu:

            inputs = inputs.cuda()
            targets = targets.cuda()
            
        val_hidden = tuple([state.data for state in val_hidden])
        
        lstm_output, val_hidden = model.forward(inputs,val_hidden)
        val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())

        val_losses.append(val_loss.item())
      
      # Reset to training model after val for loop
      model.train()
      
      print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

Epoch: 0 Step: 25 Val Loss: 3.215010404586792
Epoch: 0 Step: 50 Val Loss: 3.2082674503326416
Epoch: 0 Step: 75 Val Loss: 6.7060418128967285
Epoch: 0 Step: 100 Val Loss: 3.2046127319335938
Epoch: 0 Step: 125 Val Loss: 3.146069288253784
Epoch: 0 Step: 150 Val Loss: 2.9709537029266357
Epoch: 0 Step: 175 Val Loss: 2.8704428672790527
Epoch: 0 Step: 200 Val Loss: 2.719435930252075
Epoch: 0 Step: 225 Val Loss: 2.627060651779175
Epoch: 0 Step: 250 Val Loss: 2.5036046504974365
Epoch: 0 Step: 275 Val Loss: 2.3791403770446777
Epoch: 0 Step: 300 Val Loss: 2.276585102081299
Epoch: 0 Step: 325 Val Loss: 2.2038164138793945
Epoch: 0 Step: 350 Val Loss: 2.1464829444885254
Epoch: 0 Step: 375 Val Loss: 2.099104642868042
Epoch: 1 Step: 400 Val Loss: 2.05954647064209
Epoch: 1 Step: 425 Val Loss: 2.025606393814087
Epoch: 1 Step: 450 Val Loss: 1.985239863395691
Epoch: 1 Step: 475 Val Loss: 1.951309323310852
Epoch: 1 Step: 500 Val Loss: 1.9205573797225952
Epoch: 1 Step: 525 Val Loss: 1.894120216369629
Epoch: 

## 9) Saving the model

In [0]:
model_name = 'trained_model.net'
torch.save(model.state_dict(),model_name)

## 10) Loading the model

In [0]:
# MUST MATCH THE EXACT SAME SETTINGS AS MODEL USED DURING TRAINING!

model = CharModel(
    all_chars=character_set,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [38]:
model.load_state_dict(torch.load(model_name))
model.eval()

CharModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

## 11) Generating the predictions

In [0]:
def predict_next_char(model, char, hidden=None, k=1):
        
  # Encode raw letters with model
  encoded_text = model.encoder[char]
  
  # set as numpy array for one hot encoding
  # NOTE THE [[ ]] dimensions!!
  encoded_text = np.array([[encoded_text]])
  
  # One hot encoding
  encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))
  
  # Convert to Tensor
  inputs = torch.from_numpy(encoded_text)
  
  # Check for CPU
  if(model.use_gpu):
    inputs = inputs.cuda()
  
  
  # Grab hidden states
  hidden = tuple([state.data for state in hidden])
  
  
  # Run model and get predicted output
  lstm_out, hidden = model(inputs, hidden)

  
  # Convert lstm_out to probabilities
  probs = F.softmax(lstm_out, dim=1).data
   
  if(model.use_gpu):
    # move back to CPU to use with numpy
    probs = probs.cpu()

  
  # k determines how many characters to consider
  # for our probability choice.
  # https://pytorch.org/docs/stable/torch.html#torch.topk
  
  # Return k largest probabilities in tensor
  probs, index_positions = probs.topk(k)
  
  
  index_positions = index_positions.numpy().squeeze()
  
  # Create array of probabilities
  probs = probs.numpy().flatten()
  
  # Convert to probabilities per index
  probs = probs/probs.sum()
  
  # randomly choose a character based on probabilities
  char = np.random.choice(index_positions, p=probs)
  
  # return the encoded value of the predicted char and the hidden state
  return model.decoder[char], hidden

In [0]:
def generate_text(model, size, seed='The', k=1):
      
  # CHECK FOR GPU
  if(model.use_gpu):
    model.cuda()
  else:
    model.cpu()
  
  # Evaluation mode
  model.eval()
  
  # begin output from initial seed
  output_chars = [c for c in seed]
  
  # intiate hidden state
  hidden = model.hidden_state(1)
  
  # predict the next character for every character in seed
  for char in seed:
    char, hidden = predict_next_char(model, char, hidden, k=k)
  
  # add initial characters to output
  output_chars.append(char)
  
  # Now generate for size requested
  for i in range(size):
          
    # predict based off very last letter in output_chars
    char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)
    
    # add predicted character
    output_chars.append(char)
  
  # return string of predicted text
  return ''.join(output_chars)

In [41]:
print(generate_text(model, 1500, seed='The ', k=3))

The fools and shallows,
    And show'd the cheers to the true cheek to show,
    Which will not had this trencher as to be
    The summer's sons, as this that stirs the way.
    This is not there, and what hath have been so.
    The windows are not well and then a storm,
    Which she shall be such as a state of him
    To seek her fortune, which was that they stay
    The brow to this aspect of that shall seem
    To see her bearing shows, and there in time
    We would not strike the world it. If you will,
    The strength the state is there.
  CORIOLANUS. The sea, think it in me.
    I will not be a mortal to his honour
    And set our country with her. Therefore, sir,
    The season of their standings are as love
    With their bestrews and statutes, and the throne
    And banish'd them. When I had stand to hear
    This the sound of their senses. Then there were they are
    That this day shall be so much. This is me.
    Thou hast been so true and a morning heart,
    That weak a