#Predicting a sequence of words.

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## 1) Importing the basic libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

## 2) Reading the text

In [0]:
with open('drive/My Drive/the_war_of_the_worlds.txt') as f:
  text = f.read()

len(text), type(text)

(336832, str)

In [0]:
text[:100]

'BOOK ONE\n\nTHE COMING OF THE MARTIANS\n\nCHAPTER ONE \n \nTHE EVE OF THE WAR\n \nNo one would have believed'

In [0]:
print(text[:500])

BOOK ONE

THE COMING OF THE MARTIANS

CHAPTER ONE 
 
THE EVE OF THE WAR
 
No one would have believed in the last years of the nineteenth century that this world was being watched keenly and closely by intelligences greater than man’s and yet as mortal as his own; that as men busied themselves about their various concerns they were scrutinised and studied, perhaps almost as narrowly as a man with a microscope might scrutinise the transient creatures that swarm and multiply in a drop of water. Wit


## 3) Encoding and Decoding

In [0]:
# We are going for character level encoding

character_set = set(text)
print(f'{len(character_set)} \n')
print(np.array(character_set))

80 

{'G', 'X', '\n', 'I', 'z', '2', 'Q', 'u', '’', 'J', 't', 'V', '.', 'C', 'T', '0', 'x', '‘', 'y', 'v', '\u2003', '(', '3', 'o', 'O', 'R', 'r', 'Y', '5', '1', '-', 'j', 'A', 'm', '?', '8', 'k', 'f', '9', 'Z', 'H', 'd', '4', 'L', '!', '…', 'M', 'E', 's', 'p', 'q', 'l', ',', ';', 'S', 'i', 'B', 'w', 'D', ')', 'U', 'a', '\t', 'n', 'e', '—', 'F', 'K', 'b', 'N', 'W', '"', ':', '&', '#', 'g', 'P', 'h', ' ', 'c'}


In [0]:
#Creating encoder and decoder

#Decoder (num --> letter)

decoder = dict(enumerate(character_set))

In [0]:
#Encoder (letter --> num)

encoder = {each:idx for idx,each in decoder.items()}

In [0]:
#Converting the text into numerical values

encoded_text = np.array([encoder[ch] for ch in text])
encoded_text[:500]

array([56, 24, 24, 67, 78, 24, 69, 47,  2,  2, 14, 40, 47, 78, 13, 24, 46,
        3, 69,  0, 78, 24, 66, 78, 14, 40, 47, 78, 46, 32, 25, 14,  3, 32,
       69, 54,  2,  2, 13, 40, 32, 76, 14, 47, 25, 78, 24, 69, 47, 78,  2,
       78,  2, 14, 40, 47, 78, 47, 11, 47, 78, 24, 66, 78, 14, 40, 47, 78,
       70, 32, 25,  2, 78,  2, 69, 23, 78, 23, 63, 64, 78, 57, 23,  7, 51,
       41, 78, 77, 61, 19, 64, 78, 68, 64, 51, 55, 64, 19, 64, 41, 78, 55,
       63, 78, 10, 77, 64, 78, 51, 61, 48, 10, 78, 18, 64, 61, 26, 48, 78,
       23, 37, 78, 10, 77, 64, 78, 63, 55, 63, 64, 10, 64, 64, 63, 10, 77,
       78, 79, 64, 63, 10,  7, 26, 18, 78, 10, 77, 61, 10, 78, 10, 77, 55,
       48, 78, 57, 23, 26, 51, 41, 78, 57, 61, 48, 78, 68, 64, 55, 63, 75,
       78, 57, 61, 10, 79, 77, 64, 41, 78, 36, 64, 64, 63, 51, 18, 78, 61,
       63, 41, 78, 79, 51, 23, 48, 64, 51, 18, 78, 68, 18, 78, 55, 63, 10,
       64, 51, 51, 55, 75, 64, 63, 79, 64, 48, 78, 75, 26, 64, 61, 10, 64,
       26, 78, 10, 77, 61

In [0]:
encoded_text.flatten()

array([56, 24, 24, ..., 78, 78,  2])

To convert a numpy array into one hot encoding : https://stackoverflow.com/questions/29831489/convert-array-of-indices-to-1-hot-encoded-numpy-array

### 3.1) Creating one-hot encoder

In [0]:
#Creating one-hot encoder for the encoded_text

def one_hot_encoder(encoded_text,num_uni_chars):
  #encoded text ---> Batch of encoded text
  #num_uni_chars ---> len(set(text))

  one_hot = np.zeros((encoded_text.size,num_uni_chars))

  one_hot = one_hot.astype(np.float32)  #for tensor

  one_hot[np.arange(one_hot.shape[0]),encoded_text.flatten()] = 1.0

  one_hot = one_hot.reshape((*encoded_text.shape,num_uni_chars))

  return one_hot

In [0]:
a = np.array([1,2,0])
print(a)
one_hot_encoder(a,3)

[1 2 0]


array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

## 4) Generating Training Batches

In [0]:
def generate_batches(encoded_text,samp_per_batch=10,seq_length=50):
  
  # X  = encoded text of len sequence length(x to x+50)  
  # y = encoded text of len sequence length(x+1 to x+51)
  # For example : if encoded_text has [0,1,2,3,4,5,6] and seq_len = 3, then X = [0,1,2] , y = [1,2,3]

  #How many characters per batch ??
  char_per_batch = samp_per_batch * seq_length

  # How many batches we can make, given the length of encoding text ??
  num_batches_avail = int(len(encoded_text)/char_per_batch)

  #Cut off the borders of the encoded text that does not fit in this range
  encoded_text = encoded_text[:char_per_batch * num_batches_avail]

  encoded_text = encoded_text.reshape((samp_per_batch,-1))

  # Go through each row in array.
  for n in range(0, encoded_text.shape[1], seq_length):
      
    # Grab feature characters
    x = encoded_text[:, n:n+seq_length]
    
    # y is the target shifted over by 1
    y = np.zeros_like(x)
    
    #
    try:
        y[:, :-1] = x[:, 1:]
        y[:, -1]  = encoded_text[:, n+seq_length]
        
    # FOR POTENTIAL INDEXING ERROR AT THE END    
    except:
        y[:, :-1] = x[:, 1:]
        y[:, -1] = encoded_text[:, 0]
        
    yield x, y

In [0]:
sample_text = np.arange(20)
sample_text

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [0]:
sam = generate_batches(sample_text,2,10)
x,y = next(sam)
print(x)
print(y)

[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]]
[[ 1  2  3  4  5  6  7  8  9  0]
 [11 12 13 14 15 16 17 18 19 10]]


## 5) Creating the LSTM model

In [0]:
class CharModel(nn.Module):
    
  def __init__(self, all_chars, num_hidden=256, num_layers=4,drop_prob=0.5,use_gpu=False):
      
      
      # SET UP ATTRIBUTES
      super().__init__()
      self.drop_prob = drop_prob
      self.num_layers = num_layers
      self.num_hidden = num_hidden
      self.use_gpu = use_gpu
      
      #CHARACTER SET, ENCODER, and DECODER
      self.all_chars = all_chars
      self.decoder = dict(enumerate(all_chars))
      self.encoder = {char: ind for ind,char in decoder.items()}
      
      
      self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)
      
      self.dropout = nn.Dropout(drop_prob)
      
      self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))
    
  
  def forward(self, x, hidden):
                
      
      lstm_output, hidden = self.lstm(x, hidden)
      
      
      drop_output = self.dropout(lstm_output)
      
      drop_output = drop_output.contiguous().view(-1, self.num_hidden)
      
      
      final_out = self.fc_linear(drop_output)
      
      
      return final_out, hidden
  
  
  def hidden_state(self, batch_size):
      
      if self.use_gpu:
          
          hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                    torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
      else:
          hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                    torch.zeros(self.num_layers,batch_size,self.num_hidden))
      
      return hidden
        

In [0]:
model = CharModel(
    all_chars=character_set,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [0]:
total_param  = []
for p in model.parameters():
    total_param.append(int(p.numel()))

In [0]:
sum(total_param)

5460048

## 6) Optimization and Loss Function

In [0]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

## 7) Training Data and Validation Data

In [0]:
train_percent = 0.9
int(len(encoded_text)*train_percent)

303148

In [0]:
train_ind = int(len(encoded_text)*train_percent)
train_data = encoded_text[:train_ind]
val_data = encoded_text[train_ind:]
 

## 8) Training the network

In [0]:
## VARIABLES

# Epochs to train for
epochs = 100
# batch size 
batch_size = 128

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of characters in text
num_char = max(encoded_text)+1

In [0]:
# Set model to train
model.train()


# Check to see if using GPU
if model.use_gpu:
    model.cuda()

for i in range(epochs):
    
  hidden = model.hidden_state(batch_size)
    
  for x,y in generate_batches(train_data,batch_size,seq_len):
      
    tracker += 1
    
    x = one_hot_encoder(x,num_char)
    inputs = torch.from_numpy(x)
    targets = torch.from_numpy(y)
    if model.use_gpu:
        
        inputs = inputs.cuda()
        targets = targets.cuda()
        
    # Reset Hidden State
    # If we dont' reset we would backpropagate through all training history
    hidden = tuple([state.data for state in hidden])
    
    model.zero_grad()
    
    lstm_output, hidden = model.forward(inputs,hidden)
    loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
    
    loss.backward()
    
    # POSSIBLE EXPLODING GRADIENT PROBLEM!
    # LET"S CLIP JUST IN CASE
    nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
    
    optimizer.step()
    
    ### CHECK ON VALIDATION SET ######
    
    if tracker % 25 == 0:
       
      val_hidden = model.hidden_state(batch_size)
      val_losses = []
      model.eval()
      
      for x,y in generate_batches(val_data,batch_size,seq_len):
  
        x = one_hot_encoder(x,num_char)
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)

        if model.use_gpu:

            inputs = inputs.cuda()
            targets = targets.cuda()
            
        val_hidden = tuple([state.data for state in val_hidden])
        
        lstm_output, val_hidden = model.forward(inputs,val_hidden)
        val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())

        val_losses.append(val_loss.item())
      
      # Reset to training model after val for loop
      model.train()
      
      print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

Epoch: 1 Step: 25 Val Loss: 3.022536516189575
Epoch: 2 Step: 50 Val Loss: 3.0179052352905273
Epoch: 3 Step: 75 Val Loss: 3.01595401763916
Epoch: 4 Step: 100 Val Loss: 3.004434823989868
Epoch: 5 Step: 125 Val Loss: 2.8606786727905273
Epoch: 6 Step: 150 Val Loss: 2.695924758911133
Epoch: 7 Step: 175 Val Loss: 2.626570463180542
Epoch: 8 Step: 200 Val Loss: 2.526420831680298
Epoch: 9 Step: 225 Val Loss: 2.362206220626831
Epoch: 10 Step: 250 Val Loss: 2.2275004386901855
Epoch: 11 Step: 275 Val Loss: 2.156747341156006
Epoch: 13 Step: 300 Val Loss: 2.094712495803833
Epoch: 14 Step: 325 Val Loss: 2.041759967803955
Epoch: 15 Step: 350 Val Loss: 1.9899237155914307
Epoch: 16 Step: 375 Val Loss: 1.9497240781784058
Epoch: 17 Step: 400 Val Loss: 1.9127224683761597
Epoch: 18 Step: 425 Val Loss: 1.8756123781204224
Epoch: 19 Step: 450 Val Loss: 1.8424971103668213
Epoch: 20 Step: 475 Val Loss: 1.8128924369812012
Epoch: 21 Step: 500 Val Loss: 1.7901405096054077
Epoch: 22 Step: 525 Val Loss: 1.76271474361

## 9) Saving the model

In [0]:
model_name = 'war_of_worlds_trained_model.net'
torch.save(model.state_dict(),model_name)

## 10) Loading the model

In [0]:
# MUST MATCH THE EXACT SAME SETTINGS AS MODEL USED DURING TRAINING!

model = CharModel(
    all_chars=character_set,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [0]:
model.load_state_dict(torch.load(model_name))
model.eval()

CharModel(
  (lstm): LSTM(80, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc_linear): Linear(in_features=512, out_features=80, bias=True)
)

## 11) Generating the predictions

In [0]:
def predict_next_char(model, char, hidden=None, k=1):
        
  # Encode raw letters with model
  encoded_text = model.encoder[char]
  
  # set as numpy array for one hot encoding
  # NOTE THE [[ ]] dimensions!!
  encoded_text = np.array([[encoded_text]])
  
  # One hot encoding
  encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))
  
  # Convert to Tensor
  inputs = torch.from_numpy(encoded_text)
  
  # Check for CPU
  if(model.use_gpu):
    inputs = inputs.cuda()
  
  
  # Grab hidden states
  hidden = tuple([state.data for state in hidden])
  
  
  # Run model and get predicted output
  lstm_out, hidden = model(inputs, hidden)

  
  # Convert lstm_out to probabilities
  probs = F.softmax(lstm_out, dim=1).data
   
  if(model.use_gpu):
    # move back to CPU to use with numpy
    probs = probs.cpu()

  
  # k determines how many characters to consider
  # for our probability choice.
  # https://pytorch.org/docs/stable/torch.html#torch.topk
  
  # Return k largest probabilities in tensor
  probs, index_positions = probs.topk(k)
  
  
  index_positions = index_positions.numpy().squeeze()
  
  # Create array of probabilities
  probs = probs.numpy().flatten()
  
  # Convert to probabilities per index
  probs = probs/probs.sum()
  
  # randomly choose a character based on probabilities
  char = np.random.choice(index_positions, p=probs)
  
  # return the encoded value of the predicted char and the hidden state
  return model.decoder[char], hidden

In [0]:
def generate_text(model, size, seed='The', k=1):
      
  # CHECK FOR GPU
  if(model.use_gpu):
    model.cuda()
  else:
    model.cpu()
  
  # Evaluation mode
  model.eval()
  
  # begin output from initial seed
  output_chars = [c for c in seed]
  
  # intiate hidden state
  hidden = model.hidden_state(1)
  
  # predict the next character for every character in seed
  for char in seed:
    char, hidden = predict_next_char(model, char, hidden, k=k)
  
  # add initial characters to output
  output_chars.append(char)
  
  # Now generate for size requested
  for i in range(size):
          
    # predict based off very last letter in output_chars
    char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)
    
    # add predicted character
    output_chars.append(char)
  
  # return string of predicted text
  return ''.join(output_chars)

In [0]:
print(generate_text(model, 1500, seed='The ', k=3))

The sun screwing off the place. 
The sudden shopper was a shift flashes of staring at the steamer, and a huge sheet of half away the church the country across the law and face into the road, but a silent country and things about the flood in the sand pits. 
It was a flowing of gunners and terrible. The fact that his changed as my brother struck the common, so that they had been bright against the heat of the sun came to the common. 
At the top of Weybridge were staring and saw the flashes and staring at the streets, something off and raining from house from his store, but in a strange and small and dost of fire interment that at last into a share and cloud of all the crackled and clamper, as where I was not for the former that had been bruesed their shouts of the pit. I should have seemed to have been a sort of staring from that dirty black distance at last, and the flashes of smoke or flight, threating and strange and strange and smashed and smulted in tries, and so forth, a strange a