In [111]:
import re
import pickle
import random

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
import torch.nn as nn
import torch.nn.functional as F

In [231]:
# IPython shell escape: copy every .pickle file from the NLP folder under
# /content/drive/MyDrive into the current working directory, so the cells
# below can open them by bare file name.
!cp /content/drive/MyDrive/NLP/*.pickle .

In [112]:
## Generated text
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that you produced yourself.
# `with` closes each file handle as soon as its content has been read.
with open("generated_text_guten.pickle", "rb") as pickle_gut:
    gen_text_gut = pickle.load(pickle_gut)

with open("generated_text_brown.pickle", "rb") as pickle_brow:
    gen_text_brown = pickle.load(pickle_brow)

## Real world Text
## Using Movie plots from IMDB

# read pickle file
with open("plots_text.pickle", "rb") as pickle_in:
    movie_plots = pickle.load(pickle_in)

In [204]:
class data_preprocessor():
  """
  Turns a list of text lines into word-index training pairs.

  Pipeline (run in this order by `process_text`):
    clean_corpus            -> self.corpus
    create_sequence         -> self.sequences
    create_train_data       -> self.input_text / self.output_text
    generate_wordindex_map  -> self.all_words / self.vocab_size /
                               self.word2idx / self.idx2word
    generate_idx_train_data -> self.input_idx / self.output_idx

  The sequence and text lists are appended to, never cleared, so use a
  fresh instance for every corpus.
  """
  def __init__(self):
    # Status printing flag; initialised here so that every stage can also be
    # called on its own, without going through process_text first.
    self.ss = False
    self.corpus = []
    self.sequences = []
    self.all_words = []
    self.vocab_size = 0
    self.word2idx = {}
    self.idx2word = {}
    self.input_text = []
    self.output_text = []
    self.input_idx = []
    self.output_idx = []

  def clean_corpus(self, corpus):
    """
    Function to strip every character that is not a lowercase letter,
    an apostrophe or a space from each line of `corpus`.
    The cleaned lines replace self.corpus.
    """
    if self.ss:
      print("Cleaning the corpus..")
    not_allowed = re.compile("[^a-z' ]")
    self.corpus = [not_allowed.sub("", line) for line in corpus]

  def create_sequence(self, seq_len = 5, hide_progress=True):
    """
    Function to create sequences of a given length from a corpus

    Every window of seq_len + 1 consecutive words of a line is appended to
    self.sequences as one space-joined string. Lines with seq_len words or
    fewer contribute nothing.
    hide_progress : when True the tqdm progress bar is disabled.
    """
    if self.ss:
      print(f"\nGenerating sequences of length {seq_len} from the corpus..")
    for line in tqdm(self.corpus, desc="Process lines ", disable=hide_progress):
      # str.split() without a separator never returns empty or
      # whitespace-only tokens, so no further filtering is required.
      token_list = line.split()
      # The range is empty when the line has no more than seq_len tokens.
      for i in range(seq_len, len(token_list)):
        self.sequences.append(" ".join(token_list[i-seq_len:i+1]))

    if self.ss:
      print(f"Generated {len(self.sequences)} sequences !")

  def create_train_data(self):
    """
    Function to generate input and output text
    Input text : all words in the sentence except the last one
    Output text : all words in the sentence except the first one
    """
    if self.ss:
      print("\nGenerating Input and Ouput sequences..")
    for seq in self.sequences:
      words = seq.split()
      self.input_text.append(" ".join(words[:-1]))
      self.output_text.append(" ".join(words[1:]))


  def generate_wordindex_map(self):
    """
    Function to generate index to word and word to index mapping

    Indices follow the sorted order of the distinct words in self.corpus.
    """
    if self.ss:
      print("\nGenerating word to index and index to word mapping..")
    self.all_words = sorted(set(" ".join(self.corpus).split()))
    self.vocab_size = len(self.all_words)
    # enumerate() gives the same indices as list.index() would (the words are
    # unique) without rescanning the list once per word.
    self.word2idx = {w: i for i, w in enumerate(self.all_words)}
    self.idx2word = dict(enumerate(self.all_words))
    if self.ss:
      print(f"Vocab Size : {self.vocab_size}")


  def generate_idx_train_data(self):
    """
    Function to generate indexs of corresponding input and output text

    Each sentence becomes one row of word indices. Raises KeyError if a word
    is missing from self.word2idx.
    """
    if self.ss:
      print("\nGenerating Indexed version of input/output data..")

    self.input_idx = np.array([[self.word2idx[w] for w in
                       inp_sentec.split()] for inp_sentec in self.input_text])

    self.output_idx = np.array([[self.word2idx[w] for w in
                       out_sentec.split()] for out_sentec in self.output_text])



  def process_text(self, corpus, show_status = False, seq_len = 5):
    """
    Function to run the complete pipeline on `corpus`

    show_status : print a status line for every stage when True.
    seq_len     : number of words in each input/output sentence
                  (forwarded to create_sequence).
    """
    self.ss = show_status
    self.clean_corpus(corpus)
    self.create_sequence(seq_len=seq_len)
    self.create_train_data()
    self.generate_wordindex_map()
    self.generate_idx_train_data()
    if self.ss:
      print("\nProcessing done !")

# Processing Text

### Real Data (IMDB Movie plots)

In [206]:
# Run the preprocessing pipeline on the IMDB movie-plot corpus.
data_proc = data_preprocessor()
data_proc.process_text(corpus=movie_plots, show_status=True)

Cleaning the corpus..

Generating sequences of length 5 from the corpus..
Generated 152644 sequences !

Generating Input and Ouput sequences..

Generating word to index and index to word mapping..
Vocab Size : 16592

Generating Indexed version of input/output data..

Processing done !


In [193]:
# Second training pair: the text next to its index encoding.
print(f"{data_proc.input_text[1]} {data_proc.input_idx[1]}")
print(f"{data_proc.output_text[1]} {data_proc.output_idx[1]}")

is a private with the [ 7662    58 11422 16310 14867]
a private with the st [   58 11422 16310 14867 14005]


### Generated Data 1 (from NLTK Gutenberg)




In [248]:
# Run the preprocessing pipeline on the text generated from NLTK Gutenberg.
data_proc_gut = data_preprocessor()
data_proc_gut.process_text(corpus=gen_text_gut, show_status=True)

Cleaning the corpus..

Generating sequences of length 5 from the corpus..
Generated 91381 sequences !

Generating Input and Ouput sequences..

Generating word to index and index to word mapping..
Vocab Size : 177

Generating Indexed version of input/output data..

Processing done !


In [197]:
# Second training pair: the text next to its index encoding.
print(f"{data_proc_gut.input_text[1]} {data_proc_gut.input_idx[1]}")
print(f"{data_proc_gut.output_text[1]} {data_proc_gut.output_idx[1]}")

you will be a very [175, 167, 24, 0, 157]
will be a very good [167, 24, 0, 157, 59]


### Generated Data 2 (from NLTK Brown)


In [249]:
# Run the preprocessing pipeline on the text generated from NLTK Brown.
data_proc_brown = data_preprocessor()
data_proc_brown.process_text(corpus=gen_text_brown, show_status=True)

Cleaning the corpus..

Generating sequences of length 5 from the corpus..
Generated 86951 sequences !

Generating Input and Ouput sequences..

Generating word to index and index to word mapping..
Vocab Size : 183

Generating Indexed version of input/output data..

Processing done !


In [214]:
# Second training pair: the text next to its index encoding.
print(f"{data_proc_brown.input_text[1]} {data_proc_brown.input_idx[1]}")
print(f"{data_proc_brown.output_text[1]} {data_proc_brown.output_idx[1]}")

from the fact that the [ 55 146  48 144 146]
the fact that the united [146  48 144 146 161]


# Creating Model

In [216]:
def get_batches(arr_x, arr_y, batch_size):
    """
    Yield consecutive (x, y) mini-batches of exactly `batch_size` rows.

    arr_x, arr_y : 2-D arrays that are sliced along their first axis with the
                   same row range.
    batch_size   : number of rows per batch; must be positive.

    Exactly arr_x.shape[0] // batch_size batches are produced. Trailing rows
    that do not fill a whole batch are skipped, so every batch has the same
    number of rows. (The previous range bound stopped one step early and lost
    the last full batch whenever the row count was an exact multiple of
    batch_size.)

    Raises ValueError (on first iteration) if batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n_rows = arr_x.shape[0]
    # The last valid start is n_rows - batch_size, hence the "+ 1" on the bound.
    for start in range(0, n_rows - batch_size + 1, batch_size):
        stop = start + batch_size
        yield arr_x[start:stop, :], arr_y[start:stop, :]

In [217]:
class WordLSTM(nn.Module):
    ''' Word-level language model: embedding -> multi-layer LSTM -> dropout
        -> linear layer that scores every word of the vocabulary.

        vocab_size : number of distinct words (embedding rows / output scores)
        n_hidden   : size of the LSTM hidden state
        n_layers   : number of stacked LSTM layers
        drop_prob  : dropout probability, used between LSTM layers and on the
                     LSTM output
        lr         : stored on the instance only; this class does not use it
        emb_dim    : width of the word embeddings (default 200)
    '''

    def __init__(self, vocab_size, n_hidden=256, n_layers=4, drop_prob=0.3,
                 lr=0.001, emb_dim=200):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.emb_dim = emb_dim

        self.emb_layer = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, vocab_size)

    def forward(self, x, hidden):
        ''' Forward pass through the network.
            These inputs are x, and the hidden/cell state `hidden`.

            Returns (out, hidden): `out` holds one row of vocabulary scores
            per input position (all batch rows and time steps flattened into
            the first axis), `hidden` is the new LSTM state. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)

        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)

        out = self.dropout(lstm_output)

        # flatten to (rows, n_hidden) so the linear layer scores each position
        out = out.reshape(-1, self.n_hidden)
        out = self.fc(out)

        return out, hidden


    def init_hidden(self, batch_size):
        ''' initializes hidden state

            Returns a (hidden state, cell state) tuple of zero tensors, each
            of size n_layers x batch_size x n_hidden. '''
        # new_zeros() copies dtype and device from an existing parameter, so
        # the state always lives where the model lives (CPU or GPU) instead of
        # following torch.cuda.is_available().
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)

        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [251]:

# instantiate the model
# Each network is sized with the vocabulary of the corpus it is trained on.
# Sizing all three with data_proc.vocab_size gave the Gutenberg and Brown
# models output scores for indices that do not exist in their idx2word maps.
plot_net = WordLSTM(data_proc.vocab_size).cuda()
gutten_net = WordLSTM(data_proc_gut.vocab_size).cuda()
brown_net = WordLSTM(data_proc_brown.vocab_size).cuda()
print(plot_net)

WordLSTM(
  (emb_layer): Embedding(16592, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=16592, bias=True)
)


In [219]:
def train(net, input_t, output_t, epochs=10, batch_size=32, lr=0.001, clip=1):
    """
    Train `net` with Adam and cross-entropy loss.

    net        : model exposing init_hidden(batch_size) and
                 forward(inputs, hidden) -> (scores, hidden)
    input_t    : numpy array of input word indices, one sequence per row
    output_t   : numpy array of target word indices, same layout as input_t
    epochs     : number of passes over the data
    batch_size : rows per mini-batch (batches come from get_batches)
    lr         : Adam learning rate
    clip       : maximum gradient norm passed to clip_grad_norm_

    The model is trained on the GPU when CUDA is available and on the CPU
    otherwise. Returns None; the model is updated in place.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net.to(device)

    optim = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    net.train()

    epoch_loop = tqdm(range(epochs), desc="Epochs ")
    batch_loop = tqdm(range(len(input_t)//batch_size), desc="Batch Status ")

    try:
        for _ in range(epochs):
            batch_loop.refresh()
            batch_loop.reset()

            # initialize hidden state
            h = net.init_hidden(batch_size)

            for x, y in get_batches(input_t, output_t, batch_size):
                batch_loop.update()

                # convert numpy arrays to PyTorch tensors on the training device
                inputs = torch.from_numpy(x).to(device)
                targets = torch.from_numpy(y).to(device)

                # detach hidden states so that gradients do not flow back
                # into earlier batches
                h = tuple(each.detach() for each in h)

                net.zero_grad()
                output, h = net(inputs, h)
                loss = criterion(output, targets.reshape(-1))
                loss.backward()

                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(net.parameters(), clip)

                optim.step()

            # count the epoch only once it has actually finished
            epoch_loop.update()
    finally:
        # always release the progress bars, even if training is interrupted
        batch_loop.close()
        epoch_loop.close()
            

# Training Model

In [223]:
# Train the movie-plot model, then store the whole model object on Drive.
train(net=plot_net,
      input_t=data_proc.input_idx,
      output_t=data_proc.output_idx,
      epochs=20,
      batch_size=32)

torch.save(plot_net, "/content/drive/MyDrive/NLP/plot_net.pth")

HBox(children=(FloatProgress(value=0.0, description='Epochs ', max=20.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Batch Status ', max=4770.0, style=ProgressStyle(descripti…

In [252]:
# Train the Gutenberg model, then store the whole model object on Drive.
train(net=gutten_net,
      input_t=data_proc_gut.input_idx,
      output_t=data_proc_gut.output_idx,
      epochs=20,
      batch_size=32)

torch.save(gutten_net, "/content/drive/MyDrive/NLP/gutten_net.pth")

HBox(children=(FloatProgress(value=0.0, description='Epochs ', max=20.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Batch Status ', max=2855.0, style=ProgressStyle(descripti…

In [253]:
# Train the Brown model, then store the whole model object on Drive.
train(net=brown_net,
      input_t=data_proc_brown.input_idx,
      output_t=data_proc_brown.output_idx,
      epochs=20,
      batch_size=32)

torch.save(brown_net, "/content/drive/MyDrive/NLP/brown_net.pth")

HBox(children=(FloatProgress(value=0.0, description='Epochs ', max=20.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Batch Status ', max=2717.0, style=ProgressStyle(descripti…

# Load Models and Test

In [254]:
# The training cells saved each model object whole with torch.save(), so
# torch.load() already returns a complete model. Building a fresh WordLSTM
# first only allocated GPU memory that was thrown away on the next line.
# The WordLSTM class must be defined in the session for unpickling to work.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which rejects whole-model pickles; on such versions pass
# weights_only=False (trusted files only) - confirm against the installed version.
plot_net = torch.load("/content/drive/MyDrive/NLP/plot_net.pth")

gutten_net = torch.load("/content/drive/MyDrive/NLP/gutten_net.pth")

brown_net = torch.load("/content/drive/MyDrive/NLP/brown_net.pth")

In [370]:
# Load the held-out test sentences for the three corpora.
# `with` closes each file handle as soon as its content has been read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("test_orig_gutten.pickle", "rb") as temp:
    test_gutten = pickle.load(temp)

with open('test_orig_plot.pickle', 'rb') as temp:
    test_plot = pickle.load(temp)

with open("test_orig_brown.pickle", "rb") as temp:
    test_brown = pickle.load(temp)

In [371]:
def process_test_data(sentence_list):
  """
  Function to clean the test sentences.

  For every whitespace-separated token, leading/trailing "." and "," are
  stripped first; the token is then dropped when nothing is left, when it is
  one of , . ! ? or when it consists of digits only. Stripping before the
  checks means that numbers followed by punctuation ("1999,") are filtered
  like bare numbers and that pure-punctuation tokens ("...") no longer end up
  as empty words.

  Returns the cleaned sentences (tokens joined by a single space) that keep
  more than 3 tokens; shorter ones are discarded.
  """
  skip_tokens = {",", ".", "!", "?"}
  new_list = []
  for sentence in sentence_list:
    new_sent = []
    for item in sentence.split():
      word = item.strip(".,")
      if not word or word in skip_tokens or word.isdigit():
        continue
      new_sent.append(word)
    if len(new_sent) > 3:
      new_list.append(" ".join(new_sent))
  return new_list

In [372]:
# Clean the three held-out sets with the same routine, then report their sizes.
test_plot = process_test_data(test_plot)
test_gutten = process_test_data(test_gutten)
test_brown = process_test_data(test_brown)
print(len(test_brown), len(test_gutten), len(test_plot))

983 1000 973


In [373]:

# The first two words of every test sentence are used as the generation prompt.
start_tokens_brown = [" ".join(sentence.split()[:2]) for sentence in test_brown]
start_tokens_gutten = [" ".join(sentence.split()[:2]) for sentence in test_gutten]
start_tokens_plot = [" ".join(sentence.split()[:2]) for sentence in test_plot]

In [355]:
# predict next token
def predict(net, tkn, idx2wrd, wrd2idx ,h=None, top_k=3):
  """
  Feed one token to `net` and sample the next token.

  net     : model exposing init_hidden(batch_size) and
            forward(inputs, hidden) -> (scores, hidden)
  tkn     : current word; must be a key of wrd2idx (KeyError otherwise)
  idx2wrd : index -> word mapping
  wrd2idx : word -> index mapping
  h       : hidden state tuple; a fresh state for batch size 1 is created
            when omitted
  top_k   : the next token is drawn uniformly from this many highest-scoring
            words (default 3)

  Runs on whatever device the model parameters are on.
  Returns (next word, new hidden state).
  """
  if top_k < 1:
    raise ValueError("top_k must be at least 1")

  device = next(net.parameters()).device

  if h is None:
    h = net.init_hidden(1)

  # detach hidden state from history and keep it next to the model
  h = tuple(each.detach().to(device) for each in h)

  # tensor inputs: a batch with a single one-token sequence
  inputs = torch.tensor([[wrd2idx[tkn]]], dtype=torch.long, device=device)

  # inference only, no gradient bookkeeping needed
  with torch.no_grad():
    out, h = net(inputs, h)

    # get the token probabilities
    p = F.softmax(out, dim=1)

  p = p.cpu().numpy()
  p = p.reshape(p.shape[1],)

  # indices of the top_k values, best first (fewer if the vocabulary is smaller)
  top_n_idx = p.argsort()[-top_k:][::-1]

  # randomly select one of the candidate indices
  sampled_token_index = top_n_idx[random.randrange(len(top_n_idx))]

  # return the predicted word and the hidden state
  return idx2wrd[sampled_token_index], h





In [356]:
# function to generate text
def sample(net, size, idx2word, word2idx, prime='it is'):
    """
    Generate text by continuing the prompt `prime` word by word.

    net      : model exposing init_hidden(batch_size), used through predict()
    size     : number of words to generate after the prompt (at least one
               word is always generated)
    idx2word : index -> word mapping
    word2idx : word -> index mapping; every prompt word must be a key
    prime    : whitespace-separated prompt with at least one word

    Returns the prompt words followed by the generated words, joined by
    single spaces. Raises ValueError for an empty prompt.
    """
    toks = prime.split()
    if not toks:
        # without a prompt word there is nothing to feed to the network
        raise ValueError("prime must contain at least one token")

    # push to GPU when there is one
    if torch.cuda.is_available():
        net.cuda()

    net.eval()

    # batch size is 1
    h = net.init_hidden(1)

    # run the prompt through the network; only the prediction made after the
    # last prompt word is kept
    for t in toks:
        token, h = predict(net, t, idx2word, word2idx, h)

    toks.append(token)

    # predict subsequent tokens, each one from the previously generated word
    for _ in range(size-1):
        token, h = predict(net, toks[-1], idx2word, word2idx, h)
        toks.append(token)

    return ' '.join(toks)

In [375]:
rnn_test_plot, rnn_test_gutten, rnn_test_brown = [], [], []

# Movie-plot model: continue every two-word prompt with its own vocabulary.
for i in tqdm(range(len(start_tokens_plot))):
  pred_plot = sample(plot_net, size=15,
                     idx2word=data_proc.idx2word,
                     word2idx=data_proc.word2idx,
                     prime=start_tokens_plot[i])
  rnn_test_plot.append(pred_plot)

# Gutenberg model.
for i in tqdm(range(len(start_tokens_gutten))):
  pred_gutten = sample(gutten_net, size=15,
                       idx2word=data_proc_gut.idx2word,
                       word2idx=data_proc_gut.word2idx,
                       prime=start_tokens_gutten[i])
  rnn_test_gutten.append(pred_gutten)

# Brown model.
for i in tqdm(range(len(start_tokens_brown))):
  pred_brown = sample(brown_net, size=15,
                      idx2word=data_proc_brown.idx2word,
                      word2idx=data_proc_brown.word2idx,
                      prime=start_tokens_brown[i])
  rnn_test_brown.append(pred_brown)

HBox(children=(FloatProgress(value=0.0, max=973.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=983.0), HTML(value='')))




In [384]:
# Pick one random test index. Passing the length draws from range(len(...))
# and returns a scalar, so int() is not applied to a 1-element array
# (that conversion is deprecated in recent NumPy releases).
sample_ = int(np.random.choice(len(test_plot)))
print(f"Original Movie plot text :{test_plot[sample_]}")
print(f"Generated Movie plot text: {rnn_test_plot[sample_]}")

Original Movie plot text :decides to give all of thunderbird's money away and tells ambrose what to do to accomplish this which results in bags of money being poured out of the office's window
Generated Movie plot text: decides to make a new man and the film ends with the help for the police station


In [388]:
# Pick one random test index. Passing the length draws from range(len(...))
# and returns a scalar, so int() is not applied to a 1-element array
# (that conversion is deprecated in recent NumPy releases).
sample_ = int(np.random.choice(len(test_gutten)))
print(f"Original Guttenberg (generated) text : {test_gutten[sample_]}")
print(f"Generated Guttenberg (generated) text: {rnn_test_gutten[sample_]}")

Original Guttenberg (generated) text : your sister s being a bad
Generated Guttenberg (generated) text: your sister be a will a bad a bad i never sure more than a i am


In [387]:
# Pick one random test index. Passing the length draws from range(len(...))
# and returns a scalar, so int() is not applied to a 1-element array
# (that conversion is deprecated in recent NumPy releases).
sample_ = int(np.random.choice(len(test_brown)))
print(f"Original Brown (generated) text : {test_brown[sample_]}")
print(f"Generated Brown (generated) text: {rnn_test_brown[sample_]}")

Original Brown (generated) text : for the first time in the
Generated Brown (generated) text: for the first time in know again first is the first to the with a actual stage


In [389]:
# Write each list of generated sentences to its own pickle file on Drive.
for out_path, generated in (
    ('/content/drive/MyDrive/NLP/rnn_test_gutten.pickle', rnn_test_gutten),
    ('/content/drive/MyDrive/NLP/rnn_test_brown.pickle', rnn_test_brown),
    ('/content/drive/MyDrive/NLP/rnn_test_plot.pickle', rnn_test_plot),
):
    with open(out_path, 'wb') as f:
        pickle.dump(generated, f)