In [1]:
import re
import pickle
import random

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Copy every pickle file and every .pth checkpoint from the Drive folder into the
# current working directory (assumes Google Drive is mounted at /content/drive).
!cp /content/drive/MyDrive/NLP/*.pickle .
!cp /content/drive/MyDrive/NLP/*.pth .

In [26]:
# Load the three pickled corpora. Each file is opened in a `with` block so the
# handle is closed as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.

## Generated text
with open("generated_text_guten.pickle", "rb") as pickle_gut:
    gen_text_gut = pickle.load(pickle_gut)

with open("generated_text_brown.pickle", "rb") as pickle_brow:
    gen_text_brown = pickle.load(pickle_brow)

## Real world Text
## Using Movie plots from IMDB
with open("plots_text.pickle", "rb") as pickle_in:
    movie_plots = pickle.load(pickle_in)

In [20]:
class data_preprocessor():
  """
  Turns a list of text lines into index-encoded next-word training data.

  Pipeline (see `process_text`): clean each line, cut it into overlapping
  windows of `seq_len + 1` words, split each window into an input (all but the
  last word) and an output (all but the first word), build a word<->index
  vocabulary from the cleaned corpus, and encode inputs/outputs as integer
  arrays.

  Attributes filled in by the pipeline:
    corpus, sequences, input_text, output_text : lists of strings
    all_words   : sorted list of unique words in the cleaned corpus
    vocab_size  : len(all_words)
    word2idx / idx2word : dict mappings between words and indices
    input_idx / output_idx : np.ndarray of word indices
  """

  def __init__(self):
    # Verbosity flag; defined here so every method can be called on its own
    # without going through process_text() first.
    self.ss = False
    self._reset()

  def _reset(self):
    """Clear all derived state so a new corpus starts from a clean slate."""
    self.corpus = []
    self.sequences = []
    self.all_words = []
    self.vocab_size = 0
    self.word2idx = {}
    self.idx2word = {}
    self.input_text = []
    self.output_text = []
    self.input_idx = []
    self.output_idx = []

  def clean_corpus(self, corpus):
    """
    Store a cleaned copy of `corpus` in self.corpus.

    Every character that is not a lowercase a-z letter, an apostrophe or a
    space is deleted (no lowercasing is done, so uppercase letters are removed
    as well).
    """
    if self.ss:
      print("Cleaning the corpus..")
    self.corpus = [re.sub("[^a-z' ]", "", i) for i in corpus]

  def create_sequence(self, seq_len = 5, hide_progress=True):
    """
    Function to create sequences of a given length from a corpus

    For every line with more than `seq_len` words, each window of
    `seq_len + 1` consecutive words is appended to self.sequences as a
    space-joined string. Shorter lines contribute nothing.
    """
    if self.ss:
      print(f"\nGenerating sequences of length {seq_len} from the corpus..")
    for line in tqdm(self.corpus, desc="Process lines ", disable=hide_progress):
      # str.split() with no argument never returns empty or blank tokens, so
      # no extra filtering is required.
      token_list = line.split()
      # The range is empty when the line has seq_len words or fewer.
      for i in range(seq_len, len(token_list)):
        self.sequences.append(" ".join(token_list[i-seq_len:i+1]))

    if self.ss:
      print(f"Generated {len(self.sequences)} sequences !")

  def create_train_data(self):
    """
    Function to generate input and output text
    Input text : all words in the sentence except the last one
    Output text : all words in the sentence except the first one
    """
    if self.ss:
      print("\nGenerating Input and Ouput sequences..")
    for seq in self.sequences:
      words = seq.split()
      self.input_text.append(" ".join(words[:-1]))
      self.output_text.append(" ".join(words[1:]))


  def generate_wordindex_map(self):
    """
    Function to generate index to word and word to index mapping

    Indices follow the position of each word in the sorted vocabulary.
    """
    if self.ss:
      print("\nGenerating word to index and index to word mapping..")
    self.all_words = sorted(set(" ".join(self.corpus).split()))
    self.vocab_size = len(self.all_words)
    # enumerate() gives the same indices as all_words.index(w) in linear time.
    self.word2idx = {w: i for i, w in enumerate(self.all_words)}
    self.idx2word = dict(enumerate(self.all_words))
    if self.ss:
      print(f"Vocab Size : {self.vocab_size}")


  def generate_idx_train_data(self):
    """
    Function to generate indexs of corresponding input and output text

    Raises KeyError if a word in input_text/output_text is missing from
    word2idx.
    """
    if self.ss:
      print("\nGenerating Indexed version of input/output data..")

    # int64 is fixed explicitly so the arrays convert to torch long tensors
    # regardless of the platform's default integer width.
    self.input_idx = np.array([[self.word2idx[w] for w in
                       inp_sentec.split()] for inp_sentec in self.input_text],
                       dtype=np.int64)

    self.output_idx = np.array([[self.word2idx[w] for w in
                       out_sentec.split()] for out_sentec in self.output_text],
                       dtype=np.int64)



  def process_text(self, corpus, show_status = False, seq_len = 5):
    """
    Run the whole pipeline on `corpus`.

    corpus      : iterable of strings (one line/sentence each)
    show_status : print a status line for every stage when True
    seq_len     : number of input words per training sequence

    All previously derived state is discarded first, so calling this twice on
    the same instance does not mix sequences from the old corpus with the
    vocabulary of the new one.
    """
    self._reset()
    self.ss = show_status
    self.clean_corpus(corpus)
    self.create_sequence(seq_len=seq_len)
    self.create_train_data()
    self.generate_wordindex_map()
    self.generate_idx_train_data()
    if self.ss:
      print("\nProcessing done !")

# Processing Text

### Real Data (IMDB Movie plots)

In [27]:
# Build the training data for the movie-plot corpus, printing each stage.
data_proc = data_preprocessor()
data_proc.process_text(movie_plots, show_status=True)

Cleaning the corpus..

Generating sequences of length 5 from the corpus..
Generated 152644 sequences !

Generating Input and Ouput sequences..

Generating word to index and index to word mapping..
Vocab Size : 16592

Generating Indexed version of input/output data..

Processing done !


In [6]:
# Show the second training pair: text next to its index encoding (input, then output).
for text_rows, idx_rows in ((data_proc.input_text, data_proc.input_idx),
                            (data_proc.output_text, data_proc.output_idx)):
    print(text_rows[1], idx_rows[1])

is a private with the [ 7662    58 11422 16310 14867]
a private with the st [   58 11422 16310 14867 14005]


### Generated Data 1 (from NLTK Gutenberg)




In [16]:
# Everything except the last 1000 generated Gutenberg items is used for training.
gut_train_text = gen_text_gut[:-1000]
data_proc_gut = data_preprocessor()
data_proc_gut.process_text(gut_train_text, show_status=True)

Cleaning the corpus..

Generating sequences of length 5 from the corpus..
Generated 90785 sequences !

Generating Input and Ouput sequences..

Generating word to index and index to word mapping..
Vocab Size : 177

Generating Indexed version of input/output data..

Processing done !


In [8]:
# Show the second training pair: text next to its index encoding (input, then output).
for text_rows, idx_rows in ((data_proc_gut.input_text, data_proc_gut.input_idx),
                            (data_proc_gut.output_text, data_proc_gut.output_idx)):
    print(text_rows[1], idx_rows[1])

you will be a very [175 167  24   0 157]
will be a very good [167  24   0 157  59]


### Generated Data 2 (from NLTK Brown)


In [19]:
# Everything except the last 1000 generated Brown items is used for training.
brown_train_text = gen_text_brown[:-1000]
data_proc_brown = data_preprocessor()
data_proc_brown.process_text(brown_train_text, show_status=True)

Cleaning the corpus..

Generating sequences of length 5 from the corpus..
Generated 86371 sequences !

Generating Input and Ouput sequences..

Generating word to index and index to word mapping..
Vocab Size : 183

Generating Indexed version of input/output data..

Processing done !


In [20]:
# Show the second training pair: text next to its index encoding (input, then output).
for text_rows, idx_rows in ((data_proc_brown.input_text, data_proc_brown.input_idx),
                            (data_proc_brown.output_text, data_proc_brown.output_idx)):
    print(text_rows[1], idx_rows[1])

from the fact that the [ 55 146  48 144 146]
the fact that the united [146  48 144 146 161]


# Creating Model

In [22]:
def get_batches(arr_x, arr_y, batch_size):
    """
    Yield consecutive (x, y) slices of exactly `batch_size` rows.

    arr_x, arr_y : 2-D arrays sliced along the first axis with the same bounds
    batch_size   : number of rows per batch

    Only full batches are produced: trailing rows that do not fill a complete
    batch are skipped. When the row count is an exact multiple of
    `batch_size`, the final full batch is included.
    """
    n_rows = arr_x.shape[0]
    # Last valid start is n_rows - batch_size, hence the +1 on the exclusive stop.
    for start in range(0, n_rows - batch_size + 1, batch_size):
        stop = start + batch_size
        yield arr_x[start:stop, :], arr_y[start:stop, :]

In [23]:
class WordLSTM(nn.Module):
    """
    Word-level language model: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        vocab_size: number of distinct word indices (embedding rows and
            output logits).
        n_hidden:   LSTM hidden size.
        n_layers:   number of stacked LSTM layers.
        drop_prob:  dropout probability, used between LSTM layers and on the
            LSTM output.
        lr:         stored on the instance as `self.lr`; not used by any
            method of this class.
        emb_dim:    embedding dimension (default 200).
    """

    def __init__(self, vocab_size, n_hidden=256, n_layers=4, drop_prob=0.3,
                 lr=0.001, emb_dim=200):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.emb_dim = emb_dim

        self.emb_layer = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, vocab_size)

    def forward(self, x, hidden):
        ''' Forward pass through the network.
            These inputs are x, and the hidden/cell state `hidden`.

            Returns (out, hidden): `out` holds one row of vocabulary logits
            per input position, flattened to (-1, vocab_size); `hidden` is
            the updated LSTM state. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)

        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)

        out = self.dropout(lstm_output)

        # Collapse all leading dimensions so the linear layer sees one row per
        # position.
        out = out.reshape(-1, self.n_hidden)
        out = self.fc(out)

        return out, hidden


    def init_hidden(self, batch_size):
        ''' initializes hidden state

            Returns a (hidden, cell) tuple of zero tensors, each of size
            n_layers x batch_size x n_hidden, created with the dtype and on
            the device of the model's own parameters so the state always
            matches wherever the model currently lives. '''
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [24]:

# instantiate one model per corpus, each sized to ITS OWN vocabulary so the
# output layer cannot emit an index that the corpus' idx2word does not contain
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
plot_net = WordLSTM(data_proc.vocab_size).to(device)
gutten_net = WordLSTM(data_proc_gut.vocab_size).to(device)
brown_net = WordLSTM(data_proc_brown.vocab_size).to(device)
print(plot_net)

AttributeError: ignored

In [14]:
def train(net, input_t, output_t, epochs=10, batch_size=32, lr=0.001, clip=1):
    """
    Train `net` for next-word prediction with Adam and cross-entropy loss.

    net        : model exposing `init_hidden(batch_size)` and
                 `net(inputs, hidden) -> (logits, hidden)`
    input_t    : numpy array of input word indices, one row per sequence
    output_t   : numpy array of target word indices, same layout as input_t
    epochs     : number of passes over the data
    batch_size : rows per batch (the hidden state is sized to this)
    lr         : Adam learning rate
    clip       : max gradient norm passed to clip_grad_norm_

    The model is moved to the GPU when one is available, otherwise it is
    trained on the CPU. Returns a list with the mean batch loss of each epoch
    (NaN for an epoch that produced no batch).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net.to(device)

    optim = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    net.train()

    epoch_loop = tqdm(range(epochs), desc="Epochs ")
    batch_loop = tqdm(range(len(input_t)//batch_size), desc="Batch Status ")

    epoch_losses = []
    for e in range(epochs):
        batch_loop.reset()

        # initialize hidden state on the same device as the model
        h = tuple(each.to(device) for each in net.init_hidden(batch_size))

        loss_sum, n_batches = 0.0, 0
        for x, y in get_batches(input_t, output_t, batch_size):
            # convert numpy arrays to integer index tensors on the model's device
            inputs = torch.from_numpy(x).long().to(device)
            targets = torch.from_numpy(y).long().to(device)

            # detach hidden states so gradients do not flow into earlier batches
            h = tuple(each.detach() for each in h)

            net.zero_grad()
            output, h = net(inputs, h)
            loss = criterion(output, targets.view(-1))
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            optim.step()

            loss_sum += loss.item()
            n_batches += 1
            batch_loop.update()

        mean_loss = loss_sum / n_batches if n_batches else float("nan")
        epoch_losses.append(mean_loss)
        epoch_loop.set_postfix(loss=mean_loss)
        # advance the epoch bar only once the epoch has actually finished
        epoch_loop.update()

    batch_loop.close()
    epoch_loop.close()
    return epoch_losses
            

# Training Model

In [104]:
# Fit the movie-plot model, then pickle the whole module object to Drive.
train(plot_net, data_proc.input_idx, data_proc.output_idx,
      epochs=30, batch_size=32)

torch.save(plot_net, "/content/drive/MyDrive/NLP/plot_net.pth")

In [105]:
# Fit the Gutenberg model, then pickle the whole module object to Drive.
train(gutten_net, data_proc_gut.input_idx, data_proc_gut.output_idx,
      epochs=20, batch_size=32)

torch.save(gutten_net, "/content/drive/MyDrive/NLP/gutten_net.pth")

In [106]:
# Fit the Brown model, then pickle the whole module object to Drive.
train(brown_net, data_proc_brown.input_idx, data_proc_brown.output_idx,
      epochs=20, batch_size=32)

torch.save(brown_net, "/content/drive/MyDrive/NLP/brown_net.pth")

# Load Models and Test

### Reading model objects 

In [16]:
# plot_net = WordLSTM(data_proc.vocab_size).cuda()
# plot_net = torch.load("plot_net.pth")

# gutten_net = WordLSTM(data_proc.vocab_size).cuda()
# gutten_net = torch.load("gutten_net.pth")

# brown_net = WordLSTM(data_proc.vocab_size).cuda()
# brown_net = torch.load("brown_net.pth")

### Loading the test data 

In [3]:
# Load the held-out original sentences; `with` closes each file after reading.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("test_orig_gutten.pickle", "rb") as f:
    test_gutten = pickle.load(f)
with open('test_orig_plot.pickle', 'rb') as f:
    test_plot = pickle.load(f)
with open("test_orig_brown.pickle", "rb") as f:
    test_brown = pickle.load(f)

In [28]:
# torch.load unpickles the complete saved module object, so constructing a
# fresh WordLSTM first is unnecessary (it was overwritten immediately); the
# WordLSTM class only has to be defined in the session.
# map_location lets a checkpoint saved on GPU be opened on a CPU-only machine.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled modules -- confirm against the installed version.
plot_net = torch.load(
    "plot_net_2.pth",
    map_location="cuda" if torch.cuda.is_available() else "cpu",
)

### Preprocessing for test data
Since the current RNN model is not trained to handle OOV items, the data needs to be pre-processed to avoid prediction errors. This reduces the size of the test data marginally.

In [4]:
def process_test_data(sentence_list, min_tokens=4):
  """
  Clean test sentences the same way as the training corpus and drop short ones.

  Every character other than lowercase a-z, apostrophe and space is removed
  from each sentence; the remaining text is split on whitespace and re-joined
  with single spaces. A sentence is kept only when it has at least
  `min_tokens` tokens (the default 4 matches the previous hard-coded
  "more than 3" rule).

  The regex already strips all punctuation and digits, so no per-token
  punctuation/digit filtering is needed afterwards.

  Returns the list of cleaned sentences in their original order.
  """
  new_list = []
  for sentence in sentence_list:
    tokens = re.sub("[^a-z' ]", "", sentence).split()
    if len(tokens) >= min_tokens:
      new_list.append(" ".join(tokens))
  return new_list

# Clean each test set, then report how many sentences survived the filtering.
test_brown_s, test_gutten_s, test_plot_s = (
    process_test_data(raw) for raw in (test_brown, test_gutten, test_plot)
)

print(f"Original Length of corpus  :{len(test_brown)},{len(test_gutten)},{len(test_plot)}")
print(f"Length after pre-processing:{len(test_brown_s)} ,{len(test_gutten_s)},{len(test_plot_s)}")

Original Length of corpus  :1000,1000,1000
Length after pre-processing:983 ,1000,973


### Extracting the first two tokens from test sentences to use for generation

In [5]:
# Two-word prompts: the leading two tokens of every cleaned test sentence.
start_tokens_plot = [" ".join(sent.split()[:2]) for sent in test_plot_s]
start_tokens_gutten = [" ".join(sent.split()[:2]) for sent in test_gutten_s]
start_tokens_brown = [" ".join(sent.split()[:2]) for sent in test_brown_s]

In [19]:
# Four-word prompts for the movie-plot sentences.
start_tokens_plot_4 = [" ".join(sent.split()[:4]) for sent in test_plot_s]

In [30]:
# predict next token
def predict(net, tkn, idx2wrd, wrd2idx ,h=None, sp=False, top_k=3):
  """
  Feed one token to `net` and sample the next token from its top candidates.

  net     : model exposing `init_hidden(batch_size)` and
            `net(inputs, hidden) -> (logits, hidden)`
  tkn     : input word; must be a key of `wrd2idx` (KeyError otherwise)
  idx2wrd : index -> word mapping used to decode the sampled index
  wrd2idx : word -> index mapping used to encode `tkn`
  h       : hidden state from the previous step; a fresh state for batch
            size 1 is created when None
  sp      : print the encoded input when True
  top_k   : how many highest-probability candidates to choose from uniformly
            (capped at the vocabulary size)

  Runs on whatever device the model's parameters are on.
  Returns (next_word, new_hidden_state).
  """
  device = next(net.parameters()).device

  if h is None:
    h = net.init_hidden(1)
  # detach hidden state from history and make sure it sits next to the model
  h = tuple(each.detach().to(device) for each in h)

  # tensor inputs: a 1 x 1 batch holding the token's index
  inputs = torch.tensor([[wrd2idx[tkn]]], dtype=torch.long, device=device)

  if sp:
    print(f"Input tkn: {tkn}   | Input index : {inputs}")

  # inference only -- no autograd graph is needed
  with torch.no_grad():
    out, h = net(inputs, h)
    # get the token probabilities
    p = F.softmax(out, dim=1)

  p = p.cpu().numpy().reshape(-1)

  # indices of the k most probable tokens, most probable first
  k = max(1, min(top_k, p.shape[0]))
  top_n_idx = p.argsort()[-k:][::-1]

  # randomly select one of the candidates
  sampled_token_index = int(top_n_idx[random.randrange(k)])

  # return the decoded word and the hidden state
  return idx2wrd[sampled_token_index], h

In [31]:
# function to generate text
def sample(net, size, idx2word, word2idx, prime='it is', show_process = False):
    """
    Generate text by priming the model with `prime` and sampling `size` words.

    net          : model accepted by `predict`
    size         : number of words generated after the prime
    idx2word     : index -> word mapping
    word2idx     : word -> index mapping; every prime word must be a key
    prime        : whitespace-separated seed text (at least one word)
    show_process : print intermediate states when True

    The model is moved to the GPU when one is available (CPU otherwise) and
    put in eval mode. Returns the prime followed by the generated words,
    joined by single spaces.

    Raises ValueError when `prime` contains no tokens.
    """
    toks = prime.split()
    if not toks:
        raise ValueError("prime must contain at least one token")

    # push to GPU when there is one
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net.to(device)

    net.eval()

    # batch size is 1
    h = tuple(each.to(device) for each in net.init_hidden(1))

    if show_process:
        print(f"Initial State : {toks}")

    # generation never needs gradients
    with torch.no_grad():
        # run the prime through the model; only the prediction made after the
        # last prime word is kept
        for t in list(toks):
            token, h = predict(net, t, idx2word, word2idx, h, sp=show_process)
        toks.append(token)
        if show_process:
            print(f"Second state : {toks}")

        # predict subsequent tokens, always feeding back the latest word
        for i in range(size-1):
            inp = toks[-1]
            token, h = predict(net, inp, idx2word, word2idx, h, sp=show_process)
            toks.append(token)
            if show_process:
                print(f"Subsequent State | Input : {inp} "
                      f"        Output : {token}")

    return ' '.join(toks)

In [55]:
# Quick smoke test: continue a two-word prompt with the movie-plot model.
sample(plot_net, 15, data_proc.idx2word, data_proc.word2idx, prime="she also")

'she also decides not to be able for a better and his family and completes his own'

In [56]:
# Generate one continuation per two-word prompt with each corpus' model.
rnn_test_plot = []
rnn_test_gutten = []
rnn_test_brown = []

for prime_plot in tqdm(start_tokens_plot):
  rnn_test_plot.append(
      sample(plot_net, 15, data_proc.idx2word, data_proc.word2idx,
             prime=prime_plot))

for prime_gutten in tqdm(start_tokens_gutten):
  rnn_test_gutten.append(
      sample(gutten_net, 4, data_proc_gut.idx2word, data_proc_gut.word2idx,
             prime=prime_gutten))

for prime_brown in tqdm(start_tokens_brown):
  rnn_test_brown.append(
      sample(brown_net, 4, data_proc_brown.idx2word, data_proc_brown.word2idx,
             prime=prime_brown))

HBox(children=(FloatProgress(value=0.0, max=973.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=983.0), HTML(value='')))




In [32]:
# Generate one continuation per four-word movie-plot prompt.
rnn_test_plot_4 = []
for prime_plot in tqdm(start_tokens_plot_4):
  rnn_test_plot_4.append(
      sample(plot_net, 15, data_proc.idx2word, data_proc.word2idx,
             prime=prime_plot))

HBox(children=(FloatProgress(value=0.0, max=973.0), HTML(value='')))




## Printing the results

In [6]:
# Reload previously generated RNN outputs; `with` closes each file after reading.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("rnn_test_gutten.pickle", "rb") as f:
    rnn_test_gutten = pickle.load(f)
with open('rnn_test_brown.pickle', 'rb') as f:
    rnn_test_brown = pickle.load(f)
with open("rnn_test_plot.pickle", "rb") as f:
    rnn_test_plot = pickle.load(f)

#reminder : Preprocess the test data first 
# test_brown_s = process_test_data(test_brown)
# test_gutten_s = process_test_data(test_gutten)
# test_plot_s = process_test_data(test_plot)

### Examples for RNN generated Movie plots with 2 inputs

In [7]:
# Draw indices from the lists that are actually indexed below: the cleaned
# references and the generations are shorter than the raw `test_plot`, so
# sampling from len(test_plot) could raise IndexError.
n_pairs = min(len(test_plot_s), len(rnn_test_plot))
sample_ = np.random.choice(np.arange(n_pairs), min(5, n_pairs), replace=False)
print("\nResults for Movie plots with 2 token input : \n")
for idx in sample_:
  print(f"Original  :{test_plot_s[int(idx)]}")
  print(f"Generated : {rnn_test_plot[int(idx)]}\n")


Results for Movie plots : 

Original  :while adina tries to seduce a fellow kok brother jimmy to get the tape daisy is falling for the dog president leah
Generated : while adina tries unsuccessfully to convince her how he is a software propounding a lot for a

Original  :six months later a new game is discovered by a swat team led by lt
Generated : six months and is paralyzed by a vampire and musician and the two are infected and quarantined

Original  :the pace and tone of the film is immediately made clear with an opening fight in the woods as wahjee and his uncle attempt to flee from ruthless fighters led by mien tsumun
Generated : the pace and rips the leader for a young boy in a car called the final murder

Original  :this part starts with a zoom into a house and to a doghouse labeled killer with spike in it
Generated : this part is in fact the same time in his home he was a priest and he

Original  :during the funeral while having sex inside a closet kimie reveals to netah that 

### Examples for RNN generated Movie plots with 4 inputs

In [33]:
# Draw indices from the lists that are actually indexed below: the cleaned
# references and the generations are shorter than the raw `test_plot`, so
# sampling from len(test_plot) could raise IndexError.
n_pairs = min(len(test_plot_s), len(rnn_test_plot_4))
sample_ = np.random.choice(np.arange(n_pairs), min(5, n_pairs), replace=False)
print("\nResults for Movie plots with 4 token input: \n")
for idx in sample_:
  print(f"Original  :{test_plot_s[int(idx)]}")
  print(f"Generated : {rnn_test_plot_4[int(idx)]}\n")


Results for Movie plots : 

Original  :after one final moment of doubt when beth suggests they visit dr
Generated : after one final moment of disassociative house and the rest are unsuccessful in the film is a successful depression

Original  :jamal warns prettyeyed willy about chicago
Generated : jamal warns prettyeyed willy and his wife are transporting dr connell eliminating the whole family is based for the

Original  :although hurt she agrees and after a staged honeymoon aboard the dakin family yacht they return to boston
Generated : although hurt she agrees in the village the film is the sensitive and completes a x interview the film

Original  :jim admits to have been walking in the vicinity of the murder site that plus the cheque and his head injury make him the prime suspect
Generated : jim admits to have a lot for the murder the two meet in the trunk of bobbili veerakesavudu who

Original  :on the island local mario ruoppolo is dissatisfied with being a fisherman like his fa

### Examples for RNN generated Guttenberg with 2 inputs

In [8]:
# Sample from the Gutenberg lists themselves (the movie-plot length was used
# before), bounded by the shorter of references and generations.
n_pairs = min(len(test_gutten_s), len(rnn_test_gutten))
sample_ = np.random.choice(np.arange(n_pairs), min(5, n_pairs), replace=False)
print("\nResults for Guttenberg:\n")
for idx in sample_:
  print(f"Original  : {test_gutten_s[int(idx)]}")
  print(f"Generated : {rnn_test_gutten[int(idx)]}\n")


Results for Guttenberg:

Original  : now i am sure i
Generated : now i the able to be

Original  : colonel brandon s being a bad
Generated : colonel brandon s steele and the

Original  : smith and if he had been
Generated : smith and if he i not

Original  : every thing that was not in
Generated : every thing had not am to

Original  : from the first time after all
Generated : from the to be happy should



### Examples for RNN generated Brown with 2 inputs

In [9]:
# Draw indices from the lists that are actually indexed below: the cleaned
# references and the generations are shorter than the raw `test_brown`, so
# sampling from len(test_brown) could raise IndexError.
n_pairs = min(len(test_brown_s), len(rnn_test_brown))
sample_ = np.random.choice(np.arange(n_pairs), min(5, n_pairs), replace=False)
print("\nResults for Brown:\n")
for idx in sample_:
  print(f"Original  : {test_brown_s[int(idx)]}")
  print(f"Generated : {rnn_test_brown[int(idx)]}\n")


Results for Brown:

Original  : here the first time in
Generated : here the first first the world

Original  : at the same time the
Generated : at the first first accepted and

Original  : what is the only way to
Generated : what is the only important of

Original  : as a result of the united
Generated : as a first time the first

Original  : maybe i can see that the
Generated : maybe i can see been the



### Saving the results

In [103]:
# Persist each list of generated sentences to its own pickle file on Drive.
results_to_save = (
    ('/content/drive/MyDrive/NLP/rnn_test_gutten.pickle', rnn_test_gutten),
    ('/content/drive/MyDrive/NLP/rnn_test_brown.pickle', rnn_test_brown),
    ('/content/drive/MyDrive/NLP/rnn_test_plot.pickle', rnn_test_plot),
)
for out_path, generated in results_to_save:
    with open(out_path, 'wb') as f:
        pickle.dump(generated, f)

# Result Evaluation

## Evaluating RNN Outputs

In [10]:
%%capture
# Install the packages needed for the BERTScore evaluation below; the
# %%capture cell magic above suppresses pip's output. Versions are not pinned.
!pip install datasets
!pip install bert_score

In [39]:
# datasets require pyarrow version 2.0
# import pyarrow
# print(pyarrow.__version__) 

In [12]:
import warnings
warnings.filterwarnings("ignore")
from datasets import load_metric

In [37]:
# Reload previously generated RNN outputs; `with` closes each file after reading.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("rnn_test_gutten.pickle", "rb") as f:
    rnn_test_gutten = pickle.load(f)
with open('rnn_test_brown.pickle', 'rb') as f:
    rnn_test_brown = pickle.load(f)
with open("rnn_test_plot.pickle", "rb") as f:
    rnn_test_plot = pickle.load(f)

### Calculating Bert score for **Guttenberg** results from RNN

In [14]:
# Score once and read both statistics from the same result; the previous
# version ran the full BERTScore computation twice on identical inputs.
# torch.as_tensor accepts the scores whether they come back as a tensor or a list.
bertscore_gutten = load_metric('bertscore')
bertscore_gutten.add_batch(predictions=rnn_test_gutten, references=test_gutten_s)
rnn_gutt_scores = bertscore_gutten.compute(lang='en')
rnn_gutt_precision = torch.as_tensor(rnn_gutt_scores['precision']).mean().item()
rnn_gutt_f1 = torch.as_tensor(rnn_gutt_scores['f1']).mean().item()

### Calculating Bert score for **Brown** results from RNN

In [15]:
# Score once and read both statistics from the same result; the previous
# version ran the full BERTScore computation twice on identical inputs.
# torch.as_tensor accepts the scores whether they come back as a tensor or a list.
bertscore_brown = load_metric('bertscore')
bertscore_brown.add_batch(predictions=rnn_test_brown, references=test_brown_s)
rnn_brown_scores = bertscore_brown.compute(lang='en')
rnn_brown_precision = torch.as_tensor(rnn_brown_scores['precision']).mean().item()
rnn_brown_f1 = torch.as_tensor(rnn_brown_scores['f1']).mean().item()

### Calculating Bert score for **Movie plot (2 token input)** results from RNN

In [16]:
# Score once and read both statistics from the same result; the previous
# version ran the full BERTScore computation twice on identical inputs.
# torch.as_tensor accepts the scores whether they come back as a tensor or a list.
bertscore_plot = load_metric('bertscore')
bertscore_plot.add_batch(predictions=rnn_test_plot, references=test_plot_s)
rnn_plot_scores = bertscore_plot.compute(lang='en')
rnn_plot_precision = torch.as_tensor(rnn_plot_scores['precision']).mean().item()
rnn_plot_f1 = torch.as_tensor(rnn_plot_scores['f1']).mean().item()

### Calculating Bert score for **Movie plot (4 token input)** results from RNN

In [34]:
# Score once and read both statistics from the same result; the previous
# version ran the full BERTScore computation twice on identical inputs.
# torch.as_tensor accepts the scores whether they come back as a tensor or a list.
bertscore_plot = load_metric('bertscore')
bertscore_plot.add_batch(predictions=rnn_test_plot_4, references=test_plot_s)
rnn_plot_4_scores = bertscore_plot.compute(lang='en')
rnn_plot_4_precision = torch.as_tensor(rnn_plot_4_scores['precision']).mean().item()
rnn_plot_4_f1 = torch.as_tensor(rnn_plot_4_scores['f1']).mean().item()

In [35]:
# Table of RNN BERTScore results: one group per metric, one row per data set.
rnn_report = (
    ("Precision", (("Movie Plot 2 token", rnn_plot_precision),
                   ("Movie Plot 4 token", rnn_plot_4_precision),
                   ("Guttenberg 2 token", rnn_gutt_precision),
                   ("Brown Crop 2 token", rnn_brown_precision))),
    ("F-1 score", (("Movie Plot 2 token", rnn_plot_f1),
                   ("Movie Plot 4 token", rnn_plot_4_f1),
                   ("Guttenberg 2 token", rnn_gutt_f1),
                   ("Brown Crop 2 token", rnn_brown_f1))),
)
for metric_name, rows in rnn_report:
    print("\n")
    for label, value in rows:
        print(f"RNN | {metric_name} | {label}| {round(value,3)}")



RNN | Precision | Movie Plot 2 token| 0.845
RNN | Precision | Movie Plot 4 token| 0.856
RNN | Precision | Guttenberg 2 token| 0.857
RNN | Precision | Brown Crop 2 token| 0.852


RNN | F-1 score | Movie Plot 2 token| 0.842
RNN | F-1 score | Movie Plot 4 token| 0.856
RNN | F-1 score | Guttenberg 2 token| 0.861
RNN | F-1 score | Brown Crop 2 token| 0.856


## Evaluating HMM Outputs

In [4]:
# Load the generative-model outputs (each item is joined into one
# space-separated string) and the original test sentences. `with` closes each
# file after reading.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("test_gen_gutten.pickle", "rb") as f:
    gen_test_gutten = pickle.load(f)
gen_test_gutten = [" ".join(item) for item in gen_test_gutten]

with open("test_gen_brown.pickle", "rb") as f:
    gen_test_brown = pickle.load(f)
gen_test_brown = [" ".join(item) for item in gen_test_brown]

with open("test_gen_plot_unrestricted.pickle", "rb") as f:
    gen_test_plot = pickle.load(f)
gen_test_plot = [" ".join(item) for item in gen_test_plot]

with open("test_orig_gutten.pickle", "rb") as f:
    test_gutten = pickle.load(f)
with open("test_orig_brown.pickle", "rb") as f:
    test_brown = pickle.load(f)
with open('test_orig_plot.pickle', 'rb') as f:
    test_plot = pickle.load(f)

### Calculating Bert score for **Guttenberg** results from Generative model (HMM)

In [5]:
# Score once and read both statistics from the same result; the previous
# version ran the full BERTScore computation twice on identical inputs.
# torch.as_tensor accepts the scores whether they come back as a tensor or a list.
bertscore_gutten_gen = load_metric('bertscore')
bertscore_gutten_gen.add_batch(predictions=gen_test_gutten, references=test_gutten)
gen_gutt_scores = bertscore_gutten_gen.compute(lang='en')
gen_gutt_precision = torch.as_tensor(gen_gutt_scores['precision']).mean().item()
gen_gutt_f1 = torch.as_tensor(gen_gutt_scores['f1']).mean().item()

### Calculating Bert score for **Brown** results from Generative model (HMM)

In [6]:
# Score once and read both statistics from the same result; the previous
# version ran the full BERTScore computation twice on identical inputs.
# torch.as_tensor accepts the scores whether they come back as a tensor or a list.
bertscore_brown_gen = load_metric('bertscore')
bertscore_brown_gen.add_batch(predictions=gen_test_brown, references=test_brown)
gen_brown_scores = bertscore_brown_gen.compute(lang='en')
gen_brown_precision = torch.as_tensor(gen_brown_scores['precision']).mean().item()
gen_brown_f1 = torch.as_tensor(gen_brown_scores['f1']).mean().item()

### Calculating Bert score for **Movie plot** results from Generative model (HMM)

In [7]:
# Score once and read both statistics from the same result; the previous
# version ran the full BERTScore computation twice on identical inputs.
# torch.as_tensor accepts the scores whether they come back as a tensor or a list.
bertscore_plot_gen = load_metric('bertscore')
bertscore_plot_gen.add_batch(predictions=gen_test_plot, references=test_plot)
gen_plot_scores = bertscore_plot_gen.compute(lang='en')
gen_plot_precision = torch.as_tensor(gen_plot_scores['precision']).mean().item()
gen_plot_f1 = torch.as_tensor(gen_plot_scores['f1']).mean().item()

In [9]:
# Table of generative-model BERTScore results: one group per metric.
gen_report = (
    ("Precision", (("Movie Plot", gen_plot_precision),
                   ("Guttenberg", gen_gutt_precision),
                   ("Brown Crop", gen_brown_precision))),
    ("F-1 score", (("Movie Plot", gen_plot_f1),
                   ("Guttenberg", gen_gutt_f1),
                   ("Brown Crop", gen_brown_f1))),
)
for metric_name, rows in gen_report:
    print("\n")
    for label, value in rows:
        print(f"Generative | {metric_name} | {label} | {round(value,3)}")



Generative | Precision | Movie Plot | 0.846
Generative | Precision | Guttenberg | 1.0
Generative | Precision | Brown Crop | 0.851


Generative | F-1 score | Movie Plot | 0.852
Generative | F-1 score | Guttenberg | 1.0
Generative | F-1 score | Brown Crop | 0.853


In [36]:
# RNN BERTScore table again, shown next to the generative-model table for comparison.
rnn_report = (
    ("Precision", (("Movie Plot 2 token", rnn_plot_precision),
                   ("Movie Plot 4 token", rnn_plot_4_precision),
                   ("Guttenberg 2 token", rnn_gutt_precision),
                   ("Brown Crop 2 token", rnn_brown_precision))),
    ("F-1 score", (("Movie Plot 2 token", rnn_plot_f1),
                   ("Movie Plot 4 token", rnn_plot_4_f1),
                   ("Guttenberg 2 token", rnn_gutt_f1),
                   ("Brown Crop 2 token", rnn_brown_f1))),
)
for metric_name, rows in rnn_report:
    print("\n")
    for label, value in rows:
        print(f"RNN | {metric_name} | {label}| {round(value,3)}")



RNN | Precision | Movie Plot 2 token| 0.845
RNN | Precision | Movie Plot 4 token| 0.856
RNN | Precision | Guttenberg 2 token| 0.857
RNN | Precision | Brown Crop 2 token| 0.852


RNN | F-1 score | Movie Plot 2 token| 0.842
RNN | F-1 score | Movie Plot 4 token| 0.856
RNN | F-1 score | Guttenberg 2 token| 0.861
RNN | F-1 score | Brown Crop 2 token| 0.856


*****