# Movie Plot Generator with LSTM model: Romance

### W266 Final Project


## Setup

### Import libraries

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
import pandas as pd 
import numpy as np
import re
import os
import time
import nltk
from nltk import tokenize
!pip install rouge
from rouge import Rouge 
from datetime import datetime
from google.colab import files
nltk.download('punkt')

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Mount Drive



In [0]:
from google.colab import drive
drive.mount("/content/drive/")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
# !pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()

### Read the data

In [0]:
id = '1hMv9RfBn5sAdK58_fJCpEFRbwEKITtS3' # horror train plot
# id = '1R_7jnjvw7Gp_GxVEQ1W6HrBBJU12iwVR' # romance train plot
gdrive = GoogleDrive(gauth)
downloaded = gdrive.CreateFile({'id':id}) 
downloaded.GetContentFile('thrilhor_train_plot.txt')  
# downloaded.GetContentFile('romance_train_plot.txt')  

In [0]:
# Read and decode
text = open('thrilhor_train_plot.txt', 'rb').read().decode(encoding='utf-8')
# print the number of characters in the text
print ('Length of text: {} characters'.format(len(text)))
# lowercase the text
text = text.lower()

Length of text: 12646503 characters


In [0]:
# Take a look at the first 250 characters in text
print(text[:250])

plot
the four mathematicians are gathered and meet with a top official of the united states department of defense. after some discussion, the group agrees that they must be wary with whom to trust and control their solution. the official offers them 


In [0]:
# The unique characters in the file
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

158 unique characters


In [0]:
# Only take lowercase characters and ' ', ',', '.'
vocab = ['a', 'b', 'c', 'd', 'e',
 'f', 'g', 'h', 'i', 'j', 'k',
 'l', 'm', 'n', 'o', 'p', 'q',
 'r', 's', 't', 'u', 'v', 'w',
 'x', 'y', 'z', ' ', ',', '.']

## Process the text

### Vectorize the text


In [0]:
# Before training, we need to map strings to a numerical representation. 
# Create two lookup tables: one mapping characters to numbers, 
# and another for numbers to characters.

# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = []
text_clean = ''
for c in text:
  if c in char2idx.keys():
    text_clean += str(c)
    text_as_int.append(char2idx[c])

text_as_int = np.array(text_as_int)

In [0]:
# print the mapping
print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  'a' :   0,
  'b' :   1,
  'c' :   2,
  'd' :   3,
  'e' :   4,
  'f' :   5,
  'g' :   6,
  'h' :   7,
  'i' :   8,
  'j' :   9,
  'k' :  10,
  'l' :  11,
  'm' :  12,
  'n' :  13,
  'o' :  14,
  'p' :  15,
  'q' :  16,
  'r' :  17,
  's' :  18,
  't' :  19,
  ...
}


In [0]:
# Show how the first 13 characters from the text are mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text_clean[:13]), text_as_int[:13]))

'plotthe four ' ---- characters mapped to int ---- > [15 11 14 19 19  7  4 26  5 14 20 17 26]


### Prediction

### Create training examples and targets

In [0]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
examples_per_epoch = len(text_clean)//(seq_length+1)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

p
l
o
t
t


In [0]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

'plotthe four mathematicians are gathered and meet with a top official of the united states department'
' of defense. after some discussion, the group agrees that they must be wary with whom to trust and co'
'ntrol their solution. the official offers them a reward of  million in exchange for their portion of '
'the algorithm, swaying them by attempting to address their concerns. only one of the four speaks out '
'against the sale, and in doing so is forced to reveal a dark truth about his portion of the solution.'


In [0]:
# For each sequence, duplicate and shift it to form the input and target text 
# by using the map to apply a simple function to each batch:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [0]:
# Print the first examples input and target values
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'plotthe four mathematicians are gathered and meet with a top official of the united states departmen'
Target data: 'lotthe four mathematicians are gathered and meet with a top official of the united states department'


In [0]:
# Each index of these vectors are processed as one time step. For the input 
# at time step 0, the model receives the index for "F" and trys to predict the 
# index for "i" as the next character. At the next timestep, it does the same 
# thing but the `RNN` considers the previous step context in addition to the 
# current input character.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 15 ('p')
  expected output: 11 ('l')
Step    1
  input: 11 ('l')
  expected output: 14 ('o')
Step    2
  input: 14 ('o')
  expected output: 19 ('t')
Step    3
  input: 19 ('t')
  expected output: 19 ('t')
Step    4
  input: 19 ('t')
  expected output: 7 ('h')


### Create training batches

In [0]:
# shuffle the data and pack it into batches.

# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

## Build The Model

In [0]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [0]:
# def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
#   model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(vocab_size, embedding_dim,
#                               batch_input_shape=[batch_size, None]),
#     tf.keras.layers.LSTM(rnn_units,
#                         return_sequences=True,
#                         stateful=True,
#                         recurrent_initializer='glorot_uniform'),
#     tf.keras.layers.Dropout(0.2), 
#     tf.keras.layers.Dense(vocab_size, activation='softmax'),
#   ])
#   return model

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)            
  ])
  return model

# def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
#   model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(vocab_size, embedding_dim,
#                               batch_input_shape=[batch_size, None]),
#     # tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.LSTM(rnn_units,
#                         return_sequences=True,
#                         stateful=True,
#                         recurrent_initializer='glorot_uniform'),
#     # tf.keras.layers.Dropout(0.2), 
#     # tf.keras.layers.LSTM(rnn_units,
#     #                     return_sequences=True,
#     #                     stateful=True,
#     #                     recurrent_initializer='glorot_uniform'),
#     # tf.keras.layers.Dropout(0.2), 
#     tf.keras.layers.Dense(vocab_size)            
#   ])
#   return model

In [0]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

## Try the model


In [0]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 29) # (batch_size, sequence_length, vocab_size)


In [0]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           7424      
_________________________________________________________________
lstm_2 (LSTM)                (64, None, 1024)          5246976   
_________________________________________________________________
dense_1 (Dense)              (64, None, 29)            29725     
Total params: 5,284,125
Trainable params: 5,284,125
Non-trainable params: 0
_________________________________________________________________


In [0]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [0]:
# At each timestep, a prediction of the next character index
sampled_indices

array([13, 22,  5,  7,  4,  0, 14, 20, 19, 27, 21, 22, 24, 12, 11, 21, 21,
        1, 12, 13, 26, 22, 21,  2,  9,  0, 26, 12, 18, 24, 17, 18, 28, 18,
        2, 18,  6, 17, 26, 18, 22, 27, 13, 14,  7, 17,  6, 18, 20,  6, 11,
       27, 15, 10, 17, 14,  7,  8, 13, 25,  4, 13, 27, 21, 22,  1, 15, 21,
       26, 26, 10, 11, 15,  4,  8,  8, 16,  8, 27,  8, 16, 20, 17, 13, 14,
       17, 27, 28, 10, 22,  0, 23, 23,  5, 22, 21,  4, 28, 16, 21])

In [0]:
# Decode sampled_indices to see the text predicted by the untrained model

print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 ' her, their faces blurred. spending the next month in the hospital, in and out of consciousness and '

Next Char Predictions: 
 'nwfheaout,vwymlvvbmn wvcja msyrs.scsgr sw,nohrgsugl,pkrohinzen,vwbpv  klpeiiqi,iqurnor,.kwaxxfwve.qv'


## Train the model

### Attach an optimizer, and a loss function

In [0]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 29)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       3.3667743


In [0]:
optimizer = tf.keras.optimizers.Adam()

model.compile(optimizer=Adam, loss=loss)

### Configure checkpoints

In [0]:
# Ensure that checkpoints are saved during training

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Execute the training

To keep training time reasonable, use 10 epochs to train the model. In Colab, set the runtime to GPU for faster training.

In [0]:
EPOCHS=100

In [0]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100

## Generate text

### Restore the latest checkpoint

In [0]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_100'

In [0]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

In [0]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 256)            7424      
_________________________________________________________________
lstm_3 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_2 (Dense)              (1, None, 29)             29725     
Total params: 5,284,125
Trainable params: 5,284,125
Non-trainable params: 0
_________________________________________________________________


### The text generation


In [0]:
def generate_text(model, num_generate, start_string):
  # Evaluation step (generating text using the learned model)

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 0.8

  # Here batch size == 1
  model.reset_states()

  num_spaces = 0


  #for i in range(num_generate):
  i = 0
  while num_spaces <= num_generate:
      if i == 1000:
        break
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      if " " in idx2char[predicted_id]:
        num_spaces += 1

      text_generated.append(idx2char[predicted_id])
      i += 1

  return (start_string + ''.join(text_generated))

In [0]:
print(generate_text(model, 26, start_string=u"bella is killed by her husband, one dark and stormy night."))

bella is killed by her husband, one dark and stormy night. after sinker and chun finds the news in the car in love and buries a little police head of the three and calls her that a 


In [0]:
print(generate_text(model, 35, start_string=u"bella  marries  the  man  of  herdreams last summer on a boat."))

bella  marries  the  man  of  herdreams last summer on a boat. to the sunlight the other shotgun home men and supplies that the two destroy that they see the the several man and her shot with his story to claim to rest of the package and 


## Generate Test Plots

In [0]:
# !pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
id = '1RknpWIKhPrAdjI1vaA3j7XFofcrLV45s' # horror test data 
# id = '10WvoM9amr-y0e424ctACgwG3n1ocFMLo' # romance test data
gdrive = GoogleDrive(gauth)
downloaded = gdrive.CreateFile({'id':id}) 
downloaded.GetContentFile('Genre_thrilhor_test.csv')  
df = pd.read_csv('Genre_thrilhor_test.csv')
# downloaded.GetContentFile('Genre_romance_test.csv')  
# df = pd.read_csv('Genre_romance_test.csv')

In [0]:
df.shape

(600, 4)

In [0]:
#sentence extraction 1: len = total len / 2
#sentence extraction 1: len = total len - 2
def extract_beginning_sent(text):
  sent_tokens = tokenize.sent_tokenize(text)
  size = max(len(sent_tokens) - 2, 1)
  beg_text = sent_tokens[:size]
  beg = " ".join(beg_text)
  return beg

def extract_beginning_word(text, threshold):
  word_tokens = tokenize.word_tokenize(text)
  size = max(int(len(word_tokens) * threshold), 1)
  beg_text = word_tokens[:size]
  beg = " ".join(beg_text)
  return beg, len(text) - len(beg)

def plot_generation(subset, p, folder):
  start_time = time.time()
  subset = df
  padding = 15

  for index, row in subset.iterrows():
    if index % 20 == 0:
      print(index)
    plot = row['Plot']
    t, remaining = extract_beginning_word(plot, p)
    t = t.lower()
    t = re.sub(r'[^a-z\s]+','',t)
    generated_plot = generate_text(model, remaining + padding, t)
    file_name = "/content/" + folder + "/Plot Summary: " + row['Title'] + ".txt"
    file = open(file_name,"w")
    file.write(generated_plot)
    file.close() 
  print("My program took", time.time() - start_time,  "to run")

In [0]:
# plot_generation(df, 0.25, 'rom25')
# plot_generation(df, 0.5, 'rom50')
# plot_generation(df, 0.75, 'rom75')

plot_generation(df, 0.25, 'hor25')
plot_generation(df, 0.5, 'hor50')
plot_generation(df, 0.75, 'hor75')

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
My program took 2511.243899345398 to run
0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
My program took 2508.5567877292633 to run
0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
My program took 2222.580857515335 to run


## ROUGE evaluation

In [0]:
def find_plot_via_title(filename):
  title = filename[14:-4]
  plt = df.loc[df['Title'] == title]["Plot"].values[0]
  return plt

#sentence extraction: len = total len / 2
#sentence extraction: len = total len - 2
def extract_prediction_sent(text):
  sent_tokens = tokenize.sent_tokenize(text)
  size = max(len(sent_tokens) - 2, 1)
  beg_text = sent_tokens[size:]
  beg = " ".join(beg_text)
  return beg

def extract_prediction_word(text, amount):
  word_tokens = tokenize.word_tokenize(text)
  #size = max(int(len(word_tokens) * threshold), 1)
  prediction_tokens = word_tokens[amount:]
  prediction = " ".join(prediction_tokens)
  return prediction

def find_prefix(text, threshold):
  word_tokens = tokenize.word_tokenize(text)
  size = max(int(len(word_tokens) * threshold), 1)
  return size

def join_scores(s1, s2):
  temp = [{'rouge-1': {'f': 0, 'p': 0, 'r': 0},
          'rouge-2': {'f': 0, 'p': 0, 'r': 0},
          'rouge-l': {'f': 0, 'p': 0, 'r': 0}}]

  temp[0]['rouge-1']['f'] = s1[0]['rouge-1']['f'] + s2[0]['rouge-1']['f']
  temp[0]['rouge-1']['p'] = s1[0]['rouge-1']['p'] + s2[0]['rouge-1']['p'] 
  temp[0]['rouge-1']['r'] = s1[0]['rouge-1']['r'] + s2[0]['rouge-1']['r']  

  temp[0]['rouge-2']['f'] = s1[0]['rouge-2']['f'] + s2[0]['rouge-2']['f']
  temp[0]['rouge-2']['p'] = s1[0]['rouge-2']['p'] + s2[0]['rouge-2']['p'] 
  temp[0]['rouge-2']['r'] = s1[0]['rouge-2']['r'] + s2[0]['rouge-2']['r']  

  temp[0]['rouge-l']['f'] = s1[0]['rouge-l']['f'] + s2[0]['rouge-l']['f']
  temp[0]['rouge-l']['p'] = s1[0]['rouge-l']['p'] + s2[0]['rouge-l']['p'] 
  temp[0]['rouge-l']['r'] = s1[0]['rouge-l']['r'] + s2[0]['rouge-l']['r']  

  return temp


def average_scores(score, div):
  temp = [{'rouge-1': {'f': 0, 'p': 0, 'r': 0},
          'rouge-2': {'f': 0, 'p': 0, 'r': 0},
          'rouge-l': {'f': 0, 'p': 0, 'r': 0}}]

  temp[0]['rouge-1']['f'] = score[0]['rouge-1']['f'] / div
  temp[0]['rouge-1']['p'] = score[0]['rouge-1']['p'] / div
  temp[0]['rouge-1']['r'] = score[0]['rouge-1']['r']  / div

  temp[0]['rouge-2']['f'] = score[0]['rouge-2']['f'] / div
  temp[0]['rouge-2']['p'] = score[0]['rouge-2']['p'] / div
  temp[0]['rouge-2']['r'] = score[0]['rouge-2']['r']  / div 

  temp[0]['rouge-l']['f'] = score[0]['rouge-l']['f'] / div
  temp[0]['rouge-l']['p'] = score[0]['rouge-l']['p'] / div
  temp[0]['rouge-l']['r'] = score[0]['rouge-l']['r']  / div
  
  return temp

In [0]:
rouge = Rouge()

In [0]:
def rouge_calculations(directory_path, threshold, total):
  current_totals = [{'rouge-1': {'f': 0, 'p': 0, 'r': 0},
          'rouge-2': {'f': 0, 'p': 0, 'r': 0},
          'rouge-l': {'f': 0, 'p': 0, 'r': 0}}]

  current_prediction = [{'rouge-1': {'f': 0, 'p': 0, 'r': 0},
          'rouge-2': {'f': 0, 'p': 0, 'r': 0},
          'rouge-l': {'f': 0, 'p': 0, 'r': 0}}]
  directory = os.fsencode(directory_path)
  for file in os.listdir(directory):
    try:
      filename = os.fsdecode(file)
      full_path = os.path.join(directory_path, filename)
      file_open = open(full_path,mode='r')

      reference = find_plot_via_title(filename)
      num_tokens = find_prefix(reference, threshold)
      prediction_ref = extract_prediction_word(reference, num_tokens)

      plot = file_open.read()
      prediction_plot = extract_prediction_word(plot, num_tokens)
      
      total_scores = rouge.get_scores(plot, reference)
      prediction_scores = rouge.get_scores(prediction_plot, prediction_ref)
      
      current_totals = join_scores(current_totals, total_scores)
      current_prediction = join_scores(current_prediction, prediction_scores)
    except:
      total -= 1
  return average_scores(current_totals, total), average_scores(current_prediction, total)
  
def create_results_table(res25, res50, res75):
  labels = ["Rouge-1 F1", "Rouge-1 Precision", "Rouge-1 Recall", "Rouge-2 F1", "Rouge-2 Precision", "Rouge-2 Recall", "Rouge-L F1", "Rouge-L Precision", "Rouge-L Recall"]
  gen_results = pd.DataFrame(np.array([[res25['rouge-1']['f'],res50['rouge-1']['f'], res75['rouge-1']['f']], 
                                       [res25['rouge-1']['p'],res50['rouge-1']['p'], res75['rouge-1']['p']], 
                                       [res25['rouge-1']['r'],res50['rouge-1']['r'], res75['rouge-1']['r']],
                                       [res25['rouge-2']['f'],res50['rouge-2']['f'], res75['rouge-2']['f']], 
                                       [res25['rouge-2']['p'],res50['rouge-2']['p'], res75['rouge-2']['p']], 
                                       [res25['rouge-2']['r'],res50['rouge-2']['r'], res75['rouge-2']['r']],
                                       [res25['rouge-l']['f'],res50['rouge-l']['f'], res75['rouge-l']['f']], 
                                       [res25['rouge-l']['p'],res50['rouge-l']['p'], res75['rouge-l']['p']], 
                                       [res25['rouge-l']['r'],res50['rouge-l']['r'], res75['rouge-l']['r']]]),
                           columns = ["25% Prefix", "50% Prefix", "75% Prefix"],
                           index = labels)
  return gen_results

In [0]:
# directory_25 = "./rom25/"
# directory_50 = "./rom50/"
# directory_75 = "./rom75/"

# totals_25, pred_25 = rouge_calculations(directory_25, 0.25, 470)
# totals_50, pred_50 = rouge_calculations(directory_50, 0.5, 470)
# totals_75, pred_75 = rouge_calculations(directory_75, 0.75, 470)

directory_25 = "./hor25/"
directory_50 = "./hor50/"
directory_75 = "./hor75/"

totals_25, pred_25 = rouge_calculations(directory_25, 0.25, 600)
totals_50, pred_50 = rouge_calculations(directory_50, 0.5, 600)
totals_75, pred_75 = rouge_calculations(directory_75, 0.75, 600)

# display outcome
final_pred_results = create_results_table(pred_25[0], pred_50[0], pred_75[0])
display(final_pred_results)
final_total_results = create_results_table(totals_25[0], totals_50[0], totals_75[0])
display(final_total_results)

Unnamed: 0,25% Prefix,50% Prefix,75% Prefix
Rouge-1 F1,0.219223,0.204496,0.17442
Rouge-1 Precision,0.291239,0.24326,0.16774
Rouge-1 Recall,0.268368,0.283831,0.293087
Rouge-2 F1,0.027806,0.02455,0.01781
Rouge-2 Precision,0.041201,0.031577,0.01777
Rouge-2 Recall,0.031605,0.031035,0.028054
Rouge-L F1,0.140302,0.134335,0.122324
Rouge-L Precision,0.160502,0.139854,0.107222
Rouge-L Recall,0.171747,0.184951,0.202128


Unnamed: 0,25% Prefix,50% Prefix,75% Prefix
Rouge-1 F1,0.3371,0.430341,0.512696
Rouge-1 Precision,0.375962,0.407721,0.437611
Rouge-1 Recall,0.390145,0.539679,0.669518
Rouge-2 F1,0.157697,0.269461,0.376243
Rouge-2 Precision,0.183187,0.256569,0.320172
Rouge-2 Recall,0.174599,0.33727,0.494853
Rouge-L F1,0.278879,0.392707,0.502526
Rouge-L Precision,0.294113,0.368485,0.440767
Rouge-L Recall,0.314799,0.473075,0.618399


In [0]:
print(final_pred_results.to_csv())
print(final_pred_results.to_json())

,25% Prefix,50% Prefix,75% Prefix
Rouge-1 F1,0.21922304879871807,0.2044964557589695,0.1744197582881655
Rouge-1 Precision,0.2912392204016413,0.24326010292631708,0.16774043825526982
Rouge-1 Recall,0.26836815887685006,0.28383090940980454,0.2930867624703416
Rouge-2 F1,0.027805909221720305,0.02454980298449888,0.017810376809180305
Rouge-2 Precision,0.04120113580021869,0.03157670152231385,0.017769922604774505
Rouge-2 Recall,0.031605271005702824,0.03103514517308271,0.028053523067715174
Rouge-L F1,0.1403017448246059,0.13433477829036541,0.1223242111351257
Rouge-L Precision,0.1605018081369916,0.13985351510015306,0.10722167135104821
Rouge-L Recall,0.17174726119052905,0.18495095679853435,0.20212761151680256

{"25% Prefix":{"Rouge-1 F1":0.2192230488,"Rouge-1 Precision":0.2912392204,"Rouge-1 Recall":0.2683681589,"Rouge-2 F1":0.0278059092,"Rouge-2 Precision":0.0412011358,"Rouge-2 Recall":0.031605271,"Rouge-L F1":0.1403017448,"Rouge-L Precision":0.1605018081,"Rouge-L Recall":0.1717472612},"50% Prefix":

In [0]:
# zip files

# !zip -r /content/rom25.zip /content/rom25
# !zip -r /content/rom50.zip /content/rom50
# !zip -r /content/rom75.zip /content/rom75

!zip -r /content/hor25.zip /content/hor25
!zip -r /content/hor50.zip /content/hor50
!zip -r /content/hor75.zip /content/hor75

  adding: content/hor25/ (stored 0%)
  adding: content/hor25/Plot Summary: Dangerously Close.txt (deflated 48%)
  adding: content/hor25/Plot Summary: Curiosity.txt (deflated 47%)
  adding: content/hor25/Plot Summary: Dogs of Hell.txt (deflated 48%)
  adding: content/hor25/Plot Summary: Star Runners.txt (deflated 51%)
  adding: content/hor25/Plot Summary: Blood for Dracula.txt (deflated 49%)
  adding: content/hor25/Plot Summary: Southland Tales.txt (deflated 49%)
  adding: content/hor25/Plot Summary: Don't Go In The Woods.txt (deflated 50%)
  adding: content/hor25/Plot Summary: The Black Waters of Echo's Pond.txt (deflated 47%)
  adding: content/hor25/Plot Summary: Malone.txt (deflated 49%)
  adding: content/hor25/Plot Summary: Cold Steel.txt (deflated 48%)
  adding: content/hor25/Plot Summary: Nine Lives.txt (deflated 49%)
  adding: content/hor25/Plot Summary: Hanky Panky.txt (deflated 49%)
  adding: content/hor25/Plot Summary: Man on a Ledge.txt (deflated 51%)
  adding: content/hor25/

In [0]:
# save files

# files.download("/content/rom25.zip")
# files.download("/content/rom50.zip")
# files.download("/content/rom75.zip")

files.download("/content/hor25.zip")
files.download("/content/hor50.zip")
files.download("/content/hor75.zip")