In [None]:
import numpy as np
import pandas as pd 
import re
import tensorflow as tf
from time import time
from tensorflow.python.keras.layers import Layer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, TimeDistributed, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras import backend as K 
from matplotlib import pyplot
from numpy import array
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Attention layer class that is later used when building our model.
# Source: https://arxiv.org/pdf/1409.0473.pdf
# Source: https://colab.research.google.com/drive/1XrjPL3O_szhahYZW0z9yhCl9qvIcJJYW 

class AttentionLayer(Layer):
    """
    Additive (Bahdanau) attention, see https://arxiv.org/pdf/1409.0473.pdf.

    The layer owns three trainable weight matrices, W_a, U_a and V_a. It is
    called with a list ``[encoder_output_sequence, decoder_output_sequence]``
    and returns two tensors: the context vectors and the attention weights,
    one row per decoder time step.
    """

    def __init__(self, **kwargs):
        """Forward every keyword argument (e.g. ``name``) to the base ``Layer``."""
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """
        Create the three attention weights.

        input_shape: list ``[encoder_output_shape, decoder_output_shape]``;
            index 2 of each shape is used as that sequence's feature size.
        """
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.

        # W_a: (encoder_dim, encoder_dim), multiplied with the encoder outputs.
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # U_a: (decoder_dim, encoder_dim), projects a decoder state into the encoder feature space.
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # V_a: (encoder_dim, 1), reduces each tanh-activated vector to a scalar energy.
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        Compute context vectors and attention weights.

        inputs: [encoder_output_sequence, decoder_output_sequence]; must be a
            plain ``list`` (the assertion below rejects tuples).
        verbose: when True, print the shape of each intermediate tensor.

        Returns ``(c_outputs, e_outputs)``: the context vectors and the
        softmax-normalised attention weights, both produced by ``K.rnn``.
        """
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """
            Step function for computing the attention weights of a single decoder state.

            ``inputs`` is the slice handed over by ``K.rnn`` for one step
            (presumably (batch_size, decoder_dim) -- TODO confirm); ``states``
            is required by the ``K.rnn`` contract but is not read.
            """

            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            """ Some parameters required for shaping tensors"""
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            # NOTE(review): de_hidden is assigned but never used below.
            de_hidden = inputs.shape[-1]

            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch_size*en_seq_len, latent_dim
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            # <= batch_size, en_seq_len, latent_dim (reshaped back to 3-D after the dot product)
            # NOTE(review): this product does not depend on the decoder step, yet it is recomputed on every step.
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s>',W_a_dot_s.shape)

            """ Computing hj.Ua """
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)

            """ tanh(S.Wa + hj.Ua) """
            # The size-1 axis of U_a_dot_h broadcasts over en_seq_len in the sum.
            # <= batch_size*en_seq_len, latent_dim
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            # <= batch_size, en_seq_len (normalised so the weights over encoder positions sum to 1)
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            # K.rnn step functions return (output, new_states); the weights serve as both.
            return e_i, [e_i]

        def context_step(inputs, states):
            """
            Step function for computing the context vector ci from the attention weights ei.

            ``inputs`` holds the attention weights of one decoder step;
            ``states`` is not read.
            """
            # Weighted sum of the encoder outputs over the encoder time axis.
            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        def create_inital_state(inputs, hidden_size):
            """Build an all-zero (batch_size, hidden_size) tensor derived from ``inputs``."""
            # We are not using initial states, but need to pass something to the K.rnn function
            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim)
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, hidden_size)
            return fake_state

        fake_state_c = create_inital_state(encoder_out_seq, encoder_out_seq.shape[-1])  # <= (batch_size, latent_dim)
        fake_state_e = create_inital_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len)

        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        """ Computing context vectors """
        # c_outputs: one context vector per decoder step, computed from e_outputs
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """
        Shapes of the two tensors returned by ``call``.

        NOTE(review): the first shape takes its last dimension from the decoder
        input (input_shape[1][2]), while ``call`` builds the context vectors
        from encoder_out_seq; the two agree only when encoder and decoder
        feature sizes are equal -- confirm before using different sizes.
        """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [None]:
# Load the wiki movie dataset and keep the first `num_samples` rows that have both a plot and a title

wikimoviefile = 'wiki_titles.csv'
orgdatafile = pd.read_csv(wikimoviefile)
num_samples = 1000
# Rows missing either column are useless for training; re-number what is left from 0
datafile = orgdatafile.dropna(subset=['Plot','Titles'], axis=0).reset_index(drop=True)
# Take the first rows by position. The previous drop(labels=range(num_samples, 21710))
# hard-coded the row count of one particular CSV: it raised KeyError on a shorter
# frame and left extra rows on a longer one.
datafile = datafile.head(num_samples)

print('Data shape is ', datafile.shape)

In [None]:
# Lookup table from an English contraction to its expanded form, so that both
# spellings end up as the same tokens. text_cleaner lowercases the text before
# consulting this table, so the capitalised "I..." keys are never hit there.
contraction_mapping = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
    "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
    "I'm": "I am", "I've": "I have",
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have",
    "i'm": "i am", "i've": "i have",
    "isn't": "is not",
    "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have",
    "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so as",
    "this's": "this is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have",
    "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have",
    "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have",
}


In [None]:
# Method that cleans text and splits them into a list of strings

def text_cleaner(text):
    """
    Normalise a raw plot or title and return it as a list of tokens.

    Steps, in order: cast to ``str``, lowercase, drop parenthesised asides and
    double quotes, expand contractions through ``contraction_mapping``, strip
    the possessive ``'s``, blank out digits and remaining special characters,
    and surround punctuation with spaces so each mark becomes its own token.

    text: any value; it is converted with ``str()`` first.
    Returns: list of str, split on whitespace (empty list for blank input).
    """
    # Converts all text to a lowercased string
    newString = str(text).lower()
    # Gets rid of anything written inside parentheses
    newString = re.sub(r'\([^)]*\)', '', newString)
    # Gets rid of quotations
    newString = re.sub('"','', newString)
    # Replaces contractions with their expanded form. Matching is per
    # space-separated word, so a contraction glued to punctuation or a newline
    # (e.g. "don't,") is left as is.
    newString = ' '.join(contraction_mapping.get(t, t) for t in newString.split(" "))
    # Strips the possessive 's wherever it ends a word. The previous pattern
    # r"'s\n" only matched right before a newline and removed that newline too,
    # which glued "john's\nhouse" into the single token "johnhouse".
    newString = re.sub(r"'s\b","",newString)
    # Replaces digits and the remaining special characters with a space
    newString = re.sub(r'[?$()_\-—\d{}/#&%<>=@\*~:;\\\+\']',r' ', newString)
    # Make sure punctuation marks are separated so they can be their own tokens
    # ("?", ";" and ":" were already blanked above, so only ",", "." and "!" remain)
    newString = re.sub(r'([,.!;:?])', r' \1 ', newString)

    return newString.split()

In [None]:
# Tokenise the "Plot" and "Titles" columns with text_cleaner and collect the results in clean_data

clean_data = pd.DataFrame()

# Each plot is capped at its first 150 tokens
clean_data['plots'] = datafile['Plot'].apply(text_cleaner).apply(lambda tokens: tokens[:150])

print(clean_data['plots'])

# Titles are kept at full length
clean_data['titles'] = datafile['Titles'].apply(text_cleaner)

print(clean_data['titles'])

In [None]:
# Report the longest plot and the longest title (in tokens); these guide the padding lengths used for the model

# Longest plot; 0 when there are no rows
max_words = max((len(tokens) for tokens in clean_data['plots']), default=0)
print(max_words)


# Longest title; max_words is reused, so it holds this value afterwards
max_words = max((len(tokens) for tokens in clean_data['titles']), default=0)
print(max_words)

In [None]:
# Build a word -> vector lookup from the pre-trained GloVe embedding file

embeddings_index = {}
# Each line is "<word> <coef> <coef> ...". The file is opened as UTF-8 explicitly
# so loading does not depend on the platform default encoding, and through a
# context manager so the handle is released even if a line fails to parse.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Method that tokenizes words, pads sequences, and creates an embedding matrix

def doc2seq(texts, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM):
    """
    Turn texts into padded integer sequences plus a matching embedding matrix.

    texts: the documents handed to the Keras ``Tokenizer``.
    MAX_SEQUENCE_LENGTH: length every sequence is padded to (zeros are appended).
    EMBEDDING_DIM: width of the embedding vectors; must match the vectors
        stored in the module-level ``embeddings_index``.

    Returns ``(data, embedding_matrix, word_index, index_word)``:
        data             -- padded id sequences from ``pad_sequences``
        embedding_matrix -- (vocabulary size + 1, EMBEDDING_DIM); row i holds the
                            pre-trained vector of word id i, rows of words
                            missing from ``embeddings_index`` (and row 0) stay zero
        word_index       -- word -> id mapping
        index_word       -- id -> word mapping
    """
    # Fit a vocabulary (ids ordered by word frequency) on the texts
    tokenizer = Tokenizer(filters='"#$%&()*+-/<=>@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(texts)

    # Encode the texts as ids and pad them at the end up to the requested length
    data = pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
    )

    word_index = tokenizer.word_index

    # One row per word id, plus row 0 which no word uses
    vocab_rows = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
    for word, row in word_index.items():
        vector = embeddings_index.get(word)
        if vector is None:
            # No pre-trained vector for this word: leave its row at zero
            continue
        embedding_matrix[row] = vector

    return data, embedding_matrix, word_index, tokenizer.index_word

In [None]:
# Sequence lengths and embedding width shared by the rest of the notebook
MAX_PLOT_LENGTH = 150
MAX_TITLE_LENGTH = 10
EMBEDDING_DIM = 300

# Encoder side: plots -> padded id sequences, embedding matrix and vocabulary maps
data = clean_data['plots']
x_data, encoder_emb, x_word_index, x_index_word = doc2seq(data, MAX_PLOT_LENGTH, EMBEDDING_DIM)

# Decoder side: titles -> padded id sequences, embedding matrix and vocabulary maps
data = clean_data['titles']
y_data, decoder_emb, y_word_index, y_index_word = doc2seq(data, MAX_TITLE_LENGTH, EMBEDDING_DIM)

In [None]:
# Split into train / dev / test: 30% is held out first, and that hold-out is
# then divided again with a third of it (test_size=0.33) becoming the test set

x_train, x_test_temp, y_train, y_test_temp = train_test_split(
    x_data, y_data, test_size=0.3, random_state=0
)
x_dev, x_test, y_dev, y_test = train_test_split(
    x_test_temp, y_test_temp, test_size=0.33, random_state=0
)

In [None]:
# Number of units of every encoder and decoder LSTM; also the size of the state inputs of the inference decoder
hidden_units = 200

In [None]:
# Frozen embedding layers for the encoder (plots) and the decoder (titles),
# both initialised from the matrices returned by doc2seq

enc_embedding_layer = Embedding(
    input_dim=len(x_word_index) + 1,   # one row per plot word id, plus id 0
    output_dim=EMBEDDING_DIM,
    weights=[encoder_emb],             # start from the pre-trained vectors
    input_length=MAX_PLOT_LENGTH,
    trainable=False,                   # vectors are not updated during training
    name='EncoderEmbeddingLayer',
)


dec_embedding_layer = Embedding(
    input_dim=len(y_word_index) + 1,   # one row per title word id, plus id 0
    output_dim=EMBEDDING_DIM,
    weights=[decoder_emb],
    input_length=MAX_TITLE_LENGTH,
    trainable=False,
    name='DecoderEmbeddingLayer',
)

In [None]:
# Word-level seq2seq model, part 1: the encoder

# Plots are already padded to MAX_PLOT_LENGTH, so the input length is fixed
encoder_inputs = Input(shape=(MAX_PLOT_LENGTH,), name="EncoderInput")
enc_emb = enc_embedding_layer(encoder_inputs)

# Three stacked LSTMs; each returns its full output sequence and its final states
encoder_lstm1 = LSTM(units=hidden_units, return_sequences=True, return_state=True, name='EncLSTM1')
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

encoder_lstm2 = LSTM(units=hidden_units, return_sequences=True, return_state=True, name='EncLSTM2')
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# Only the outputs and states of this last layer are passed on to the decoder
encoder_lstm3 = LSTM(units=hidden_units, return_sequences=True, return_state=True, name='EncLSTM3')
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

In [None]:
# Word-level seq2seq model, part 2: the decoder

# Variable-length input so the same layers can be reused one token at a time at inference
decoder_inputs = Input(shape=(None,), name='DecoderInput')
dec_emb = dec_embedding_layer(decoder_inputs)

# The decoder LSTM starts from the final states of the last encoder LSTM.
# The two returned states are the LSTM's final hidden and cell state.
decoder_lstm = LSTM(units=hidden_units, return_sequences=True, return_state=True, name='DecLSTM1')
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(
    dec_emb, initial_state=[state_h, state_c]
)

# Attention over the encoder outputs, queried by every decoder output
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# Join each decoder output with its attention context along the feature axis
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

# Time-distributed dense layer: a softmax over the title vocabulary at every step
decoder_dense = TimeDistributed(Dense(len(y_word_index) + 1, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

In [None]:
# Training model: (plot ids, shifted title ids) -> per-step distribution over title words
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.summary()

In [None]:
# Compile and train the model

model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# TensorBoard logs go to a fresh, timestamped directory per run. Only the loss is
# compiled in, so training and validation loss are what gets tracked; comparing
# the two is how overfitting shows up.
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

# Decoder input = the target title shifted right by one position, with 0 in the first slot
decoder_in_train = np.hstack((np.zeros((y_train.shape[0], 1)), y_train[:, :-1]))
decoder_in_dev = np.hstack((np.zeros((y_dev.shape[0], 1)), y_dev[:, :-1]))

history = model.fit(
    [x_train, decoder_in_train],
    y_train,
    epochs=19,
    batch_size=9,
    callbacks=[tensorboard],
    validation_data=([x_dev, decoder_in_dev], y_dev),
)

In [None]:
# Plot training loss against validation loss, one point per epoch

for series_name in ('loss', 'val_loss'):
    pyplot.plot(history.history[series_name])
pyplot.title('model train vs validation loss')
pyplot.xlabel('epoch')
pyplot.ylabel('loss')
pyplot.legend(['train', 'validation'], loc='upper right')
pyplot.show()

In [None]:
# Rebuild the trained layers into separate encoder and decoder models for step-by-step title generation

# Encoder inference: plot ids -> encoder output sequence plus final hidden and cell state
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# Decoder inference: the LSTM states and the encoder outputs are fed in explicitly
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_hidden_state_input = Input(shape=(MAX_PLOT_LENGTH, hidden_units))

# Run the trained decoder LSTM from the supplied states
# (state_h2 / state_c2 are rebound here to the decoder's new states)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb,
    initial_state=[decoder_state_input_h, decoder_state_input_c],
)

# Reuse the trained attention layer on the supplied encoder outputs
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# Reuse the trained softmax layer to get a distribution over the title vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Decoder model: (token ids, encoder outputs, h, c) -> (word distribution, new h, new c)
decoder_model = Model(
    inputs=[decoder_inputs, decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    outputs=[decoder_outputs2, state_h2, state_c2],
)

In [None]:
# Takes tokens of title words and creates a string of words from y_index_word
def seq2title(input_seq):
    """
    Convert a sequence of title token ids back into text.

    Ids equal to 0 are skipped; every other id is looked up in
    ``y_index_word``. Each word is followed by a single space, so a
    non-empty result ends with a trailing space.
    """
    return ''.join(y_index_word[token_id] + ' ' for token_id in input_seq if token_id != 0)

# Takes tokens of plot words and creates a string of words from x_index_word
def seq2plot(input_seq):
    """
    Convert a sequence of plot token ids back into text.

    Ids equal to 0 are skipped; every other id is looked up in
    ``x_index_word``. Each word is followed by a single space, so a
    non-empty result ends with a trailing space.
    """
    return ''.join(x_index_word[token_id] + ' ' for token_id in input_seq if token_id != 0)

In [None]:
# Method that takes an input and predicts what our model will output using our encoder and decoder model

def decode_sequence(input_seq):
    """
    Greedily generate a title for one encoded plot.

    input_seq: plot token ids in the shape ``encoder_model`` expects
        (callers in this notebook pass a single row reshaped to (1, -1)).

    Returns the generated words as one string in which every word is
    preceded by a space. Generation stops once the string holds
    ``MAX_TITLE_LENGTH - 1`` words; an id that is missing from
    ``y_index_word`` is written as '.'.
    """
    # Encode the plot once: encoder outputs plus the states that seed the decoder
    enc_seq, dec_h, dec_c = encoder_model.predict(input_seq)

    # The first decoder input is token id 0
    next_input = np.zeros((1, 1))

    word_limit = MAX_TITLE_LENGTH - 1
    decoded_sentence = ''
    while True:
        output_tokens, dec_h, dec_c = decoder_model.predict([next_input, enc_seq, dec_h, dec_c])

        # Greedy choice: the most probable word at the last time step
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        decoded_sentence += ' ' + y_index_word.get(sampled_token_index, '.')

        # Stop once the title has reached its maximum length
        if len(decoded_sentence.split()) >= word_limit:
            break

        # Feed the chosen word back in as the next (length-1) decoder input
        next_input = np.zeros((1, 1))
        next_input[0, 0] = sampled_token_index

    return decoded_sentence

In [None]:
# Generates numerical measurement for the comprehension of an individual title
# BLEU score, unigram (word by word)

def calc_indiv_BLEU(id_text, text_df, title_df):
    """
    Unigram BLEU score of the generated title for one sample.

    id_text: row index into ``text_df`` and ``title_df``.
    text_df: indexable collection of plot id sequences; the selected row is
        reshaped to (1, -1) before being decoded.
    title_df: indexable collection of title id sequences for the same samples.

    Returns the ``sentence_bleu`` score of the generated words against the
    real title's words, or 0.0 when either side has no words left after
    filtering.
    """
    # Generates the decoded title from the plot
    gen_output = decode_sequence(text_df[id_text].reshape(1,-1))
    # Splits the output into words, dropping empty strings and periods
    candidate = [item for item in gen_output.split(" ") if item not in (".", "")]
    # Gets the real title for the corresponding plot
    gen_ref = seq2title(title_df[id_text])
    # Splits the real title into words, dropping empty strings and periods
    reference = [item for item in gen_ref.split(" ") if item not in (".", "")]

    # Nothing to compare: report no overlap instead of scoring an empty sequence
    if not candidate or not reference:
        return 0.0

    # sentence_bleu expects a list of tokenised references and a tokenised
    # hypothesis. Passing the raw strings (as before) made it iterate over
    # characters, so the score measured character overlap, not word overlap.
    score = sentence_bleu([reference], candidate, weights=(1, 0, 0, 0))

    return score

In [None]:
# Show the BLEU score, plot, real title and generated title for ten test samples

for sample_idx in range(45, 55):
    print(f"BLEU score(Unigram): {calc_indiv_BLEU(sample_idx, x_test, y_test)}")
    print(f"Plot: {seq2plot(x_test[sample_idx])}")
    print(f"Original Title: {seq2title(y_test[sample_idx])}")
    # The row is reshaped to (1, -1) so it matches the batch shape the encoder model expects
    print(f"Generated Title: {decode_sequence(x_test[sample_idx].reshape(1,-1))}")
    print('___________________________________________________________________________________')