# Conditional Encoder-Decoder Model (5.2)


*   STEP 1 TOKENIZE THE DATA
*   STEP 2 PREPROCESS THE DATA FOR TRAINING
*   STEP 3 BUILD & TRAIN THE ENCODER-DECODER MODEL
*   STEP 4 LET'S TRY OUR TRAINED MODEL!
*   STEP 5 EVALUATE RESULTS (BLEU & UNIVERSAL SENTENCE ENCODER COSINE SIMILARITY)



In [None]:
# Colab only: mount Google Drive at /content/drive, the location the data and
# checkpoint paths used below point into.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## STEP0 // IMPORT OUR STUFF

In [None]:
# !pip install numpy
import pandas as pd
import numpy as np
import ast
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import compress

import random
import sys
import io

In [None]:
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Masking, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import GRU, LSTM, Bidirectional
from tensorflow.keras.layers import Conv1D, Activation, Multiply
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint
from tensorflow.keras.optimizers import RMSprop, Adam, Adamax
from tensorflow.keras import activations

from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [None]:
# Folder on the mounted Drive that holds the project's CSV files.
# Declared once so the location can be changed in a single place.
DATA_DIR = '/content/drive/My Drive/CS230'

# Full corpus, plus (presumably) its pre-split train / test subsets.
data = pd.read_csv(f'{DATA_DIR}/finaldata.csv')
train = pd.read_csv(f'{DATA_DIR}/finaldata_train.csv')
test = pd.read_csv(f'{DATA_DIR}/finaldata_test.csv')

In [None]:
# Peek at the first five rows, restricted to the first ten columns.
data.iloc[:5, :10]

Unnamed: 0,genres,overview,title,vote_average,vote_count,length,num_genres,action,adventure,animation
0,"['animation', 'comedy', 'family']","Led by Woody, Andy's toys live happily in his ...",Toy Story,7.7,5415.0,50,3,0,0,1
1,"['adventure', 'fantasy', 'family']",When siblings Judy and Peter discover an encha...,Jumanji,6.9,2413.0,65,3,0,1,0
2,"['romance', 'comedy']",A family wedding reignites the ancient feud be...,Grumpier Old Men,6.5,92.0,57,2,0,0,0
3,['comedy'],Just when George Banks has recovered from his ...,Father of the Bride Part II,5.7,173.0,56,1,0,0,0
4,"['action', 'crime', 'drama', 'thriller']","Obsessive master thief, Neil McCauley leads a ...",Heat,7.7,1886.0,55,4,1,0,0


## STEP1 // LET'S TOKENIZE THE DATA

In [None]:
# Wrap every synopsis in explicit sentence-boundary markers.
# The guard makes this cell idempotent: running it a second time does not nest
# another '<START> ... <END>' pair around text that is already wrapped.
def _add_boundary_tokens(text):
    """Return `text` wrapped as '<START> text <END>' unless it is already wrapped."""
    if text.startswith('<START> ') and text.endswith(' <END>'):
        return text
    return '<START> '+text+' <END>'

for _frame in (data, train, test):
    _frame['overview_stop'] = [_add_boundary_tokens(a) for a in _frame.overview_stop]

In [None]:
# Word-level tokenisation: fit one vocabulary on the full corpus, then encode each split with it.

size_dict = 10000
liststrings = list(data.overview_stop)

# The filter set leaves '.', "'", '<' and '>' alone, so sentence periods,
# possessives and the <START>/<END> markers survive as tokens.
tokenizer = Tokenizer(
    num_words=size_dict + 1,
    filters='!"#$%&()*+,-/:;=?@[\\]^_`{|}~\t\n',
    split=' ',
    oov_token='<UNK>',
    document_count=0,
)
tokenizer.fit_on_texts(liststrings)

# Integer-encode the whole corpus, the training split and the test split (in that order).
seqtokens, traintokens, testtokens = (
    tokenizer.texts_to_sequences(texts)
    for texts in (liststrings, list(train.overview_stop), list(test.overview_stop))
)

# NOTE(review): Keras' Tokenizer.get_config() appears to return these two entries
# JSON-encoded (str) rather than as dicts - confirm before indexing into them;
# tokenizer.word_counts / tokenizer.word_index hold the actual mappings.
tokenizer_config = tokenizer.get_config()
dict_counts, dict_index = (tokenizer_config[key] for key in ('word_counts', 'word_index'))

In [None]:
# Sanity check: decode the first two encoded synopses back to text.
# Slice before decoding so only two sequences are converted, not the whole corpus.
tokenizer.sequences_to_texts(seqtokens[:2])

["<start> led by woody andy's toys live happily in his room until andy's birthday brings buzz <UNK> onto the scene . afraid of losing his place in andy's heart woody plots against buzz . but when circumstances separate buzz and woody from their owner the duo eventually learns to put aside their differences . <end>",
 "<start> when siblings judy and peter discover an enchanted board game that opens the door to a magical world they unwittingly invite alan an adult who's been trapped inside the game for 26 years into their living room . <UNK> only hope for freedom is to finish the game which proves risky as all three find themselves running from giant <UNK> evil <UNK> and other terrifying creatures . <end>"]

## STEP2 // PREPROCESS THE DATA FOR TRAINING

In [None]:
### WORD-LEVEL LANGUAGE MODEL WITH SEQ TO SEQ STRUCTURE ###
# Script inspired in parts by the CHAR-LEVEL model: lstm_seq2seq.py by fchollet https://github.com/keras-team/keras
# Adapted by: ceciloge@stanford.edu

# PREPPING THE (INPUT GENRE) ENCODER DATA:

# Genre indicator columns, in the fixed order used for the condition vector.
# Declared once so the three splits below cannot drift apart.
genre_columns = ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary', 'drama', 'family',
                 'fantasy','foreign', 'history', 'horror', 'music', 'mystery', 'romance', 'sci_fi',
                 'thriller', 'tv_movie', 'war', 'western']

genredata = np.array(data[genre_columns])
genretrain = np.array(train[genre_columns])
genretest = np.array(test[genre_columns])

m = genretrain.shape[0]           # number of training synopses
num_genres = genretrain.shape[1]  # width of the genre condition vector
print("Size of Training Set: ", m) 
print("Max sequence length for input (genres):", num_genres) #num_encoder_tokens

Size of Training Set:  20277
Max sequence length for input (genres): 20


In [None]:
# Slide a fixed-size window over every training synopsis: each window of `win_len`
# token ids is a decoder input, the token right after it is the target, and the
# synopsis' genre row is repeated once per window as the encoder input.
win_len = 10
tokens = traintokens
genre_cond = [] #ENCODER
x_input = [] #DECODER INPUT
y_next = []  #DECODER OUTPUT

for j, text in enumerate(tokens):
  if j % 5000 == 0:
    print("We're at...", j)
  # Synopses with at most `win_len` tokens contribute no window.
  n_windows = max(len(text) - win_len, 0)
  x_input.extend(text[k:k + win_len] for k in range(n_windows))
  y_next.extend(text[k + win_len] for k in range(n_windows))
  genre_cond.extend(genretrain[j, :] for _ in range(n_windows))

print("Total number of smaller sequences: ", len(x_input))

We're at... 0
We're at... 5000
We're at... 10000
We're at... 15000
We're at... 20000
Total number of smaller sequences:  878744


In [None]:
# Turning our sequences into arrays.
# The windows are equal-length lists of ints and the targets are plain ints, so a
# single np.array call converts each list; wrapping every element in its own
# np.array first only adds overhead and yields the same result.
x_input_array = np.array(x_input) #Decoder Input
# Targets are shifted by -1 so they index the `size_dict` output units of the
# model's final layer; `sample` adds the 1 back when decoding.
y_next_array = np.array(y_next) - 1 #Decoder Output
genre_array = np.array(genre_cond) #Encoder Input
print('x Shape: ',x_input_array.shape)
print('y Shape: ',y_next_array.shape)
print('genre Shape: ',genre_array.shape)

x Shape:  (878744, 10)
y Shape:  (878744,)
genre Shape:  (878744, 20)


## STEP3 // BUILD & TRAIN THE ENCODER-DECODER MODEL

In [None]:

# Start with the ENCODER (input is genretrain)
# The genre vector (length num_genres) is projected to 256 units, the same width
# as the GRU layers below, so it can seed the first GRU's hidden state.
encoder_input = Input(shape=(num_genres,))
encoder = Dense(256, activation = 'relu', kernel_initializer = 'he_normal')(encoder_input)
# The state will then be used as input for the decoder


# Input & Embedding for DECODER 
# The decoder sees a window of win_len token ids; with mask_zero=True, id 0 is
# treated as padding by the embedding.
decoder_input = Input(shape=(win_len,))
h = Embedding(size_dict+1, 64, input_length = win_len, mask_zero=True, name = 'embedding')(decoder_input)

# Three GRU Layers
# Only GRU1 receives the encoder output (as initial_state); GRU3 returns just its
# final step, collapsing the window to a single vector.
h = GRU(256, name = 'GRU1', return_sequences= True, kernel_initializer = 'glorot_normal')(h, initial_state=encoder)
h = GRU(256, name = 'GRU2', return_sequences= True, kernel_initializer = 'glorot_normal')(h)
h = GRU(256, name = 'GRU3', kernel_initializer = 'glorot_normal')(h)

# Final Dense Layers
# The last layer emits size_dict raw scores (linear activation, i.e. logits); the
# training targets are shifted by -1 to index these units.
h = Dense(512, activation = 'relu', name = 'dense1', kernel_initializer = 'he_normal')(h)
h = Dense(2560, activation = 'relu', name = 'dense2', kernel_initializer = 'he_normal')(h)
next_word = Dense(size_dict, activation='linear', name = 'final')(h)

# The model to train:
# Inputs are ordered [genre vector, token window]; every fit/predict call must use that order.
modelgen = Model([encoder_input, decoder_input], next_word)
modelgen.summary()


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 64)       640064      input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 256)          5376        input_1[0][0]                    
_______________________________________________________________________________________

In [None]:
# The model's last layer is linear, so the loss is told it receives raw logits;
# the sparse variant takes integer class ids as targets.
loss = SparseCategoricalCrossentropy(name='sparse_cce', from_logits=True)
opt = Adamax(learning_rate=0.001)
modelgen.compile(optimizer=opt, loss=loss)
print('Ready!')

Ready!


In [None]:
# Resume from the weights saved by an earlier run (the same path save_weights
# writes to below). NOTE(review): on a first-ever run this checkpoint presumably
# does not exist yet - skip this cell in that case.
modelgen.load_weights('/content/drive/My Drive/CS230/Models/encoder')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fdf53e279e8>

In [None]:
# Train on (genre vector, token window) -> next-token class id.
modelgen.fit(
    x=[genre_array, x_input_array],
    y=y_next_array,
    batch_size=256,
    epochs=10,
)

In [None]:
# Persist the trained weights; this overwrites the checkpoint that load_weights reads above.
modelgen.save_weights('/content/drive/My Drive/CS230/Models/encoder')

## STEP 4 // TRY OUR TRAINED MODEL

In [None]:
# Let's choose several seeds:
# Each seed is the opening of a synopsis that the model is asked to continue.
# They are written in lower case without punctuation, like the decoded text the
# tokenizer produces.
string_seed0 = 'alex and max meet at school and discover a'
string_seed1 = 'kian goes on a mission to'
string_seed2 = 'professor andrew goes on a mission to' 
string_seed3 = 'alex is pretty rich and famous but she discovers'
string_seed4 = 'judy is feeling lonely and'
string_seed5 = 'a group of friends decides to go out for'

In [None]:
# Functions to produce text with our Encoder-Decoder Model

every = 5
# Token ids of the words that get special treatment during generation:
# the end marker (stop condition) and two words `sample` suppresses.
end, start, dicke = (
    tokenizer.texts_to_sequences([[word]])[0][0]
    for word in ('<end>', '<start>', "dickens'")
)

# Sampling
def sample(preds, greedy = False):
  """Pick the next token id from the model's output logits.

  Args:
    preds: 1-D array-like of logits, one per output class; class `k` stands for
      token id `k + 1`.
    greedy: if True return the most likely class, otherwise draw one class at
      random from the softmax distribution.

  Returns:
    The chosen token id (class index + 1).
  """
  logits = np.asarray(preds).astype('float64')  # astype copies, so the caller's array is untouched

  # Push the unwanted classes down to the current minimum logit so they are very
  # unlikely to be produced: class 0 (<UNK>), <start> and "dickens'".
  for banned in (0, start-1, dicke-1):
    logits[banned] = np.min(logits)

  # Numerically stable softmax.
  logits = logits-np.max(logits)
  probs = np.exp(logits)
  probs = probs/np.sum(probs)

  if greedy:
    # max(1, ...) keeps class 0 from being returned even when it ties for the maximum.
    out = max(1, np.argmax(probs))
  else:
    # Draw only when sampling is requested; greedy decoding needs no random draw.
    samp = np.random.multinomial(1, probs, 1)
    out = np.argmax(samp)
  return out+1

# Genre Utils
# Genre names in the same order as the genre columns fed to the encoder,
# and the reverse lookup from name to its position in the condition vector.
listgenre = [
    'action', 'adventure', 'animation', 'comedy', 'crime',
    'documentary', 'drama', 'family', 'fantasy', 'foreign',
    'history', 'horror', 'music', 'mystery', 'romance',
    'sci_fi', 'thriller', 'tv_movie', 'war', 'western',
]
dictgenre = {name: position for position, name in enumerate(listgenre)}

# Generate Text
def model_generate_text(model = modelgen, seed = string_seed1, genre = ['drama'], length = 100, greedy = False, verbose = True):
    """Generate a continuation of `seed`, conditioned on the given genres.

    Args:
        model: model taking `[genre_vector, token_window]` and returning
            next-word logits.
        seed: opening words of the synopsis; '<START> ' is prepended before
            tokenising.
        genre: iterable of genre names (keys of `dictgenre`) to switch on in
            the condition vector.
        length: maximum number of tokens to generate; generation stops earlier
            once the end marker is produced.
        greedy: forwarded to `sample` (argmax instead of random sampling).
        verbose: print the resulting text when True.

    Returns:
        (text, token_ids): the seed followed by the decoded generated tokens,
        and the list of generated token ids.

    Raises:
        KeyError: if a name in `genre` is not a key of `dictgenre`.
    """
    prompt = '<START> '+seed
    output = []
    # Fit the tokenised prompt to the decoder's window length.
    x_in = pad_sequences(tokenizer.texts_to_sequences([prompt]), maxlen=win_len)

    # Condition vector with a 1 for every requested genre.
    genre_in = np.zeros((1, len(listgenre)))
    for g in genre:
        genre_in[:, dictgenre[g]] = 1

    token = -1
    count = 0
    while token != end and count < length:
        preds = model.predict([genre_in, x_in], verbose=0)[0]
        token = sample(preds, greedy=greedy)
        # Slide the window: drop the oldest token, append the new one.
        x_in = np.append(x_in[:,1:], token).reshape((1, win_len))
        output.append(token)
        count += 1

    text = str(seed+" "+tokenizer.sequences_to_texts([output])[0])
    if verbose: print(text)
    return text, output

In [None]:
# Now let's produce text with our fully trained model:
# the same seed is continued under three different genre conditions.

seed1 = string_seed5
print("_" * 20 + " RANDOM " + "_" * 20)
print(" ")
(text1, output1), (text3, output3), (text4, output4) = [
    model_generate_text(model=modelgen, seed=seed1, genre=genres, length=65)
    for genres in (['romance'], ['mystery', 'action'], ['thriller', 'crime'])
]
print(" ")
print("_" * 50)
print(" ")

____________________ RANDOM ____________________
 
a group of friends decides to go out for another motorcycle island by his apartment albert starts an adventurous journey to discover his identity . <end>
a group of friends decides to go out for another money first after another on the case may have been invaded by a werewolf . when a team of murderous thieves are orphaned . . . . . eric's any kingdom . lucy has just all to know his attention and the most gorgeous professional girl jimmy has not using her and say that she is responsible for caring them for help . after
a group of friends decides to go out for dinner . sneak by the drug the local sheriff sam gen . for the town dr . alex jo and molly try to talk about it all . <end>
 
__________________________________________________
 


## STEP 5 // EVALUATE RESULTS

In [None]:
# Load the Universal Sentence Encoder (v4) from TF Hub; its embeddings are used
# for the cosine-similarity metric in the evaluation below.
# The model is fetched from the URL, so this cell needs network access.
import tensorflow_hub as hub
univ_embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
# Generate Text for the evaluation process
def eval_generate(seed, genre, length, greedy = True):
    """Generate exactly `length` tokens after `seed` with the global `modelgen`.

    Unlike `model_generate_text`, generation does not stop at the end marker,
    and no '<START> ' is prepended to the seed.

    Args:
        seed: text to continue.
        genre: genre condition array passed to the model as-is.
        length: number of tokens to generate; nothing is generated if <= 0.
        greedy: forwarded to `sample`; defaults to greedy decoding.

    Returns:
        (text, token_ids): the seed followed by the decoded generated tokens,
        and the list of generated token ids.
    """
    output = []
    # Fit the tokenised seed to the decoder's window length.
    x_in = pad_sequences(tokenizer.texts_to_sequences([seed]), maxlen=win_len)
    genre_in = genre

    count = 0
    while count < length:
        preds = modelgen.predict([genre_in, x_in], verbose=0)[0]
        token = sample(preds, greedy=greedy)
        # Slide the window: drop the oldest token, append the new one.
        x_in = np.append(x_in[:,1:], token).reshape((1, win_len))
        output.append(token)
        count += 1
    return str(seed+" "+tokenizer.sequences_to_texts([output])[0]), output

In [None]:
# EVALUATE BLEU & COSINE SIMILARITY ON THE TEST SET

def cosine_sim(x,y):
    """Cosine similarity between two array-likes, treated as flat vectors.

    Args:
        x, y: array-likes of matching (or broadcastable) shape.

    Returns:
        sum(x*y) / (||x|| * ||y||), or 0.0 when either input has zero norm
        (instead of dividing by zero and producing NaN).
    """
    x = np.asarray(x)
    y = np.asarray(y)
    num = np.sum(x*y)
    den = np.sqrt(np.sum(x**2))*np.sqrt(np.sum(y**2))
    if den == 0:
        # A zero vector has no direction; report "no similarity" rather than NaN,
        # which would otherwise poison any mean taken over the scores.
        return 0.0
    return num/float(den)

def evaluate_gen(greedy = True):
  """Score the model on the test set with BLEU and embedding cosine similarity.

  For each test synopsis the first `seed_len` tokens are given to the model as the
  seed; the model then generates as many tokens as the reference continuation has,
  and the two continuations are compared. Synopses with no tokens after the seed
  are skipped, since there is nothing to compare.

  Args:
    greedy: NOTE - currently not forwarded; `eval_generate` is called without a
      greedy argument, so its own default decoding mode applies.

  Returns:
    (bleu_score, sim_score): means of the per-synopsis BLEU and cosine similarity.
  """
  seed_len = 10                            # tokens of each synopsis handed to the model as the seed
  smoothing = SmoothingFunction().method2  # loop-invariant, so built once
  bleu = []
  sim = []

  for j, synopsis in enumerate(testtokens):
    #From test set:
    seed_text = tokenizer.sequences_to_texts([synopsis[:seed_len]])[0]
    output_test = synopsis[seed_len:]

    if output_test:
      output_test_string = tokenizer.sequences_to_texts([output_test])[0]
      output_test_list = [tokenizer.sequences_to_texts([[i]])[0] for i in output_test]
      emb_test = univ_embed([output_test_string])

      #From model: generate a continuation of the same length as the reference
      _, output = eval_generate(genre = genretest[j,:].reshape(1,20), seed = seed_text, length = len(output_test))
      output_string = tokenizer.sequences_to_texts([output])[0]
      output_list = [tokenizer.sequences_to_texts([[i]])[0] for i in output]
      emb = univ_embed([output_string])

      #Similarity Scores:
      bleu.append(sentence_bleu([output_test_list], output_list, smoothing_function=smoothing))
      sim.append(cosine_sim(emb, emb_test))

    if j%100 == 0:
      print(j, " done!")

  bleu_score = np.mean(bleu)
  sim_score = np.mean(sim)
  print("BLEU: ", bleu_score)
  print("COS. SIMILARITY: ", sim_score)
  return bleu_score, sim_score

In [None]:
# Run the evaluation, then report both metrics rounded to four decimals.
print("Encoder-Decoder Performance:")
bleu_score, sim_score = evaluate_gen()

print("----------- Metrics -----------")
for label, value in (("BLEU: ", bleu_score), ("COS. SIMILARITY: ", sim_score)):
    print(label, np.round(value, 4))
print(" ")

Encoder-Decoder Performance:
0  done!
100  done!
200  done!
300  done!
400  done!
BLEU:  0.06268873925101057
COS. SIMILARITY:  0.2592567894944022
----------- Metrics -----------
BLEU:  0.0627
COS. SIMILARITY:  0.2593
 
