# Conditional Encoder-Decoder Model


*   STEP 1 TOKENIZE THE DATA
*   STEP 2 PREPROCESS THE DATA FOR TRAINING
*   STEP 3 BUILD & TRAIN THE ENCODER-DECODER MODEL
*   STEP 4 LET'S TRY OUR TRAINED MODEL!
*   STEP 5 EVALUATE RESULTS (BLEU & UNIVERSAL COSINE SIMILARITY)



In [None]:
# Mount Google Drive into the Colab runtime; the CSV files and model
# checkpoints used below are read from paths under this mount point.
from google.colab import drive

drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Mounted at /content/drive


## STEP 0 // IMPORT OUR STUFF

In [None]:
# !pip install numpy
import pandas as pd
import numpy as np
import ast
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import compress

import random
import sys
import io

In [None]:
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Masking, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import GRU, LSTM, Bidirectional
from tensorflow.keras.layers import Conv1D, Activation, Multiply
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint
from tensorflow.keras.optimizers import RMSprop, Adam, Adamax
from tensorflow.keras import activations

from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [None]:
import tensorflow_hub as hub

# Universal Sentence Encoder (v4) loaded from TF Hub. It is called in the
# evaluation step to embed reference and generated text before comparing them.
use_module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
univ_embed = hub.load(use_module_url)

In [None]:
# Read the three CSV files from Drive: the full dataset plus the train and
# test files. Order of the paths matches the order of the target names.
data, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/CS230/finaldata.csv',
        '/content/drive/My Drive/CS230/finaldata_train.csv',
        '/content/drive/My Drive/CS230/finaldata_test.csv',
    )
)

In [None]:
# Peek at the first 5 rows and first 10 columns only (the frame is wide).
data.iloc[:5, :10]

Unnamed: 0,genres,overview,title,vote_average,vote_count,length,num_genres,action,adventure,animation
0,"['animation', 'comedy', 'family']","Led by Woody, Andy's toys live happily in his ...",Toy Story,7.7,5415.0,50,3,0,0,1
1,"['adventure', 'fantasy', 'family']",When siblings Judy and Peter discover an encha...,Jumanji,6.9,2413.0,65,3,0,1,0
2,"['romance', 'comedy']",A family wedding reignites the ancient feud be...,Grumpier Old Men,6.5,92.0,57,2,0,0,0
3,['comedy'],Just when George Banks has recovered from his ...,Father of the Bride Part II,5.7,173.0,56,1,0,0,0
4,"['action', 'crime', 'drama', 'thriller']","Obsessive master thief, Neil McCauley leads a ...",Heat,7.7,1886.0,55,4,1,0,0


## STEP 1 // LET'S TOKENIZE THE DATA

In [None]:
def _wrap_with_boundary_tokens(texts):
    """Return `texts` with '<START> ' prepended and ' <END>' appended to each item.

    Items that already carry both markers are returned unchanged, so running
    this cell more than once does not stack duplicate markers on the text.
    """
    return [
        a if (a.startswith('<START> ') and a.endswith(' <END>'))
        else '<START> ' + a + ' <END>'
        for a in texts
    ]


# Mark the beginning and end of every synopsis in all three frames.
for _frame in (data, train, test):
    _frame['overview_stop'] = _wrap_with_boundary_tokens(_frame.overview_stop)

In [None]:
# Tokenizing / Create a Tokenizer object
#
# The vocabulary is fitted on the full dataset (`data`), then the same
# tokenizer encodes the full, train and test texts.
# The custom `filters` string leaves out '.', '<' and '>' so that sentence
# periods and the <START>/<END> markers survive as tokens.

size_dict = 9000
liststrings = data.overview_stop.tolist()

tokenizer = Tokenizer(
    num_words=size_dict + 1,
    filters='!"#$%&()*+,-/:;=?@[\\]^_`{|}~\t\n',
    split=' ',
    oov_token='<UNK>',
    document_count=0,
)
tokenizer.fit_on_texts(liststrings)

# One list of integer id sequences per corpus, in the order of the names.
seqtokens, traintokens, testtokens = (
    tokenizer.texts_to_sequences(texts)
    for texts in (
        liststrings,
        train.overview_stop.tolist(),
        test.overview_stop.tolist(),
    )
)

# NOTE(review): Tokenizer.get_config() serialises its vocabularies with
# json.dumps, so these two values are most likely JSON *strings*, not dicts -
# confirm before indexing into them (tokenizer.word_counts / .word_index are
# the live mappings).
tokenizer_config = tokenizer.get_config()
dict_counts = tokenizer_config['word_counts']
dict_index = tokenizer_config['word_index']

In [None]:
# Sanity check: decode the first two encoded synopses back to text.
tokenizer.sequences_to_texts(seqtokens[:2])

["<start> led by woody andy's toys live happily in his room until andy's birthday brings buzz <UNK> onto the scene . afraid of losing his place in andy's heart woody plots against buzz . but when circumstances separate buzz and woody from their owner the duo eventually learns to put aside their differences . <end>",
 "<start> when siblings judy and peter discover an enchanted board game that opens the door to a magical world they unwittingly invite alan an adult who's been trapped inside the game for 26 years into their living room . <UNK> only hope for freedom is to finish the game which proves risky as all three find themselves running from giant <UNK> evil <UNK> and other terrifying creatures . <end>"]

## STEP 2 // PREPROCESS THE DATA FOR TRAINING

In [None]:
### WORD-LEVEL LANGUAGE MODEL WITH SEQ TO SEQ STRUCTURE ###
# Script inspired in parts by the CHAR-LEVEL model: lstm_seq2seq.py by fchollet https://github.com/keras-team/keras
# Adapted by: ceciloge@stanford.edu

# PREPPING THE (INPUT GENRE) ENCODER DATA:
# One column per genre; the same column order is used for all three frames
# so that the resulting matrices line up feature by feature.
genre_columns = [
    'action', 'adventure', 'animation', 'comedy', 'crime', 'documentary', 'drama', 'family',
    'fantasy', 'foreign', 'history', 'horror', 'music', 'mystery', 'romance', 'sci_fi',
    'thriller', 'tv_movie', 'war', 'western',
]

genredata, genretrain, genretest = (
    np.array(frame[genre_columns]) for frame in (data, train, test)
)

# Rows = training examples, columns = genre features.
m, num_genres = genretrain.shape
print("Size of Training Set: ", m)
print("Max sequence length for input (genres):", num_genres) #num_encoder_tokens

Size of Training Set:  20277
Max sequence length for input (genres): 20


In [None]:
# Slice every training synopsis into overlapping windows of `win_len` tokens
# (decoder input), each paired with the token that follows the window (target)
# and with that synopsis' genre row (encoder input).
# A synopsis with `win_len` tokens or fewer contributes no windows.
win_len = 10
tokens = traintokens
genre_cond = [] #ENCODER
x_input = [] #DECODER INPUT
y_next = []  #DECODER OUTPUT

for j, text in enumerate(tokens):
  if j%5000 == 0: print("We're at...", j)
  n_windows = max(len(text) - win_len, 0)
  # Window k covers text[k : k + win_len]; its target is text[k + win_len],
  # i.e. the targets are exactly the tail text[win_len:].
  x_input.extend(text[start:start+win_len] for start in range(n_windows))
  y_next.extend(text[win_len:])
  genre_cond.extend(genretrain[j,:] for _ in range(n_windows))

print("Total number of smaller sequences: ", len(x_input))

We're at... 0
We're at... 5000
We're at... 10000
We're at... 15000
We're at... 20000
Total number of smaller sequences:  878744


In [None]:
# Turning our sequences into arrays
x_input_array = np.array(x_input) #Decoder Input
# Targets are shifted down by one, so the smallest token id (1) becomes class 0.
# NOTE(review): anything that turns a predicted class back into a token id
# presumably has to add the 1 back - verify in the sampling helper.
y_next_array = np.array(y_next) - 1 #Decoder Output
genre_array = np.array(genre_cond) #Encoder Input

for label, arr in (('x Shape: ', x_input_array),
                   ('y Shape: ', y_next_array),
                   ('genre Shape: ', genre_array)):
  print(label, arr.shape)

x Shape:  (878744, 10)
y Shape:  (878744,)
genre Shape:  (878744, 20)


## STEP 3 // BUILD & TRAIN THE ENCODER-DECODER MODEL

In [None]:

# Conditional next-word model: a genre vector (encoder side) conditions a
# stack of GRUs that reads a window of `win_len` token ids (decoder side) and
# scores the next token.

# Start with the ENCODER (input is genretrain)
encoder_input = Input(shape=(num_genres,))
# Project the genre vector to 256 values (no activation) - the same width as
# GRU1 below, because it is handed to GRU1 as its initial state.
encoder = Dense(256)(encoder_input)
# The state will then be used as input for the decoder


# Input & Embedding for DECODER
decoder_input = Input(shape=(win_len,))
# Vocabulary of size_dict+1 ids embedded into 64 dimensions; mask_zero=True
# makes id 0 a masked (padding) position for the recurrent layers.
h = Embedding(size_dict+1, 64, input_length = win_len, mask_zero=True, name = 'embedding')(decoder_input)

# Three GRU Layers
# Only the first GRU is seeded with the encoder output; the first two return
# the full sequence so the next GRU can consume it, the third returns only
# its final output.
h = GRU(256, name = 'GRU1', return_sequences= True)(h, initial_state=encoder)
h = GRU(256, name = 'GRU2', return_sequences= True)(h)
h = GRU(256, name = 'GRU3')(h)

# Final Dense Layers
h = Dense(512, activation = 'relu', name = 'dense1')(h)
h = Dense(2560, activation = 'relu', name = 'dense2')(h)
# One unnormalised score per class (linear activation, size_dict classes);
# the loss used at compile time is configured with from_logits=True.
next_word = Dense(size_dict, activation='linear', name = 'final')(h)

# The model to train:
# Input order matters when calling fit/predict: [genre vector, token window].
modelgen = Model([encoder_input, decoder_input], next_word)
modelgen.summary()


Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 64)       576064      input_6[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 256)          5376        input_5[0][0]                    
_______________________________________________________________________________________

In [None]:
# Training setup. The model's final layer is linear, so the loss is told to
# expect raw logits; targets are integer class ids (sparse labels).
loss = SparseCategoricalCrossentropy(name='sparse_cce', from_logits=True)
opt = Adamax(learning_rate=0.01)
modelgen.compile(optimizer=opt, loss=loss)
print('Ready!')

Ready!


In [None]:
# Resume from previously saved weights when they exist.
# The checkpoint at this path is only produced by the save_weights cell that
# runs after training, so on a fresh environment there is nothing to load yet;
# in that case training simply starts from the freshly initialised model
# instead of failing here.
weights_path = '/content/drive/My Drive/CS230/Models/encoder2'
# A TensorFlow-format checkpoint saved under prefix P includes a 'P.index' file.
if tf.io.gfile.exists(weights_path + '.index'):
    modelgen.load_weights(weights_path)
else:
    print('No checkpoint found at', weights_path, '- training from scratch.')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7ff66726ac50>

In [None]:
# Train: inputs are [genre vectors, token windows] (same order as the model's
# inputs), targets are the next-token class ids.
train_inputs = [genre_array, x_input_array]
modelgen.fit(train_inputs, y_next_array, epochs=30, batch_size=256)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ff6671ba518>

In [None]:
# Persist the trained weights to Drive (same path the load cell reads from).
weights_out_path = '/content/drive/My Drive/CS230/Models/encoder2'
modelgen.save_weights(weights_out_path)

In [None]:
# Functions to produce text

every = 5

def generate_text2(epoch, _):
  """Epoch-end hook that prints a 60-token sample every `every` epochs.

  Starts from a fixed seed sentence, repeatedly asks the model for the next
  token, and slides the input window forward by one token each step.

  Args:
    epoch: epoch index supplied by the callback machinery.
    _: second callback argument; ignored.

  NOTE(review): `gen_model2` and `sample` are not defined in the cells shown
  here - they must exist in the kernel before this runs. Confirm.
  """
  if epoch%every != 0:
    return

  print()
  seed_text = 'when siblings judy and peter discover a board game that'
  print('Seed: "' + seed_text + '"')

  window = np.array(tokenizer.texts_to_sequences([seed_text])[0])
  generated = []
  for _step in range(60):
    scores = gen_model2.predict(window.reshape((1,window.shape[0])), verbose=0)[0]
    next_idx = sample(scores)
    # Drop the oldest token and append the new one, keeping the window length fixed.
    window = np.append(window[1:],next_idx)
    generated.append(next_idx)

  print(seed_text+" "+tokenizer.sequences_to_texts([generated])[0])


# Defining our callbacks - reusing code from previous model:
# Save the model weights to 'model2' at the end of every epoch.
# (`save_freq` is the ModelCheckpoint argument that controls this; the
# previous `frequency` keyword is not a ModelCheckpoint parameter.)
checkpoint2 = ModelCheckpoint(filepath='model2',
                             save_freq = "epoch",
                             save_weights_only = True,
                             verbose = 0)

# Print a text sample at the end of epochs (see generate_text2).
gen_callback2 = LambdaCallback(on_epoch_end=generate_text2)

## STEP 4 // TRY OUR TRAINED MODEL

In [None]:
# Let's choose several seeds:
# Each seed is exactly 10 space-separated words, i.e. one full input window
# for a model trained with a window length of 10.

string_seed0 = 'when siblings judy and peter discover a board game that'
string_seed1 = 'andy and judy met a year ago for the first'
string_seed2 = 'alex is willing to accept a new mission to save'
string_seed3 = 'led by alex the gang decides to take on a'
string_seed4 = 'judy is a young woman who is starting to feel'
string_seed5 = 'a group of friends decide to go out for a'
# "No real seed": ten copies of the literal '<UNK>' marker.
string_seed6 = '<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>'

def model_generate_text2(model, seed = string_seed1, length = 100, greedy = False, verbose = True, genre = None):
    """Generate `length` tokens from `model`, starting from the text `seed`.

    The seed is tokenized into the initial input window. At every step the
    model scores the next token, `sample` picks one, and the window slides
    forward by one token (oldest token dropped, new token appended).

    Args:
        model: Keras model used for prediction.
        seed: seed text; its token count becomes the window length fed to the model.
        length: number of tokens to generate.
        greedy: forwarded to `sample` as its `greedy` argument.
        verbose: when True, print the seed followed by the generated text.
        genre: optional genre vector. When given, the model is called with two
            inputs, [genre, token window], which is the input order of the
            conditional encoder-decoder model. When None (default), only the
            token window is passed, as before.

    Returns:
        (text, output): the seed plus generated text as one string, and the
        list of generated token indices as returned by `sample`.
    """
    window = np.array(tokenizer.texts_to_sequences([seed])[0])
    # Batch of one genre vector, shaped (1, n_genres), or None when unconditioned.
    genre_batch = None if genre is None else np.asarray(genre).reshape((1, -1))

    output = []
    for _ in range(length):
        token_batch = window.reshape((1, window.shape[0]))
        model_input = token_batch if genre_batch is None else [genre_batch, token_batch]
        preds = model.predict(model_input, verbose=0)[0]
        next_idx = sample(preds, greedy = greedy)
        window = np.append(window[1:], next_idx)
        output.append(next_idx)

    # Decode once and reuse for both printing and the return value.
    text = seed + " " + tokenizer.sequences_to_texts([output])[0]
    if verbose: print(text)
    return text, output

In [None]:
# Now let's produce text with our fully trained model:

def _generate(seed, greedy = False):
    # 65-token continuation from gen_model2; the call itself prints the text.
    return model_generate_text2(model = gen_model2, seed = seed, length = 65, greedy = greedy)

body_seeds = (string_seed1, string_seed2, string_seed3, string_seed4, string_seed5)

# Seed 0 is generated twice (greedy, then sampled); the second result
# overwrites text0/output0, both runs are printed.
text0, output0 = _generate(string_seed0, greedy = True)
text0, output0 = _generate(string_seed0, greedy = False)
print(20*"_"+" RANDOM " + 20*"_")
print(" ")
(text1, output1), (text2, output2), (text3, output3), (text4, output4), (text5, output5) = [
    _generate(seed) for seed in body_seeds
]
print(" ")
print(50*"_")
print(" ")
print(20*"_"+" GREEDY " + 20*"_")
print(" ")
(text6, output6), (text7, output7), (text8, output8), (text9, output9), (text10, output10) = [
    _generate(seed, greedy = True) for seed in body_seeds
]
print(" ")
print(20*"_"+" NO REAL SEED " + 20*"_")
print(" ")
# Two sampled runs from the all-<UNK> seed; only the second is kept in text11/output11.
text11, output11 = _generate(string_seed6, greedy = False)
text11, output11 = _generate(string_seed6, greedy = False)
print(" ")
print(50*"_")

## STEP 5 // EVALUATE RESULTS

In [None]:
# EVALUATE BLEU & COSINE SIMILARITY ON THE TEST SET

def cosine_sim(x,y):
    """Cosine similarity between two vectors (or same-shaped arrays).

    Args:
        x, y: array-likes (NumPy arrays, lists, or anything np.asarray can
            convert) holding the two vectors.

    Returns:
        float: sum(x*y) / (||x|| * ||y||). If either input has zero norm the
        similarity is undefined; 0.0 is returned instead of nan so that a
        single degenerate pair cannot turn an averaged score into nan.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    den = np.sqrt(np.sum(x**2))*np.sqrt(np.sum(y**2))
    if den == 0:
        return 0.0
    return float(np.sum(x*y)/den)

def evaluate_gen(version, greedy = True, n_samples = 250, seed_len = 10):
  """Score generated continuations against test synopses (BLEU and cosine similarity).

  For each of the first `n_samples` tokenized test synopses, the first
  `seed_len` tokens are decoded to text and used as the seed; the model then
  generates as many tokens as the synopsis has left. The generated tokens are
  compared with the true remainder using smoothed sentence-level BLEU and the
  cosine similarity of their `univ_embed` embeddings.

  Synopses with `seed_len` tokens or fewer are skipped: they have no reference
  continuation, and scoring two empty strings would add a BLEU of 0 and a
  similarity of 1 that say nothing about the model.

  Args:
    version: 1 selects `gen_model` / `model_generate_text`; any other value
      selects `gen_model2` / `model_generate_text2`.
    greedy: forwarded to the generation function.
    n_samples: how many test synopses to consider (default 250).
    seed_len: number of leading tokens used as the seed (default 10); should
      match the window length the model was trained with.

  Returns:
    (bleu_score, sim_score): mean BLEU and mean cosine similarity over the
    scored synopses, or (nan, nan) when none could be scored.
  """
  if version == 1:
    model = gen_model
    fun = model_generate_text
  else:
    model = gen_model2
    fun = model_generate_text2

  # Same smoothing method for every sentence; build it once.
  smoothing = SmoothingFunction().method2
  bleu = []
  sim = []

  for j, synopsis in enumerate(testtokens[:n_samples]):
    n_target = len(synopsis) - seed_len
    if n_target > 0:
      #From test set:
      seed_text = tokenizer.sequences_to_texts([synopsis[:seed_len]])[0]
      output_test = synopsis[seed_len:]
      output_test_string = tokenizer.sequences_to_texts([output_test])[0]
      output_test_list = [tokenizer.sequences_to_texts([[i]])[0] for i in output_test]
      emb_test = univ_embed([output_test_string])

      #From model:
      _, output = fun(model = model, seed = seed_text, length = n_target, greedy = greedy, verbose = False)
      output_string = tokenizer.sequences_to_texts([output])[0]
      output_list = [tokenizer.sequences_to_texts([[i]])[0] for i in output]
      emb = univ_embed([output_string])

      #Similarity Scores:
      bleu.append(sentence_bleu([output_test_list], output_list, smoothing_function=smoothing))
      sim.append(cosine_sim(emb, emb_test))

    if j%20 == 0:
      print(j, " done!")

  if not bleu:
    # Nothing was long enough to score; avoid averaging an empty list.
    print("No test synopsis longer than the seed - nothing to score.")
    return float('nan'), float('nan')

  bleu_score = np.mean(bleu)
  sim_score = np.mean(sim)
  print("BLEU: ", bleu_score)
  print("COS. SIMILARITY: ", sim_score)
  return bleu_score, sim_score