# Model Demo (Bidirectional Model & Encoder-Decoder)


*   STEP 1 IMPORT MODEL & UTILS
*   STEP 2 DEMO



In [None]:
from google.colab import drive
drive.mount("/content/drive")

## STEP 0 // IMPORT OUR STUFF

In [2]:
# !pip install numpy
import pandas as pd
import numpy as np
import ast
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import compress

import random
import sys
import io

In [3]:
!pip install colorama
from colorama import Fore



In [4]:
import tensorflow as tf

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Concatenate, Masking, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Layer, InputSpec
from tensorflow.keras.layers import GRU, LSTM, Bidirectional, Dot, Permute
from tensorflow.keras.layers import Conv1D, Activation, Multiply, Flatten, BatchNormalization, Add
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint
from tensorflow.keras.optimizers import RMSprop, Adam, Adamax
from tensorflow.keras import activations

from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [5]:
data = pd.read_csv('/content/drive/My Drive/CS230/finaldata.csv')
data['overview_stop']=['<START> '+a+' <END>' for a in data.overview_stop]

## STEP 1 // IMPORT MODEL & UTILS

In [6]:
# TOKENIZE
liststrings = list(data.overview_stop)
size_dict = 10000
tokenizer = Tokenizer(num_words= size_dict+1, 
                      filters='!"#$%&()*+,-/:;=?@[\\]^_`{|}~\t\n', 
                      split=' ', 
                      oov_token='<UNK>',
                      document_count=0)


tokenizer.fit_on_texts(liststrings) 
tokenizer_config = tokenizer.get_config()
dict_counts = tokenizer_config['word_counts']
dict_index = tokenizer_config['word_index'] 

# UTILS
win_len = 10

In [7]:
model = load_model('/content/drive/My Drive/CS230/Models/bidirectional.h5')



In [9]:
encod_model = load_model('/content/drive/My Drive/CS230/Models/encoder.h5')



In [10]:
# DEMO FUNCTIONS FOR CHOOSE YOUR OWN ADVENTURE

end = tokenizer.texts_to_sequences([['<end>']])[0][0] 
start = tokenizer.texts_to_sequences([['<start>']])[0][0] 
dicke = tokenizer.texts_to_sequences([["dickens'"]])[0][0]

def top_n(preds, n):
  preds = np.asarray(preds).astype('float64')
  #print(preds)
  preds[0] = np.min(preds) # Minimize prob of <UNK> from being generated as it is not useful
  preds[start-1] = np.min(preds) # Minimize prob of <start> from being generated as it is not useful
  preds[dicke-1] = np.min(preds) # Minimize prob of "dicken'" from being generated as it is not useful
  preds = preds-np.max(preds)
  preds = np.exp(preds)
  preds = preds/np.sum(preds)

  out = np.argpartition(preds, -n)[-n:]
  return out+1

def model_next(start = True, seed = 'we have come to think that'):
    if start: seed = '<START> '+seed
    print(Fore.LIGHTBLUE_EX + seed)
    x_in = pad_sequences(tokenizer.texts_to_sequences([seed]), truncating='pre', maxlen=10)
    
    preds = model.predict(x_in, verbose=0)[0]
    options = top_n(preds, 5)
    a = tokenizer.sequences_to_texts([[options[0]]])[0]
    b = tokenizer.sequences_to_texts([[options[1]]])[0]
    c = tokenizer.sequences_to_texts([[options[2]]])[0]
    d = tokenizer.sequences_to_texts([[options[3]]])[0]
    e = tokenizer.sequences_to_texts([[options[4]]])[0]
    
    print(Fore.BLACK + 'Options: 1.{} , 2.{} , 3.{} , 4.{}, 5.{} '.format(a,b,c,d,e))
    print(Fore.BLACK + "Select your next word! (input the number)")

    choice = input()
    choice = int(choice)
    newseed = seed+" "+tokenizer.sequences_to_texts([[options[choice-1]]])[0]


    
    return newseed, options[choice-1]

In [11]:
# DEMO FUNCTIONS FOR FULL SAMPLES

def sample(preds, greedy = False):
  # This first function samples the next word based on the output logits from the model
  preds = np.asarray(preds).astype('float64')
  #print(preds)
  preds[0] = np.min(preds) # Minimize prob of <UNK> from being generated as it is not useful
  preds[start-1] = np.min(preds) # Minimize prob of <start> from being generated as it is not useful
  preds[dicke-1] = np.min(preds) # Minimize prob of "dicken'" from being generated as it is not useful
  preds = preds-np.max(preds)
  preds = np.exp(preds)
  preds = preds/np.sum(preds)
  samp = np.random.multinomial(1, preds, 1)
  if greedy: 
    out = max(1, np.argmax(preds))
  else: 
    out = np.argmax(samp)
  return out+1


def model_generate_text(seed, length = 35, greedy = False, verbose = True, endtok=True):
    # This function generates text from a given seed - works like the callback function.
    input = '<START> '+seed
    output = []
    
    x_in = pad_sequences(tokenizer.texts_to_sequences([input]), maxlen=10)

    i = -1
    count = 0
    if endtok == True:
      while i != end and count <length:     
        preds = model.predict(x_in, verbose=0)[0]
        i = sample(preds, greedy=greedy)
        x_in = np.append(x_in[:,1:],i).reshape((1,10))
        output.append(i)
        count +=1
    else:
      while count <length:     
        preds = model.predict(x_in, verbose=0)[0]
        i = sample(preds, greedy=greedy)
        x_in = np.append(x_in[:,1:],i).reshape((1,10))
        output.append(i)
        count +=1
    if verbose: print(seed+" "+tokenizer.sequences_to_texts([output])[0])
    return str(seed+" "+tokenizer.sequences_to_texts([output])[0]), output

# Genre Utils
listgenre = ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy','foreign', 
             'history', 'horror', 'music', 'mystery', 'romance', 'sci_fi','thriller', 'tv_movie', 'war', 'western']
dictgenre = {listgenre[i] : i for i in range(len(listgenre))}

def genre_generate(seed, genre = ['drama'], length = 35, greedy = False, verbose = True):
    input = '<START> '+seed
    output = []
    x_in = pad_sequences(tokenizer.texts_to_sequences([input]), maxlen=10)
    genre_in = np.zeros((1,20))
    for g in genre:
      i = dictgenre[g]
      genre_in[:,i] = 1

    i = -1
    count = 0
    v = 0
    while i != end and count <length:     
      preds = encod_model.predict([genre_in, x_in], verbose=0)[0]
      i = sample(preds, greedy=greedy)
      x_in = np.append(x_in[:,1:],i).reshape((1,10))
      output.append(i)
      count +=1
    if verbose: print(seed+" "+tokenizer.sequences_to_texts([output])[0])
    return str(seed+" "+tokenizer.sequences_to_texts([output])[0]), output

def generate_n(seed,n):
  print("ADVENTURE:")
  genre_generate(seed, genre = ['adventure'], greedy = False)
  print("FANTASY:")
  genre_generate(seed, genre = ['fantasy'], greedy = False)
  print("THRILLER CRIME:")
  genre_generate(seed, genre = ['thriller','crime'], greedy = False)
  print("RANDOM:")
  for i in range(n-1):
    model_generate_text(seed)
  model_generate_text(seed, greedy=True)

In [12]:
def fun_demo_start():
  print("What's your seed?")
  text_input = input()
  return text_input.lower()


## STEP 2 // IT'S DEMO TIME!

In [13]:
# DEMO FULL SAMPLE / RANDOM & BY GENRE
seed = fun_demo_start()
generate_n(seed, 5)


What's your seed?
students are ready to play
ADVENTURE:
students are ready to play near and down the monastery . the journalist covers jim help eve marries nick a corrupt female coach living in romania dead he has hired the outside of the world stolen of the imminent girl's
FANTASY:
students are ready to play within the yellow boat . <end>
THRILLER CRIME:
students are ready to play right on or better success and was brought any mark that some people kill it . <end>
RANDOM:
students are ready to play their life's goal . when they explore a new world war one day they discover clues to an odd relationship with the attorney and eventually being sent to devil's springs . steve and the girls'
students are ready to play their blood at an underground military college politics . <end>
students are ready to play their goal before recorded through the world's part and early music police department to the point of view of drugs rather . <end>
students are ready to play the activities and fight . .

In [None]:
# DEMO WORD BY WORD "CHOOSE YOUR OWN ADVENTURE" / RANDOM
seed = fun_demo_start()
seed, a = model_next(start = True, seed = seed)
while a != end:
  seed, a =  model_next(start = False, seed = seed)

print(Fore.LIGHTBLUE_EX + "Well done! Here is your story:")
print(seed)

What's your seed?
cs230 students are meeting at school today to study
[94m<START> cs230 students are meeting at school today to study
[30mOptions: 1.king , 2.destruction , 3.robots , 4.rush, 5.various 
[30mSelect your next word! (input the number)
5
[94m<START> cs230 students are meeting at school today to study various
[30mOptions: 1.images , 2.stories , 3.unseen , 4.creatures, 5.college 
[30mSelect your next word! (input the number)
4
[94m<START> cs230 students are meeting at school today to study various creatures
[30mOptions: 1.she , 2.from , 3.they , 4.in, 5.living 
[30mSelect your next word! (input the number)
5
[94m<START> cs230 students are meeting at school today to study various creatures living
[30mOptions: 1.from , 2.life , 3.work , 4.large, 5.in 
[30mSelect your next word! (input the number)
5
[94m<START> cs230 students are meeting at school today to study various creatures living in
[30mOptions: 1.new , 2.a , 3.rio , 4.the, 5.las 
[30mSelect your next word!