<a href="https://colab.research.google.com/github/cecileloge/Free-Text-Generation/blob/main/DemoNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Demo (Bidirectional Model & Encoder-Decoder)


*   STEP 1 IMPORT MODEL & UTILS
*   STEP 2 DEMO



In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## STEP 0 // IMPORT OUR STUFF

In [2]:
# !pip install numpy
import pandas as pd
import numpy as np
import ast
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import compress

import random
import sys
import io

In [3]:
!pip install colorama
from colorama import Fore

Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collected packages: colorama
Successfully installed colorama-0.4.4


In [4]:
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Masking, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Layer, InputSpec
from tensorflow.keras.layers import GRU, LSTM, Bidirectional, Dot, Permute
from tensorflow.keras.layers import Conv1D, Activation, Multiply, Flatten, BatchNormalization, Add
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint
from tensorflow.keras.optimizers import RMSprop, Adam, Adamax
from tensorflow.keras import activations

from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [5]:
data = pd.read_csv('/content/drive/My Drive/CS230/finaldata.csv')
data['overview_stop']=['<START> '+a+' <END>' for a in data.overview_stop]

## STEP 1 // IMPORT MODEL & UTILS

In [None]:
# TOKENIZE
liststrings = list(data.overview_stop)
size_dict = 10000
tokenizer = Tokenizer(num_words= size_dict+1, 
                      filters='!"#$%&()*+,-/:;=?@[\\]^_`{|}~\t\n', 
                      split=' ', 
                      oov_token='<UNK>',
                      document_count=0)


tokenizer.fit_on_texts(liststrings) 
tokenizer_config = tokenizer.get_config()
dict_counts = tokenizer_config['word_counts']
dict_index = tokenizer_config['word_index'] 

# UTILS
win_len = 10

In [None]:
### RNN LANGUAGE MODEL ###
# By: ceciloge@stanford.edu

# Input & Embedding
seq_input = Input(shape=(win_len,), name = 'miniseq')
h = Embedding(size_dict+1, 64, input_length = win_len, mask_zero=True, name = 'embedding')(seq_input)

# Two BIDIRECTIONAL GRU Layers
h = Bidirectional (GRU(256, name = 'GRU1', return_sequences= True, kernel_initializer = 'glorot_normal'))(h)
h = Bidirectional (GRU(256, name = 'GRU2', kernel_initializer = 'glorot_normal'))(h)

# Final Dense Layers
h = Dense(512, activation = 'relu', name = 'dense1')(h)
h = Dense(2560, activation = 'relu', name = 'dense2')(h)
next_word = Dense(size_dict, activation='linear', name = 'final')(h)

model = Model(inputs = seq_input, outputs = next_word)

# COMPILE & IMPORT TRAINED WEIGHTS
opt = Adamax(learning_rate=0.001)
loss = SparseCategoricalCrossentropy(from_logits=True, name='sparse_cce')
model.compile(loss=loss, optimizer=opt)

model.load_weights('/content/drive/My Drive/CS230/Models/bidirectional')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe0d64e32e8>

In [None]:
num_genres = 20
# Start with the ENCODER (input is genretrain)
encoder_input = Input(shape=(num_genres,))
encoder = Dense(256, activation = 'relu', kernel_initializer = 'he_normal')(encoder_input)
# The state will then be used as input for the decoder


# Input & Embedding for DECODER 
decoder_input = Input(shape=(win_len,))
h = Embedding(size_dict+1, 64, input_length = win_len, mask_zero=True, name = 'embedding')(decoder_input)

# Three GRU Layers
h = GRU(256, name = 'GRU1', return_sequences= True, kernel_initializer = 'glorot_normal')(h, initial_state=encoder)
h = GRU(256, name = 'GRU2', return_sequences= True, kernel_initializer = 'glorot_normal')(h)
h = GRU(256, name = 'GRU3', kernel_initializer = 'glorot_normal')(h)

# Final Dense Layers
h = Dense(512, activation = 'relu', name = 'dense1', kernel_initializer = 'he_normal')(h)
h = Dense(2560, activation = 'relu', name = 'dense2', kernel_initializer = 'he_normal')(h)
next_word = Dense(size_dict, activation='linear', name = 'final')(h)

# The model to train:
encod_model = Model([encoder_input, decoder_input], next_word)

# COMPILE & IMPORT TRAINED WEIGHTS
opt = Adamax(learning_rate=0.001)
loss = SparseCategoricalCrossentropy(from_logits=True, name='sparse_cce')
encod_model.compile(loss=loss, optimizer=opt)

encod_model.load_weights('/content/drive/My Drive/CS230/Models/encoder')


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe047821f98>

In [None]:
# DEMO FUNCTIONS FOR CHOOSE YOUR OWN ADVENTURE

end = tokenizer.texts_to_sequences([['<end>']])[0][0] 
start = tokenizer.texts_to_sequences([['<start>']])[0][0] 
dicke = tokenizer.texts_to_sequences([["dickens'"]])[0][0]

def top_n(preds, n):
  preds = np.asarray(preds).astype('float64')
  #print(preds)
  preds[0] = np.min(preds) # Minimize prob of <UNK> from being generated as it is not useful
  preds[start-1] = np.min(preds) # Minimize prob of <start> from being generated as it is not useful
  preds[dicke-1] = np.min(preds) # Minimize prob of "dicken'" from being generated as it is not useful
  preds = preds-np.max(preds)
  preds = np.exp(preds)
  preds = preds/np.sum(preds)

  out = np.argpartition(preds, -n)[-n:]
  return out+1

def model_next(start = True, seed = 'we have come to think that'):
    if start: seed = '<START> '+seed
    print(Fore.LIGHTBLUE_EX + seed)
    x_in = pad_sequences(tokenizer.texts_to_sequences([seed]), truncating='pre', maxlen=10)
    
    preds = model.predict(x_in, verbose=0)[0]
    options = top_n(preds, 5)
    a = tokenizer.sequences_to_texts([[options[0]]])[0]
    b = tokenizer.sequences_to_texts([[options[1]]])[0]
    c = tokenizer.sequences_to_texts([[options[2]]])[0]
    d = tokenizer.sequences_to_texts([[options[3]]])[0]
    e = tokenizer.sequences_to_texts([[options[4]]])[0]
    
    print(Fore.BLACK + 'Options: 1.{} , 2.{} , 3.{} , 4.{}, 5.{} '.format(a,b,c,d,e))
    print(Fore.BLACK + "Select your next word! (input the number)")

    choice = input()
    choice = int(choice)
    newseed = seed+" "+tokenizer.sequences_to_texts([[options[choice-1]]])[0]


    
    return newseed, options[choice-1]

In [None]:
# DEMO FUNCTIONS FOR FULL SAMPLES

def sample(preds, greedy = False):
  # This first function samples the next word based on the output logits from the model
  preds = np.asarray(preds).astype('float64')
  #print(preds)
  preds[0] = np.min(preds) # Minimize prob of <UNK> from being generated as it is not useful
  preds[start-1] = np.min(preds) # Minimize prob of <start> from being generated as it is not useful
  preds[dicke-1] = np.min(preds) # Minimize prob of "dicken'" from being generated as it is not useful
  preds = preds-np.max(preds)
  preds = np.exp(preds)
  preds = preds/np.sum(preds)
  samp = np.random.multinomial(1, preds, 1)
  if greedy: 
    out = max(1, np.argmax(preds))
  else: 
    out = np.argmax(samp)
  return out+1


def model_generate_text(seed, length = 35, greedy = False, verbose = True, endtok=True):
    # This function generates text from a given seed - works like the callback function.
    input = '<START> '+seed
    output = []
    
    x_in = pad_sequences(tokenizer.texts_to_sequences([input]), maxlen=10)

    i = -1
    count = 0
    if endtok == True:
      while i != end and count <length:     
        preds = model.predict(x_in, verbose=0)[0]
        i = sample(preds, greedy=greedy)
        x_in = np.append(x_in[:,1:],i).reshape((1,10))
        output.append(i)
        count +=1
    else:
      while count <length:     
        preds = model.predict(x_in, verbose=0)[0]
        i = sample(preds, greedy=greedy)
        x_in = np.append(x_in[:,1:],i).reshape((1,10))
        output.append(i)
        count +=1
    if verbose: print(seed+" "+tokenizer.sequences_to_texts([output])[0])
    return str(seed+" "+tokenizer.sequences_to_texts([output])[0]), output

# Genre Utils
listgenre = ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy','foreign', 
             'history', 'horror', 'music', 'mystery', 'romance', 'sci_fi','thriller', 'tv_movie', 'war', 'western']
dictgenre = {listgenre[i] : i for i in range(len(listgenre))}

def genre_generate(seed, genre = ['drama'], length = 35, greedy = False, verbose = True):
    input = '<START> '+seed
    output = []
    x_in = pad_sequences(tokenizer.texts_to_sequences([input]), maxlen=10)
    genre_in = np.zeros((1,20))
    for g in genre:
      i = dictgenre[g]
      genre_in[:,i] = 1

    i = -1
    count = 0
    v = 0
    while i != end and count <length:     
      preds = encod_model.predict([genre_in, x_in], verbose=0)[0]
      i = sample(preds, greedy=greedy)
      x_in = np.append(x_in[:,1:],i).reshape((1,10))
      output.append(i)
      count +=1
    if verbose: print(seed+" "+tokenizer.sequences_to_texts([output])[0])
    return str(seed+" "+tokenizer.sequences_to_texts([output])[0]), output

def generate_n(seed,n):
  print("ADVENTURE:")
  genre_generate(seed, genre = ['adventure'], greedy = False)
  print("FANTASY:")
  genre_generate(seed, genre = ['fantasy'], greedy = False)
  print("THRILLER CRIME:")
  genre_generate(seed, genre = ['thriller','crime'], greedy = False)
  print("RANDOM:")
  for i in range(n-1):
    model_generate_text(seed)
  model_generate_text(seed, greedy=True)

In [None]:
def fun_demo_start():
  print("What's your seed?")
  text_input = input()
  return text_input.lower()


## STEP 2 // IT'S DEMO TIME!

In [None]:
# DEMO FULL SAMPLE / RANDOM & BY GENRE
seed = fun_demo_start()
generate_n(seed, 5)


What's your seed?
cs230 students will meet on campus to study
ADVENTURE:
cs230 students will meet on campus to study storms by evil beings from the creator of the army and nature's plague . the fourth may be husband and ten young friends . survived them he finds himself exploring the telekinetic past while he
FANTASY:
cs230 students will meet on campus to study master hulk to guide the nazis to form a power and run back into egypt for years . together they finds himself with a russian soldier trying to determine the killer themselves . <end>
THRILLER CRIME:
cs230 students will meet on campus to study meat a life like deadly fish book system . . . or stand . <end>
RANDOM:
cs230 students will meet on campus to study oil on them . <end>
cs230 students will meet on campus to study vampires who threaten them to enter the town alive . <end>
cs230 students will meet on campus to study seven timing life . <end>
cs230 students will meet on campus to study 12 30 minutes before all seems destined

In [None]:
# DEMO WORD BY WORD "CHOOSE YOUR OWN ADVENTURE" / RANDOM
seed = fun_demo_start()
seed, a = model_next(start = True, seed = seed)
while a != end:
  seed, a =  model_next(start = False, seed = seed)

print(Fore.LIGHTBLUE_EX + "Well done! Here is your story:")
print(seed)

What's your seed?
cs230 students are meeting at school today to study
[94m<START> cs230 students are meeting at school today to study
[30mOptions: 1.king , 2.destruction , 3.robots , 4.rush, 5.various 
[30mSelect your next word! (input the number)
5
[94m<START> cs230 students are meeting at school today to study various
[30mOptions: 1.images , 2.stories , 3.unseen , 4.creatures, 5.college 
[30mSelect your next word! (input the number)
4
[94m<START> cs230 students are meeting at school today to study various creatures
[30mOptions: 1.she , 2.from , 3.they , 4.in, 5.living 
[30mSelect your next word! (input the number)
5
[94m<START> cs230 students are meeting at school today to study various creatures living
[30mOptions: 1.from , 2.life , 3.work , 4.large, 5.in 
[30mSelect your next word! (input the number)
5
[94m<START> cs230 students are meeting at school today to study various creatures living in
[30mOptions: 1.new , 2.a , 3.rio , 4.the, 5.las 
[30mSelect your next word!