# TV Script Generation

Generating Simpson Tv Script using RNNs.
[Simpsons dataset](https://www.kaggle.com/wcukierski/the-simpsons-by-the-data)


In [1]:
import helper

datafile = './data/simpsons/moes_tavern_lines.txt'
data = helper.load_data(datafile)

text = data[81:]

Playing around to view different parts of the data


In [2]:
import numpy as np

view_sentence_range = (0,10)
import numpy as np

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

print()
print('The sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 11492
Number of scenes: 262
Average number of sentences in each scene: 15.248091603053435
Number of lines: 4257
Average number of words in each line: 11.50434578341555

The sentences 0 to 10:
Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.
Bart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.
Moe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?
Moe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.
Moe_Szyslak: What's the matter Homer? You're not your normal effervescent self.
Homer_Simpson: I got my problems, Moe. Give me another one.
Moe_Szyslak: Homer, hey, you should not drink to forget your problems.
Barney_Gumble: Yeah, you should only drink to enhance your social skills.




In [288]:
import numpy as np
import problem_unittests as tests
from collections import Counter

#creating two dicts {word:idx}, {idx:word}

def create_lookup_tables(text):
    #word_set = set(text)
    #word_count = Counter(text)
    
    #counts = Counter(text)
    #words = sorted(counts, key=counts.get, reverse=True)
    words = set(text)
    word_2_int = {word: i for i, word in enumerate(words)}
    int_2_word = {i: word for i, word in enumerate(words)}
    #word_2_int = {word: i for i, word in enumerate(word_count.keys(), 0)}
    #int_2_word = {i:word for word, i in word_2_int.items()}

    return word_2_int, int_2_word    
        
tests.test_create_lookup_tables(create_lookup_tables)

Tests Passed


In [289]:
#tokenizing punctuation 

def token_lookup():
    punct_dict = {    
    '.': '||Period||',
    ',': '||Comma||',
    '"': '||Quotation_Mark||', 
    ';': '||Semicolon||',
    '!': '||Exclamation_mark||', 
    '?': '||Question_mark||', 
    '(': '||Left_Parentheses||',   
    ')': '||Right_Parentheses||',   
    '--': '||Dash||',  
    '\n': '||Return||'
    }       
    #text_list = list(text)
    #for chars in text_list:
    #    if chars in punct_list:
    #        text_list[chars] = punct_list[chars]
    return punct_dict        
tests.test_tokenize(token_lookup)

Tests Passed


In [290]:
helper.preprocess_and_save_data(datafile, token_lookup, create_lookup_tables)


### Checkpoint

In [291]:
import helper
import numpy as np
import problem_unittests as tests

int_text, word_2_int, int_2_word, puct_dict = helper.load_preprocess()

In [292]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.0.0


  # This is added back by InteractiveShellApp.init_path()


In [293]:
#creating placeholders for inputs, targets, learning rate.
def get_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    return inputs, targets, learning_rate

tests.test_get_inputs(get_inputs)

Tests Passed


In [294]:
#creating lstm network

def get_init_cell(batch_size, rnn_size):

    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    #drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=0.8)
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([lstm]*1) 
    state = cell.zero_state(batch_size, tf.float32)
    initial_state = tf.identity(state, name='initial_state')

    return (cell, initial_state)

tests.test_get_init_cell(get_init_cell)

Tests Passed


### Word Embedding

In [295]:
#creating embedding for input_data (a list of words as integers come in and get transformed into their vector representations). 
#like matrix multiplication but with a loockup table in order to save computation. Embedding of size (vocab_size x embed_dim)

def get_embed(input_data, vocab_size, embed_dim):
    embedding_weights = tf.Variable(tf.truncated_normal((vocab_size, embed_dim), stddev=0.1))
    embed = tf.nn.embedding_lookup(embedding_weights, input_data)
    
    return embed


tests.test_get_embed(get_embed)

Tests Passed


In [296]:
#Building the RNN
def build_rnn(cell, inputs):
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')
    
    return (outputs, final_state)

tests.test_build_rnn(build_rnn)

Tests Passed


In [297]:
#Building the Neural Network
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    embed = get_embed(input_data, vocab_size, embed_dim)
    outputs, final_state = build_rnn(cell, embed)
    #reshaping output of rnn to connect it with a fully connected layer (necessary to stack an fc layer without using tf.contrib.layers.fully_connected)
    #outputs_ = tf.reshape(outputs, (-1, [rnn_size]))
    #fc_weights = tf.Variable(tf.random_normal((rnn_size, vocab_size), stddev= 0.1))
    #biases = tf.Variable(tf.zeros(vocab_size))
    #logits = tf.nn.relu(tf.add(tf.matmul(outputs_, fc_weights), biases))
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=tf.nn.relu)
    #logits = tf.reshape(logits, input_data.shape.as_list() + [vocab_size])
    return logits, final_state  


tests.test_build_nn(build_nn)

Tests Passed


In [298]:
#Making batches of input and target

def get_batches(int_text, batch_size, seq_length):

    n_batches = len(int_text)//(batch_size * seq_length)
    input_text = int_text[:n_batches * batch_size *seq_length]
    target_text = input_text[1:] + input_text[:1]
    inputs = np.array(input_text)
    targets = np.array(target_text)
    input_batches = np.split(inputs.reshape(batch_size, -1), n_batches, 1)
    target_batches = np.split(targets.reshape(batch_size, -1), n_batches, 1)
    
    return np.array(list(zip(input_batches, target_batches)))


tests.test_get_batches(get_batches)

Tests Passed


### Tuning Hyperparameters

In [319]:
num_epochs = 75
rnn_size = 256
batch_size = 128
embed_dim = 300
seq_length = 20
learning_rate = 0.01
# Show stats for every n number of batches
show_every_n_batches = 5

save_dir = './save'


### Building the Graph

In [320]:
from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with tf.device('/gpu:0'):
    with train_graph.as_default():
        vocab_size = len(int_2_word)
        input_text, targets, lr = get_inputs()
        input_data_shape = tf.shape(input_text)
        cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
        logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)
    
    #softmax layer after fully_connected
        probs = tf.nn.softmax(logits, name = 'probs')
    
    
    #Loss function
        cost = seq2seq.sequence_loss(logits, targets, tf.ones([input_data_shape[0], input_data_shape[1]]))
    
        optimizer = tf.train.AdamOptimizer(lr)
    
    #Gradient clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients =[(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

### Training the Network 

In [321]:
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph, config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epochs in range(num_epochs):
        state = sess.run(initial_state, {input_text:batches[0][0]})
        for n, (x,y) in enumerate(batches):
            feed = {input_text:x, targets:y, lr:learning_rate, initial_state:state}
            train_loss, state, _= sess.run([cost, final_state, train_op], feed)
                
            # Show every <show_every_n_batches> batches
            if (epochs * len(batches) + n) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epochs,
                    n,
                    len(batches),
                    train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')    

Epoch   0 Batch    0/26   train_loss = 8.821
Epoch   0 Batch    5/26   train_loss = 6.451
Epoch   0 Batch   10/26   train_loss = 6.330
Epoch   0 Batch   15/26   train_loss = 6.127
Epoch   0 Batch   20/26   train_loss = 6.121
Epoch   0 Batch   25/26   train_loss = 6.005
Epoch   1 Batch    4/26   train_loss = 5.905
Epoch   1 Batch    9/26   train_loss = 5.887
Epoch   1 Batch   14/26   train_loss = 5.704
Epoch   1 Batch   19/26   train_loss = 5.701
Epoch   1 Batch   24/26   train_loss = 5.702
Epoch   2 Batch    3/26   train_loss = 5.567
Epoch   2 Batch    8/26   train_loss = 5.470
Epoch   2 Batch   13/26   train_loss = 5.446
Epoch   2 Batch   18/26   train_loss = 5.396
Epoch   2 Batch   23/26   train_loss = 5.580
Epoch   3 Batch    2/26   train_loss = 5.322
Epoch   3 Batch    7/26   train_loss = 5.433
Epoch   3 Batch   12/26   train_loss = 5.532
Epoch   3 Batch   17/26   train_loss = 5.341
Epoch   3 Batch   22/26   train_loss = 5.214
Epoch   4 Batch    1/26   train_loss = 5.313
Epoch   4 

### Saving Parameters to generate a new TV Script


In [322]:
helper.save_params((seq_length, save_dir))

### Checkpoint

In [323]:
import tensorflow as tf
import numpy as np
import helper
import problem_unittests as tests

_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
seq_length, load_dir = helper.load_params()

### Getting tensors with get_tensor_by_name() from loaded_graph

In [324]:
def get_tensors(loaded_graph):
    input_T = loaded_graph.get_tensor_by_name("input:0")
    initial_state_T = loaded_graph.get_tensor_by_name("initial_state:0")
    final_state_T = loaded_graph.get_tensor_by_name("final_state:0")
    probs_T = loaded_graph.get_tensor_by_name("probs:0")
    
    return input_T, initial_state_T, final_state_T, probs_T

tests.test_get_tensors(get_tensors)

Tests Passed


### Selection next word with pick_word()

In [325]:
#Selecting next word with an argmax between probabilities (that is the result of the softmax layer)
def pick_word(probabilities, int_2_word):
    x_pos = np.argmax(probabilities, axis=0)
    next_word = int_2_word[x_pos]
    return next_word
    
tests.test_pick_word(pick_word)

Tests Passed


### Generating TV Scripts

In [326]:
gen_length = 200
prime_word = 'moe_szyslak'
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)
    
    #loading tensors
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)
    gen_sentences = [prime_word + ':']
    prev_state = sess.run(initial_state, {input_text:np.array([[1]])})
    for n in range(gen_length):
        dyn_input = [[word_2_int[word] for word in gen_sentences[-gen_length:]]]
        dyn_seq_length = len(dyn_input[0])
        
        #getting probabilities
        probabilities, prev_state = sess.run([probs, final_state], {input_text:dyn_input, initial_state:prev_state})
        
        next_word= pick_word(probabilities[dyn_seq_length -1], int_2_word)
        
        gen_sentences.append(next_word)
        
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        tv_script = tv_script.replace(' ' + token.lower(), key)
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')
        
    print(tv_script)
        
        

moe_szyslak:(into a little more") moe! you know, i know a thing in the back of my life.
barney_gumble: hey, homer. you know what i'm not the bar?
lenny_leonard: i can too!
barney_gumble: hey, you did me a beer?
homer_simpson:(with the man) god!
homer_simpson:(with the man) god!
homer_simpson:(with the man) god!
homer_simpson:(with the man) god!
lenny_leonard:(too) and, uh, i ain't got that this is the only one from you and me"


homer_simpson:(with the man) huh?
homer_simpson: oh moe, you don't tell me more!
homer_simpson:(with the man) huh?
lenny_leonard:(" the springfield a"
homer_simpson: who was my life?
lenny_leonard: huh? well, i just know what about you, you don't tell me more and i'm gonna have to go, and i get it.
homer_simpson:(with the man) huh?
homer_simpson: oh


### If TV Script is non-sensical, the model needs to be trained on more data [another dataset](https://www.kaggle.com/wcukierski/the-simpsons-by-the-data)