## Implementing a LSTM based Word generator from Simpsons Movie Scripts.
The strategy is suggested by Kaushal Shetty here https://github.com/keras-team/keras/issues/2009

In [1]:
import numpy as np
import pandas as pd

import collections

import re, nltk

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Activation

from keras.layers.embeddings import Embedding

# Visualization
import seaborn as sns

# this allows plots to appear directly in the notebook
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
INPUT_FILE = "data/moes_tavern_lines.txt"

In [3]:
content = open(INPUT_FILE).read()

In [4]:
content[:500]

"[YEAR DATE 1989] Â© Twentieth Century Fox Film Corporation. All rights reserved.\n\nMoe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.\nBart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.\nMoe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?\nMoe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.\nMoe_Szys"

In [5]:
from keras.preprocessing.text import text_to_word_sequence
word_sequence = text_to_word_sequence(content)

We take a window size of SEQLEN and create a new training data set each of SEQLEN words and (SEQLEN+1) th word becomes our label.

In [6]:
SEQLEN = 10

In [7]:
input_sentences = []
target_words = []
for i in range(0, len(word_sequence) - SEQLEN):
    input_sentences.append(" ".join(word_sequence[i:i + SEQLEN]))
    target_words.append(word_sequence[i + SEQLEN])

In [8]:
input_sentences[:10]

['year date 1989 â© twentieth century fox film corporation all',
 'date 1989 â© twentieth century fox film corporation all rights',
 '1989 â© twentieth century fox film corporation all rights reserved',
 'â© twentieth century fox film corporation all rights reserved moe',
 'twentieth century fox film corporation all rights reserved moe szyslak',
 'century fox film corporation all rights reserved moe szyslak into',
 'fox film corporation all rights reserved moe szyslak into phone',
 "film corporation all rights reserved moe szyslak into phone moe's",
 "corporation all rights reserved moe szyslak into phone moe's tavern",
 "all rights reserved moe szyslak into phone moe's tavern where"]

In [9]:
target_words[:10]

['rights',
 'reserved',
 'moe',
 'szyslak',
 'into',
 'phone',
 "moe's",
 'tavern',
 'where',
 'the']

Use keras tokenizer to tokenize the texts and do a texts_to_sequences on them

In [10]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

In [11]:
tokenizer.fit_on_texts(input_sentences)

In [12]:
input_sequences = tokenizer.texts_to_sequences(input_sentences)
input_sequences[:10]

[[467, 503, 6453, 6452, 2927, 1942, 1217, 1490, 1941, 30],
 [503, 6453, 6452, 2927, 1942, 1217, 1490, 1941, 30, 2928],
 [6453, 6452, 2927, 1942, 1217, 1490, 1941, 30, 2928, 1943],
 [6452, 2927, 1942, 1217, 1490, 1941, 30, 2928, 1943, 1],
 [2927, 1942, 1217, 1490, 1941, 30, 2928, 1943, 1, 7],
 [1942, 1217, 1490, 1941, 30, 2928, 1943, 1, 7, 119],
 [1217, 1490, 1941, 30, 2928, 1943, 1, 7, 119, 172],
 [1490, 1941, 30, 2928, 1943, 1, 7, 119, 172, 162],
 [1941, 30, 2928, 1943, 1, 7, 119, 172, 162, 343],
 [30, 2928, 1943, 1, 7, 119, 172, 162, 343, 134]]

In [13]:
np.array(input_sequences).shape

(53158, 10)

Map target_words to tokenizer.word_index and convert it into a categorical variable.

In [14]:
target_sequences = list(map(lambda w: tokenizer.word_index[w], target_words))
target_sequences[:10]

[2928, 1943, 1, 7, 119, 172, 162, 343, 134, 2]

In [15]:
np.array(target_sequences).shape

(53158,)

In [16]:
# Investigate first two labels
list(map(lambda w: tokenizer.word_index[w], ['rights', 'reserved']))

[2928, 1943]

In [17]:
word2index = tokenizer.word_index
index2word = {v:k for k, v in word2index.items()}

In [18]:
VOCAB_SIZE = len(word2index) + 1

In [19]:
VOCAB_SIZE

6454

In [20]:
target_one_hot = np.zeros((len(target_sequences), VOCAB_SIZE))
for i, wid in enumerate(target_sequences):
    target_one_hot[i, wid] = 1

In [21]:
target_one_hot

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [22]:
target_one_hot.shape

(53158, 6454)

### Using Embedding Layer

Since the vocabulary is very large the numerical sequences turn into sparse arrays and it's more efficient to cast everything to a lower dimension with the `Embedding` layer.

In [23]:
EMBED_SIZE  = 300
HIDDEN_SIZE = 256

In [24]:
# Build model
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, 
                    output_dim=EMBED_SIZE, 
                    input_length=SEQLEN, 
                    weights=None,
                    trainable=True))
model.add(LSTM(units=HIDDEN_SIZE, return_sequences=True, unroll=True))
model.add(Dropout(0.2))
model.add(LSTM(units=HIDDEN_SIZE, return_sequences=True, unroll=True))
model.add(Dropout(0.2))
model.add(LSTM(units=HIDDEN_SIZE, return_sequences=False, unroll=True))
model.add(Dropout(0.2))
model.add(Dense(VOCAB_SIZE))
model.add(Activation("softmax"))

In [25]:
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [26]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 300)           1936200   
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 256)           570368    
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 256)           525312    
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 256)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
__________

In [27]:
BATCH_SIZE = 128
NUM_ITERATIONS = 50
NUM_EPOCHS_PER_ITERATION = 1
NUM_PREDS_PER_EPOCH = 50

Since we don't have any labeled data here, we train the model for an epoch (`NUM_EPOCHS_PER_ITERATION=1`) then test it. We continue training like this for 25 (`NUM_ITERATIONS=25`) iterations, stopping once we see intelligible output. So effectively, we are training for `NUM_ITERATIONS` epochs and testing the model after each epoch.

Our test consists of generating a word from the model given a random input, then dropping the first word from the input and appending the predicted word from our previous run, and generating another word from the model. We continue this 100 times (`NUM_PREDS_PER_EPOCH=100`) and generate and print the resulting string. The string gives us an indication of the quality of the model:

In [28]:
X = np.array(input_sequences)
y = target_one_hot

In [29]:
X.shape, y.shape

((53158, 10), (53158, 6454))

In [30]:
test_sequence = input_sequences[0]
test_sequence

[467, 503, 6453, 6452, 2927, 1942, 1217, 1490, 1941, 30]

In [31]:
test_sentence = input_sentences[0] #" ".join(list(map(lambda wid: index2word[wid], test_sequence)))
test_sentence

'year date 1989 â© twentieth century fox film corporation all'

In [32]:
test_sequence = np.array(test_sequence).reshape(1, 10)
test_sequence

array([[ 467,  503, 6453, 6452, 2927, 1942, 1217, 1490, 1941,   30]])

In [33]:
test_sequence.shape

(1, 10)

In [34]:
input_sentences[10699]

"work without the fork in the eye moe szyslak there's"

In [35]:
# We train the model in batches and test output generated at each step
for iteration in range(NUM_ITERATIONS):
    print("=" * 50)
    print("Iteration #: %d" % (iteration))
    
    model.fit(X, y, epochs=NUM_EPOCHS_PER_ITERATION) # train for 1 epoch
    
    # testing model
    # randomly choose a row from input_sentences, then use it to 
    # generate text from model for next 100 chars
    test_idx = np.random.randint(len(input_sequences))
    print(test_idx)
    test_sequence = input_sequences[test_idx]
    #test_sentence = " ".join(list(map(lambda wid: index2word[wid], test_sequence)))
    test_sentence = input_sentences[test_idx]
    print("Generating from seed: %s" % test_sentence)
    print(test_sentence, end=" ")
    for i in range(NUM_PREDS_PER_EPOCH):
        X_test = np.array(test_sequence).reshape(1, SEQLEN)
        #X_test = np.zeros((1, SEQLEN, nb_words))
        #for i, word in enumerate(test_words):
        #    X_test[0, i, word2index[word]] = 1
        pred = model.predict(X_test, verbose=0)[0]
        ypred = index2word[np.argmax(pred)]
        print(ypred, end=" ")
        # move forward with test_chars + ypred
        #test_words = test_words[1:].append(ypred)
        del test_sequence[0]
        test_sequence.append(np.argmax(pred))
print()

Iteration #: 0
Epoch 1/1
818
Generating from seed: a purveyor of mind numbing intoxicants like myself homer simpson
Iteration #: 1
Epoch 1/1
51149
Generating from seed: crummy little kids that nobody can control homer simpson you
Iteration #: 2
Epoch 1/1
38541
Generating from seed: head in ominous the roof carl carlson man you sure
Iteration #: 3
Epoch 1/1
16663
Generating from seed: a sea of decadent luxury and meaningless sex moe szyslak
Iteration #: 4
Epoch 1/1
41056
Generating from seed: totally surprising entrance snake jailbird hand over your cash and
Iteration #: 5
Epoch 1/1
36991
Generating from seed: marge simpson pants briefly oh my god i've never felt
Iteration #: 6
Epoch 1/1
4914
Generating from seed: went wild barney gumble yoo hoo homer simpson sings so
Iteration #: 7
Epoch 1/1
17030
Generating from seed: tap for jazz and i've never regretted it and here's
Iteration #: 8
Epoch 1/1
52086
Generating from seed: instead of my big dumb wife homer simpson i shouldn't
Iteration 

Iteration #: 17
Epoch 1/1
37647
Generating from seed: it's official smithers is now worse than mr burns lenny
Iteration #: 18
Epoch 1/1
34853
Generating from seed: kent brockman with a channel 6 exclusive the evil ned
Iteration #: 19
Epoch 1/1
9950
Generating from seed: simpson to scully you are one fine lookin' woman lady
Iteration #: 20
Epoch 1/1
30887
Generating from seed: boy where's fat tony's fifty bucks moe szyslak scared look
Iteration #: 21
Epoch 1/1
11141
Generating from seed: moe betty confused what moe szyslak explaining go near moe
Iteration #: 22
Epoch 1/1
13956
Generating from seed: oh homer simpson murderously you moe szyslak homer thank god
Iteration #: 23
Epoch 1/1
36621
Generating from seed: y makes me wanna have mo' kang this is the
Iteration #: 24
Epoch 1/1
52843
Generating from seed: words won't do it to moe i love you moe
Iteration #: 25
Epoch 1/1
8538
Generating from seed: moe szyslak well i dunno are you man enough to
Iteration #: 26
Epoch 1/1
50053
Generating 

Iteration #: 33
Epoch 1/1
15435
Generating from seed: paid your tab or at least cleaned up that mess
Iteration #: 34
Epoch 1/1
45116
Generating from seed: for loudly helllp normal 'cause this time there's no one
Iteration #: 35
Epoch 1/1
11641
Generating from seed: moe szyslak uh here have a margarita homer simpson but
Iteration #: 36
Epoch 1/1
38756
Generating from seed: rip out his voice box but i did stretch out
Iteration #: 37
Epoch 1/1
30135
Generating from seed: simpson hey bar boy dance around like an idiot like
Iteration #: 38
Epoch 1/1
11818
Generating from seed: just twenty four hours of freedom left homer simpson checking
Iteration #: 39
Epoch 1/1
52260
Generating from seed: don't see your name engraved on the bar stool richard
Iteration #: 40
Epoch 1/1
11769
Generating from seed: chase the blues away lenny leonard yeah you got that
Iteration #: 41
Epoch 1/1
34473
Generating from seed: yeah that's what i meant too i have no inclination
Iteration #: 42
Epoch 1/1
1804
Generati

Iteration #: 49
Epoch 1/1
22710
Generating from seed: my head he's like a spy in the house of
my head he's like a spy in the house of you man is your best thing i'm not right barney homer simpson no man okay how like you got a moe szyslak ah that's this beer homer simpson so i'm not here we want to moe szyslak i'm not a moe szyslak oh yeah no good man is so good 


### Using Embedding Layer initialised with pre-trained GloVe embeddings

In [36]:
GLOVE_MODEL = "data/glove.6B.300d.txt"

In [37]:
# load GloVe vectors
word2emb = {}
with open(GLOVE_MODEL, "rb") as fglove:
    for line in fglove:
        cols = line.strip().split()
        word = cols[0]
        embedding = np.array(cols[1:], dtype="float32")
        word2emb[word] = embedding

In [38]:
glove_embedding_weights = np.zeros((VOCAB_SIZE, EMBED_SIZE))
for word, index in word2index.items():
    try:
        glove_embedding_weights[index, :] = word2emb[word]
    except KeyError:
        # words not found in glove embedding dictionary word2emb will be all-zeros.
        pass

In [39]:
VOCAB_SIZE = len(word2index) + 1
EMBED_SIZE  = 300
HIDDEN_SIZE = 128

In [41]:
# Build model
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, 
                    output_dim=EMBED_SIZE, 
                    input_length=SEQLEN, 
                    weights=[glove_embedding_weights], 
                    trainable=True))
model.add(LSTM(units=HIDDEN_SIZE, return_sequences=True, unroll=True))
model.add(Dropout(0.2))
model.add(LSTM(units=HIDDEN_SIZE, return_sequences=False, unroll=True))
model.add(Dropout(0.2))
model.add(Dense(VOCAB_SIZE))
model.add(Activation("softmax"))

In [42]:
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [43]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 300)           1936200   
_________________________________________________________________
lstm_4 (LSTM)                (None, 10, 128)           219648    
_________________________________________________________________
dropout_4 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6454)              832566    
_________________________________________________________________
activation_2 (Activation)    (None, 6454)              0         
Total para

In [44]:
BATCH_SIZE = 128
NUM_ITERATIONS = 50
NUM_EPOCHS_PER_ITERATION = 1
NUM_PREDS_PER_EPOCH = 50

In [45]:
# We train the model in batches and test output generated at each step
for iteration in range(NUM_ITERATIONS):
    print("=" * 50)
    print("Iteration #: %d" % (iteration))
    
    model.fit(X, y, epochs=NUM_EPOCHS_PER_ITERATION) # train for 1 epoch
    
    # testing model
    # randomly choose a row from input_sentences, then use it to 
    # generate text from model for next 100 chars
    test_idx = np.random.randint(len(input_sequences))
    print(test_idx)
    test_sequence = input_sequences[test_idx]
    #test_sentence = " ".join(list(map(lambda wid: index2word[wid], test_sequence)))
    test_sentence = input_sentences[test_idx]
    print("Generating from seed: %s" % test_sentence)
    print(test_sentence, end=" ")
    for i in range(NUM_PREDS_PER_EPOCH):
        X_test = np.array(test_sequence).reshape(1, SEQLEN)
        #X_test = np.zeros((1, SEQLEN, nb_words))
        #for i, word in enumerate(test_words):
        #    X_test[0, i, word2index[word]] = 1
        pred = model.predict(X_test, verbose=0)[0]
        ypred = index2word[np.argmax(pred)]
        print(ypred, end=" ")
        # move forward with test_chars + ypred
        #test_words = test_words[1:].append(ypred)
        del test_sequence[0]
        test_sequence.append(np.argmax(pred))
print()

Iteration #: 0
Epoch 1/1
30280
Generating from seed: it was the first day of summer moe szyslak i
Iteration #: 1
Epoch 1/1
20338
Generating from seed: the ol' eye gouger in the pickle brine that'll keep
Iteration #: 2
Epoch 1/1
7734
Generating from seed: slow horrified good god turns to homer homer y'know i
Iteration #: 3
Epoch 1/1
35170
Generating from seed: you always been this size or is this like a
Iteration #: 4
Epoch 1/1
50571
Generating from seed: hold that thought teenage homer psst barney my dad's asleep
Iteration #: 5
Epoch 1/1
50928
Generating from seed: after all these years i deserve an explanation moe szyslak
Iteration #: 6
Epoch 1/1
42156
Generating from seed: mexican duffman mexican duffman oh yeah ho la carl carlson
Iteration #: 7
Epoch 1/1
45342
Generating from seed: am gonna tie a rope around your neck and hang
Iteration #: 8
Epoch 1/1
11991
Generating from seed: simpson oh yeah moe szyslak i made that last one
Iteration #: 9
Epoch 1/1
7912
Generating from seed: barn

Iteration #: 17
Epoch 1/1
7032
Generating from seed: but you know we had good writers william faulkner could
Iteration #: 18
Epoch 1/1
31425
Generating from seed: i just had this awful feeling that uh homer's in
Iteration #: 19
Epoch 1/1
8056
Generating from seed: you wanna marry a guy like selma homer simpson okay
Iteration #: 20
Epoch 1/1
21894
Generating from seed: the rainforest peter buck good enough let's eat lisa simpson
Iteration #: 21
Epoch 1/1
45699
Generating from seed: turning your life around lenny leonard with the three desperate
Iteration #: 22
Epoch 1/1
4673
Generating from seed: simpson hey moe can i look too moe szyslak sure
Iteration #: 23
Epoch 1/1
47704
Generating from seed: marge simpson sadly now i know why homie comes here
Iteration #: 24
Epoch 1/1
33082
Generating from seed: frontrunner and go back to their mansions moe szyslak who
Iteration #: 25
Epoch 1/1
24774
Generating from seed: you honey you bulked up but managed to keep your
Iteration #: 26
Epoch 1/1
43

Iteration #: 33
Epoch 1/1
20617
Generating from seed: trust you how'd you like to be a snitch the
Iteration #: 34
Epoch 1/1
4643
Generating from seed: problemo points hey moe look over there moe szyslak what
Iteration #: 35
Epoch 1/1
44110
Generating from seed: barney gumble i bet something disillusioned you as a child
Iteration #: 36
Epoch 1/1
11537
Generating from seed: anger so you lost my car eh well that's just
Iteration #: 37
Epoch 1/1
45878
Generating from seed: into money that mostly goes to us but you get
Iteration #: 38
Epoch 1/1
27304
Generating from seed: freedom you gotta have some fun seymour skinner who are
Iteration #: 39
Epoch 1/1
9661
Generating from seed: i have one thing that can never be broken a
Iteration #: 40
Epoch 1/1
16494
Generating from seed: dr julius hibbert guiltily well i guess we are ned
Iteration #: 41
Epoch 1/1
12293
Generating from seed: tapping mic uh is this thing on barney gumble no
Iteration #: 42
Epoch 1/1
5751
Generating from seed: offer homer 

Iteration #: 49
Epoch 1/1
52943
Generating from seed: szyslak beat just a minute let me check to the
szyslak beat just a minute let me check to the little bar oh well then what are you one homer simpson okay i'm not back to go carl carlson hey now all is a bar more time homer simpson i'm not here moe szyslak who can never get your beer here moe szyslak well i'm not so so no man 
