# CSI5386 Course Project - Automatic Poem Generation Using Deep Neural Networks
# Group 5
#### Abhilasha (300168332)
#### Ravisha Sharma (300162406)
#### Rajitha Muthukrishnan (300161725)

In [1]:
import json
import random
import numpy as np
import pandas as pd

import os
import contractions
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import keras
import keras.utils as ku
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
import nltk
from nltk.corpus import stopwords 

from keras.layers import LSTM, Dense, Dropout, Flatten, Bidirectional, SimpleRNN
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.utils import np_utils

import lm_scorer

Using plaidml.keras.backend backend.


# Gutenberg Poetry Dataset

#### Read data from file

In [2]:
all_lines = []
for line in open("gutenberg-poetry-v001.ndjson"):
    all_lines.append(json.loads(line.strip()))
    
corpus = "\n".join([line['s'] for line in random.sample(all_lines, 1000)])

In [3]:
corpus[0:1000]

'Of green-room, gambling-hell, saloon,\nAnd loosed the props below.\nAcross the land, by thee is shed:--\nAttic maid, honey-fed, chatterer, snatchest thou and bearest the\nSpontaneous beauties all around advance,\nLet them take of my treasures, and clothes and steeds provide."\nPale Grief, and pleasing Pain,\nToo suddenly still and mute.\nAnd it shows\nBright was her body withal,     and golden cups her breasts.\nThe sun will shine ageean.\nThey blow an old-time way for me,\nTo wedlock and the pastor\'s daughter.\nI know not how, I know not when,\nWarm, hands, warm, daddy\'s gone to plough;\nDriven by an Onward-ache,\nMy song shall raise the mountain-deer;\nFrom my joys hath me removèd,\nIn life or tenderest in heart.  I came\natque alius latices pressis resupinus ab uuis\nChe l\'ale sue, tra liti si lontani\nCaptain Pierce and his daughters\n_Daffin_, merriment, foolishness.\nIn his young charge\'s throat: as if his crime\nThen the blacksmith, Ilmarinen,\nNor longer shall your princel

#### Expand contracted words

In [4]:
expanded_data = contractions.fix(corpus)
expanded_data[0:1000]

'Of green-room, gambling-hell, saloon,\nAnd loosed the props below.\nAcross the land, by thee is she would:--\nAttic maid, honey-fed, chatterer, snatchest thou and bearest the\nSpontaneous beauties all around advance,\nLet them take of my treasures, and clothes and steeds provide."\nPale Grief, and pleasing Pain,\nToo suddenly still and mute.\nAnd it shows\nBright was her body withal,     and golden cups her breasts.\nThe sun will shine ageean.\nThey blow an old-time way for me,\nTo wedlock and the pastor\'s daughter.\nI know not how, I know not when,\nWarm, hands, warm, daddy\'s gone to plough;\nDriven by an Onward-ache,\nMy song shall raise the mountain-deer;\nFrom my joys hath me removèd,\nIn life or tenderest in heart.  I came\natque alius latices pressis resupinus ab uuis\nChe l\'ale sue, tra liti si lontani\nCaptain Pierce and his daughters\n_Daffin_, merriment, foolishness.\nIn his young charge\'s throat: as if his crime\nThen the blacksmith, Ilmarinen,\nNor longer shall your pr

#### Extract new lines from data

In [5]:
# Lowercase all text
raw_text = expanded_data.lower()
raw_text = raw_text
raw_text = raw_text.split('\n')

In [6]:
# raw_text

#### Remove unwanted punctuations from lines

In [133]:
text = [txt.strip('"!\"#$%&\)*+-/(:;<=>?@][\\^_}{|~--[0-9]') for txt in raw_text]
# text = raw_text.strip("\\'\"!\"#$%&\)*+-/(:;<=>?@][\^_}{|~--")

In [134]:
text[0:10]

['of green-room, gambling-hell, saloon,',
 'and loosed the props below.',
 'across the land, by thee is she would',
 'attic maid, honey-fed, chatterer, snatchest thou and bearest the',
 'spontaneous beauties all around advance,',
 'let them take of my treasures, and clothes and steeds provide.',
 'pale grief, and pleasing pain,',
 'too suddenly still and mute.',
 'and it shows',
 'bright was her body withal,     and golden cups her breasts.']

#### Tokenize the lines extracted

In [264]:
# Create Tokenizer object to convert words to sequences of integers
tokenizer = Tokenizer(num_words = None, filters = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = False)
# tokenizer = Tokenizer(num_words = None, lower = False, char_level = False)

In [265]:
# Train tokenizer to the texts
tokenizer.fit_on_texts(text)
total_words = len(tokenizer.word_index) + 1

In [266]:
total_words

2544

#### Convert text to ngram sequences

In [267]:
# Convert list of strings into flat dataset of sequences of tokens
sequences = []
for line in text:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        sequences.append(n_gram_sequence)

In [268]:
len(sequences)

6288

#### Pad the sequences
length to pad = max (length of lines)

In [269]:
# Pad sequences to ensure equal lengths
max_seq_len = max([len(x) for x in sequences])
sequences = np.array(pad_sequences(sequences, maxlen = max_seq_len, padding = 'pre'))

#### n-gram sequence - predictors and labels

In [270]:
# Create n-grams sequence predictors and labels
predictors, label = sequences[:, :-1], sequences[:, -1]
label = ku.to_categorical(label, num_classes = total_words)

#### Input length for models

In [271]:
input_len = max_seq_len - 1

#### Stop words for poem generation task

In [272]:
# Used to monitor the generation of sentences in poem
stop_words = set(stopwords.words('english'))

# PART - 1 : Markov model

In [273]:
import markovify

In [274]:
markov_model = markovify.NewlineText(expanded_data)

In [275]:
def gen_poem_markov():
    output_text = ''
    for i in range(5):
        for i in range(random.randrange(1, 4)):
            text = markov_model.make_short_sentence(30)
            print(text)
            output_text += ' ' + text
    return output_text

# PART - 2: Vanilla RNN - Simple word embedding model

In [286]:
rnn_model = Sequential()
rnn_model.add(Embedding(total_words, 100, input_length = input_len))
rnn_model.add(SimpleRNN(150))
rnn_model.add(Dropout(0.1))
rnn_model.add(Dense(total_words, activation = 'softmax'))
rnn_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

# Use 100 epoch for efficacy
rnn_model.fit(predictors, label, epochs = 100, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fa8b9048160>

# PART - 3: LSTM - Simple word embedding model

In [287]:
lstm_model = Sequential()
lstm_model.add(Embedding(total_words, 100, input_length = input_len))
lstm_model.add(LSTM(150))
lstm_model.add(Dropout(0.1))
lstm_model.add(Dense(total_words, activation = 'softmax'))
lstm_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

# Use 100 epoch for efficacy
lstm_model.fit(predictors, label, epochs = 100, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fa8bc8c9760>

# Part - 4: Bidirectional LSTM - Simple word embedding model

In [288]:
bi_lstm_model = Sequential()
bi_lstm_model.add(Embedding(total_words, 100, input_length = input_len))
bi_lstm_model.add(Bidirectional(LSTM(100)))
bi_lstm_model.add(Dropout(0.1))
bi_lstm_model.add(Dense(total_words, activation = 'softmax'))
bi_lstm_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

# Use 100 epoch for efficacy
bi_lstm_model.fit(predictors, label, epochs = 100, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fa8bc6fba00>

# Using GloVe embedding

In [279]:
word_index = tokenizer.word_index
print(len(word_index))

2543


In [280]:
def cal_embedding_matrix(word_index):
    embedded_words = {}
    with open('glove.6B.100d.txt') as file:
        for line in file:
            words, coeff = line.split(maxsplit=1)
            coeff = np.array(coeff.split(),dtype = float)
            embedded_words[words] = coeff

    embedding_matrix = np.zeros((len(word_index)+1, 100))
    for word, i in word_index.items():
        embedding_vector = embedded_words.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [281]:
embedding_matrix = cal_embedding_matrix(word_index)

In [282]:
print('Shape of embedding matrix:',embedding_matrix.shape)

Shape of embedding matrix: (2544, 100)


# Part - 5 : Vanilla RNN - Glove embedding model

In [283]:
rnn_glove_model = Sequential()
rnn_glove_model.add(Embedding(len(word_index)+1, 100, weights=[embedding_matrix], input_length = input_len))
rnn_glove_model.add(SimpleRNN(150))
rnn_glove_model.add(Dropout(0.1))
rnn_glove_model.add(Dense(total_words, activation = 'softmax'))
rnn_glove_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

# Use 100 epoch for efficacy
rnn_glove_model.fit(predictors, label, epochs = 100, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7faabf9c30d0>

# PART - 6 : LSTM - Glove embedding model

In [284]:
lstm_glove_model = Sequential()
lstm_glove_model.add(Embedding(len(word_index)+1, 100, weights=[embedding_matrix], input_length = input_len))
lstm_glove_model.add(LSTM(150))
lstm_glove_model.add(Dropout(0.1))
lstm_glove_model.add(Dense(total_words, activation = 'softmax'))
lstm_glove_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

# Use 100 epoch for efficacy
lstm_glove_model.fit(predictors, label, epochs = 100, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fa8d05e2460>

# Part - 7: Bi-directional LSTM Glove embedding model

In [285]:
bi_lstm_glove_model = Sequential()
bi_lstm_glove_model.add(Embedding(len(word_index)+1, 100, weights=[embedding_matrix], input_length = input_len))
bi_lstm_glove_model.add(Bidirectional(LSTM(100)))
bi_lstm_glove_model.add(Dropout(0.1))
bi_lstm_glove_model.add(Dense(total_words, activation = 'softmax'))
bi_lstm_glove_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

# Use 100 epoch for efficacy
bi_lstm_glove_model.fit(predictors, label, epochs = 100, verbose = 1)

Epoch 1/100


INFO:plaidml:Analyzing Ops: 2629 of 2991 operations complete


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x7fa8b8cce9d0>

# POEM generation function

In [208]:
# Function to generate line - based on STOP WORDS at the end of generated sentence. And every sentence should have only 5 words
def generate_line(init_text, max_seq_len, num_sen, model):
    output_word = ''
    num_words = 5  
    last_word = 'a'
    text = init_text
    init_words = nltk.word_tokenize(text)
    
    while last_word in stop_words:
        if num_sen == 0:
            num_words = num_words - len(init_words)
    
        output_text = ''
        for j in range(num_words):
            token_list = tokenizer.texts_to_sequences([text])[0]
            token_list = pad_sequences([token_list], maxlen = max_seq_len - 1, padding = 'pre')
            predicted = model.predict_classes(token_list, verbose = 0)

            for word, index in tokenizer.word_index.items():
                if index == predicted:
                    output_word = word
                    break

            text += ' ' + output_word
            if num_sen == 0:
                output_text = text
            else:       
                output_text += ' ' + output_word
        words = nltk.word_tokenize(output_text)
        last_word = words[-1]
        
    return output_text  

def format_line(text, num_sen, sen):
    text = text[:1].upper() + text[1:]
    words = nltk.word_tokenize(text)
    last_word = words[-1]
    if sen != num_sen-1:
        if last_word not in stop_words:
            text = text + ','
    else:
        text = text + '.'
    return text

# Function to generate poem - multiple lines
## Arguments (sample text, padding length, number of sentences needed, model)
def generate_poem(text, max_seq_len, num_sen, model):
    output_sentence = text
    output = ''
    for sen in range(num_sen):
        gen_sent = generate_line(output_sentence, max_seq_len, sen, model)  
        sentence = format_line(gen_sent, num_sen, sen)
        output_sentence += '' + sentence
        print(sentence)
        output += sentence
        text = output_sentence
    return output

# Poem generation

## Markov model

In [77]:
markov_text = gen_poem_markov()

Went ruffling up the past,
To wedlock and the high,
And of the awful maid;
And chaplets on the land;
Who, like the starry lights;
And that is free.
Then to the faithful,
Who will not be one of them.
And of the flower.
And as the moon spun,
Here, all around advance,


## Vanilla RNN - Simple word embedding model

In [289]:
## Arguments - (sample text, padding length, number of sentences needed, model)
# gen_text_rnn = generate_poem("Love to", max_seq_len, 5, rnn_model)
gen_text_rnn = generate_poem("Full of hope", max_seq_len, 5, rnn_model)

Full of hope and passion,
 victory are barest voices sweet,
 that almost seem to drop,
 man seems night the past,
 all the king come back.


In [290]:
gen_text_rnn

'Full of hope and passion, victory are barest voices sweet, that almost seem to drop, man seems night the past, all the king come back.'

## LSTM - Simple word embedding model

In [291]:
## Arguments - (sample text, padding length, number of sentences needed, model)
# gen_text_lstm = generate_poem("Love to", max_seq_len, 5, lstm_model)
gen_text_lstm = generate_poem("Full of hope", max_seq_len, 5, lstm_model)

Full of hope and passion,
 more i more are boast,
 brought kept boast sails longs,
 to him to the rescue,
 has are open more would.


In [292]:
gen_text_lstm

'Full of hope and passion, more i more are boast, brought kept boast sails longs, to him to the rescue, has are open more would.'

## Bidirectional LSTM - Simple word embedding model

In [293]:
## Arguments - (sample text, padding length, number of sentences needed, model)
# gen_text_bilstm = generate_poem("Love to", max_seq_len, 5, bi_lstm_model)
gen_text_bilstm = generate_poem("Full of hope", max_seq_len, 5, bi_lstm_model)

Full of hope and passion,
 hundred moss grown altars steep,
 steep disguises of actors qu,
 il il went chuck darkness,
 reached to one voices voices.


In [294]:
gen_text_bilstm

'Full of hope and passion, hundred moss grown altars steep, steep disguises of actors qu, il il went chuck darkness, reached to one voices voices.'

## Vanilla RNN - Glove embedding model

In [227]:
## Arguments - (sample text, padding length, number of sentences needed, model)
# gen_text_rnn_glove = generate_poem("Love to", max_seq_len, 5, rnn_glove_model)
gen_text_rnn_glove = generate_poem("Full of hope", max_seq_len, 5, rnn_glove_model)

Full of hope and passion,
 or leave it their eyes,
 you orter seen the pettibone,
 shef doover unaim'd are love,
 in heaven the lordly meed.


In [228]:
gen_text_rnn_glove

"Full of hope and passion, or leave it their eyes, you orter seen the pettibone, shef doover unaim'd are love, in heaven the lordly meed."

## LSTM - Glove embedding model

In [229]:
## Arguments - (sample text, padding length, number of sentences needed, model)
gen_text_lstm_glove = generate_poem("Full of hope", max_seq_len, 5, lstm_glove_model)

Full of hope and passion,
 free more thrall of new,
 some some some at memory,
 them fell on some barbarian,
 see as it were bound.


In [230]:
gen_text_lstm_glove

'Full of hope and passion, free more thrall of new, some some some at memory, them fell on some barbarian, see as it were bound.'

## Bi Directional LSTM - Glove embedding model

In [231]:
## Arguments - (sample text, padding length, number of sentences needed, model)
# Sweet life, There he sang
gen_text_bilstm_glove = generate_poem("Full of hope", max_seq_len, 5, bi_lstm_glove_model)

Full of hope and passion,
 mind and say dryden's less,
 to me are good spread,
 me to say their eyes,
 awake to say their offends.


In [232]:
gen_text_bilstm_glove

"Full of hope and passion, mind and say dryden's less, to me are good spread, me to say their eyes, awake to say their offends."

# Evaluate - Probability

In [236]:
from lm_scorer.models.auto import AutoLMScorer
scorer = AutoLMScorer.from_pretrained("gpt2-large")

def prob_score(sentence):
    return scorer.sentence_score(sentence, reduce='mean')

In [239]:
print('Markov model text')
print(markov_text)
print(prob_score(markov_text))

Markov model text
 Went ruffling up the past, To wedlock and the high, And of the awful maid; And chaplets on the land; Who, like the starry lights; And that is free. Then to the faithful, Who will not be one of them. And of the flower. And as the moon spun, Here, all around advance,
0.08049044013023376


In [295]:
print('Vanilla RNN - tokenized data Model text')
print(gen_text_rnn)
print(prob_score(gen_text_rnn))

Vanilla RNN - tokenized data Model text
Full of hope and passion, victory are barest voices sweet, that almost seem to drop, man seems night the past, all the king come back.
0.06673873960971832


In [296]:
print('LSTM - tokenized data Model text')
print(gen_text_lstm)
print(prob_score(gen_text_lstm))

LSTM - tokenized data Model text
Full of hope and passion, more i more are boast, brought kept boast sails longs, to him to the rescue, has are open more would.
0.05296219885349274


In [297]:
print('BiLSTM - tokenized data Model text')
print(gen_text_bilstm)
print(prob_score(gen_text_bilstm))

BiLSTM - tokenized data Model text
Full of hope and passion, hundred moss grown altars steep, steep disguises of actors qu, il il went chuck darkness, reached to one voices voices.
0.06630679965019226


In [243]:
print('Vanilla RNN - glove embedding Model text')
print(gen_text_rnn_glove)
print(prob_score(gen_text_rnn_glove))

Vanilla RNN - glove embedding Model text
Full of hope and passion, or leave it their eyes, you orter seen the pettibone, shef doover unaim'd are love, in heaven the lordly meed.
0.05898677930235863


In [244]:
print('LSTM - glove embedding Model text')
print(gen_text_lstm_glove)
print(prob_score(gen_text_lstm_glove))

LSTM - glove embedding Model text
Full of hope and passion, free more thrall of new, some some some at memory, them fell on some barbarian, see as it were bound.
0.07978662103414536


In [298]:
print('Bidirectional LSTM - glove embedding Model text')
print(gen_text_bilstm_glove)
print(prob_score(gen_text_bilstm_glove))

Bidirectional LSTM - glove embedding Model text
Full of hope and passion, mind and say dryden's less, to me are good spread, me to say their eyes, awake to say their offends.
0.05788024887442589


# Evaluate - Perplexity

In [247]:
import math
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel

In [248]:
# Load pre-trained model (weights)
gpt_model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
gpt_model.eval()
# Load pre-trained model tokenizer (vocabulary)
gpt_tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

100%|██████████| 478750579/478750579 [20:24<00:00, 391116.84B/s] 
100%|██████████| 656/656 [00:00<00:00, 233689.78B/s]
100%|██████████| 815973/815973 [00:00<00:00, 1000199.55B/s]
100%|██████████| 458495/458495 [00:00<00:00, 977841.16B/s]
ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [251]:
def ppl_score(sentence):
    tokenize_input = gpt_tokenizer.tokenize(sentence)
    tensor_input = torch.tensor([gpt_tokenizer.convert_tokens_to_ids(tokenize_input)])
    loss = gpt_model(tensor_input, lm_labels=tensor_input)
    return math.exp(loss)

In [257]:
print('Markov model text')
print(markov_text)
print(ppl_score(markov_text))

Markov model text
 Went ruffling up the past, To wedlock and the high, And of the awful maid; And chaplets on the land; Who, like the starry lights; And that is free. Then to the faithful, Who will not be one of them. And of the flower. And as the moon spun, Here, all around advance,
167.33391422642183


In [299]:
print('Vanilla RNN - tokenized data Model text')
print(gen_text_rnn)
print(ppl_score(gen_text_rnn))

Vanilla RNN - tokenized data Model text
Full of hope and passion, victory are barest voices sweet, that almost seem to drop, man seems night the past, all the king come back.
486.6997205576134


In [300]:
print('LSTM - tokenized data Model text')
print(gen_text_lstm)
print(ppl_score(gen_text_lstm))

LSTM - tokenized data Model text
Full of hope and passion, more i more are boast, brought kept boast sails longs, to him to the rescue, has are open more would.
1007.7326672166716


In [301]:
print('BiLSTM - tokenized data Model text')
print(gen_text_bilstm)
print(ppl_score(gen_text_bilstm))

BiLSTM - tokenized data Model text
Full of hope and passion, hundred moss grown altars steep, steep disguises of actors qu, il il went chuck darkness, reached to one voices voices.
1194.1185692984177


In [261]:
print('Vanilla RNN - glove embedding Model text')
print(gen_text_rnn_glove)
print(ppl_score(gen_text_rnn_glove))

Vanilla RNN - glove embedding Model text
Full of hope and passion, or leave it their eyes, you orter seen the pettibone, shef doover unaim'd are love, in heaven the lordly meed.
608.1980240739014


In [262]:
print('LSTM - glove embedding Model text')
print(gen_text_lstm_glove)
print(ppl_score(gen_text_lstm_glove))

LSTM - glove embedding Model text
Full of hope and passion, free more thrall of new, some some some at memory, them fell on some barbarian, see as it were bound.
552.7443675864874


In [263]:
print('Bidirectional LSTM - glove embedding Model text')
print(gen_text_bilstm_glove)
print(ppl_score(gen_text_bilstm_glove))

Bidirectional LSTM - glove embedding Model text
Full of hope and passion, mind and say dryden's less, to me are good spread, me to say their eyes, awake to say their offends.
445.25010160175884
