In [1]:
import gensim
import tensorflow as tf
import numpy as np
import pickle
import itertools

from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
def end_next(prev_end):
    """
    Pick the end word for the next line from words similar to the previous one.

    The ten nearest neighbours of ``prev_end`` in the global ``word2vec``
    model are reduced to those that differ from ``prev_end`` and appear as
    keys of the global ``inverted_rhyme`` mapping; one survivor is returned
    at random.

    Falls back to a random key of ``inverted_rhyme`` when ``prev_end`` is
    missing from the word2vec vocabulary (``KeyError``) or when none of its
    neighbours qualifies.
    """
    try:
        neighbours = word2vec.most_similar(prev_end, topn=10)
    except KeyError:
        neighbours = []

    # Keep only neighbours we have rhyme information for.
    ends = [
        word for word, _score in neighbours
        if word != prev_end and word in inverted_rhyme
    ]

    if not ends:
        # A dict view is not a sequence, so materialise the keys before
        # handing them to NumPy.
        ends = list(inverted_rhyme.keys())

    return np.random.choice(ends)


def end_next_volta(prev_end):
    """
    Pick the next end word for the volta (the sonnet's turn).

    Works like a similarity lookup on ``prev_end``, but the query to the
    global ``word2vec`` model is shifted with ``positive=["rich", prev_end]``
    and ``negative=["poor"]``. The ten results are reduced to those that
    differ from ``prev_end`` and appear as keys of the global
    ``inverted_rhyme`` mapping; one survivor is returned at random.

    Falls back to a random key of ``inverted_rhyme`` when the query raises
    ``KeyError`` or when none of the results qualifies.
    """
    try:
        neighbours = word2vec.most_similar(
            positive=["rich", prev_end], negative=["poor"], topn=10
        )
    except KeyError:
        neighbours = []

    # Keep only neighbours we have rhyme information for.
    ends = [
        word for word, _score in neighbours
        if word != prev_end and word in inverted_rhyme
    ]

    if not ends:
        # A dict view is not a sequence, so materialise the keys before
        # handing them to NumPy.
        ends = list(inverted_rhyme.keys())

    return np.random.choice(ends)


def end_next_rhyme(prev_rhyme, threshold_similarity=0.1):
    """
    Pick an end word that rhymes with ``prev_rhyme``.

    The first ending listed for ``prev_rhyme`` in the global
    ``inverted_rhyme`` mapping selects the rhyme group from the global
    ``rhyme`` mapping. Words of that group (other than ``prev_rhyme``) are
    kept when their word2vec similarity to ``prev_rhyme`` exceeds
    ``threshold_similarity`` or when they cannot be scored at all.

    Parameters
    ----------
    prev_rhyme : str
        Word the result must rhyme with; must be a key of ``inverted_rhyme``.
    threshold_similarity : float, optional
        Minimum (exclusive) similarity for a rhyme to be preferred.

    Returns
    -------
    A randomly chosen preferred rhyme; if there is none, any other word of
    the rhyme group; and only if the group holds nothing but ``prev_rhyme``,
    a random member of the group itself.

    Raises
    ------
    KeyError
        If ``prev_rhyme`` is not in ``inverted_rhyme`` (or its ending is not
        in ``rhyme``).
    """
    ending = inverted_rhyme[prev_rhyme][0]
    rhymes = rhyme[ending]

    # Never offer the word as a rhyme for itself while alternatives exist.
    candidates = [r for r in rhymes if r != prev_rhyme]

    best_words = []
    for r in candidates:
        try:
            if word2vec.similarity(prev_rhyme, r) > threshold_similarity:
                best_words.append(r)
        except KeyError:
            # Out of vocabulary (probably a stopword): it cannot be scored,
            # so keep it rather than silently discarding a valid rhyme.
            best_words.append(r)

    if best_words:
        return np.random.choice(best_words)
    if candidates:
        return np.random.choice(candidates)
    return np.random.choice(rhymes)

def sample(preds, temperature=1.0):
    """
    Sample an index from a probability array after temperature scaling.

    Parameters
    ----------
    preds : array-like of float
        Probabilities (e.g. a softmax output). Zeros are allowed.
    temperature : float, optional
        Values below 1 sharpen the distribution, values above 1 flatten it.
        A temperature of 0 or less selects the most probable index
        deterministically.

    Returns
    -------
    Index of the sampled entry.
    """
    preds = np.asarray(preds).astype('float64')

    if temperature <= 0:
        # Limit of the scaled distribution as temperature -> 0 is a greedy
        # pick; dividing by zero would only produce NaNs.
        return np.argmax(preds)

    # log(0) = -inf is expected for zero-probability entries; exp() maps it
    # back to exactly 0, so the warning is noise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum so exp() cannot underflow every entry to 0 at
    # very low temperatures (which would make the normalisation 0/0).
    logits = logits - np.max(logits)

    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    return np.argmax(np.random.multinomial(1, preds, 1))

In [3]:
import nltk
from nltk.corpus import cmudict
# Make sure the CMU Pronouncing Dictionary corpus is available locally
# (NLTK reports "already up-to-date" when it has been fetched before).
nltk.download('cmudict')

# Pronunciation lookup used by count_syl: d[word][0] is iterated there as a
# sequence of phoneme strings, and those ending in a digit are counted as
# syllables.
d = cmudict.dict()
def count_syl(word):
    """
    Return the number of syllables in ``word``.

    The word is lower-cased first. If it is a key of the global
    pronunciation dictionary ``d``, the phonemes of its first pronunciation
    that end in a digit are counted. Otherwise a spelling heuristic is used:
    every vowel (``aeiouy``) that starts a vowel run counts once, a trailing
    ``e`` is treated as silent, a trailing ``le`` is added back, and the
    result is never less than 1.

    An empty string has 0 syllables.
    """
    word = word.lower()
    if not word:
        # Splitting a line on single spaces yields empty tokens; indexing
        # word[0] below would raise IndexError for them.
        return 0

    if word in d:
        return sum(1 for phoneme in d[word][0] if phoneme[-1].isdigit())

    vowels = 'aeiouy'
    count = 1 if word[0] in vowels else 0
    for prev_char, char in zip(word, word[1:]):
        # Count only the first vowel of each consecutive vowel run.
        if char in vowels and prev_char not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1  # trailing 'e' is usually silent
    if word.endswith('le'):
        count += 1  # ...except in '-le' endings such as 'table'
    return max(count, 1)

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\alyci\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [4]:
def get_sonnets(filename):
    """
    Read a sonnet text file and return its cleaned content as one string.

    Each line is stripped of surrounding whitespace. Empty lines and lines
    consisting only of digits (sonnet numbers) are dropped. The remaining
    lines are lower-cased, the characters ``:;,.?!()`` are removed, and each
    is terminated with a newline.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    str
        The cleaned lines joined together, each followed by ``"\\n"``.
    """
    # Built once; the original rebuilt it for every line via a local
    # variable that shadowed the builtin ``str``.
    strip_punctuation = str.maketrans('', '', ':;,.?!()')

    kept = []
    with open(filename) as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped and not stripped.isdigit():
                kept.append(stripped.lower().translate(strip_punctuation))

    # Single join instead of repeated string concatenation.
    return ''.join(piece + "\n" for piece in kept)

# Corpus location, relative to the notebook's working directory.
file = 'data/shakespeare.txt'
# Entire cleaned corpus as one newline-terminated string.
sons = get_sonnets(file)
# Individual lines; because every line in `sons` ends with "\n", the last
# element of this list is an empty string.
lines = sons.split("\n")

In [5]:
len(sons)

91006

In [6]:
# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(sons))
print(chars)
print("# unique chars: " + str(len(chars)))

# Lookup tables between a character and its position in `chars`
dic_char_to_id = {c: i for i, c in enumerate(chars)}
dic_id_to_char = dict(enumerate(chars))

['\n', ' ', "'", '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# unique chars: 30


In [7]:
import random
import copy

def get_rhyming_words(word, rhyme_data):
    """
    Return the first group in ``rhyme_data`` that contains ``word``.

    ``rhyme_data`` is scanned in order; the matching group object itself is
    returned (not a copy). An empty list is returned when no group contains
    the word.
    """
    matching_groups = (group for group in rhyme_data if word in group)
    return next(matching_groups, [])

def choose_ending_words(rhyme_data):
    """
    Choose the 14 line-ending words of a sonnet from groups of rhyming words.

    Seven rhyming pairs are drawn and placed in the pattern
    ABAB CDCD EFEF GG: lines 0/2, 1/3, 4/6, 5/7, 8/10, 9/11 and the closing
    couplet 12/13. For each pair a group is picked uniformly at random among
    the groups that still hold at least two unused words, then two distinct
    entries of that group are taken and removed so they are not reused.

    Parameters
    ----------
    rhyme_data : sequence of iterables of str
        Each inner collection holds words that rhyme with one another. It is
        not modified.

    Returns
    -------
    list of str
        The 14 ending words, indexed by line.

    Raises
    ------
    ValueError
        If the groups cannot supply seven pairs of words.
    """
    # Shallow per-group copies are enough to keep the caller's data intact.
    remaining = [list(group) for group in rhyme_data]

    # (index of the first line of the pair, distance to its rhyming partner)
    pair_slots = [(0, 2), (1, 2), (4, 2), (5, 2), (8, 2), (9, 2), (12, 1)]

    words = [''] * 14
    for first, offset in pair_slots:
        # Only groups that can still provide a full pair are eligible; a
        # depleted or single-word group would otherwise crash the draw.
        eligible = [group for group in remaining if len(group) >= 2]
        if not eligible:
            raise ValueError(
                "rhyme_data does not contain enough rhyming words "
                "to fill seven pairs"
            )
        group = random.choice(eligible)
        pair = random.sample(group, 2)
        for word in pair:
            group.remove(word)
        words[first], words[first + offset] = pair

    return words

In [8]:
# Load the pre-computed rhyme groups that choose_ending_words draws from.
# NOTE(review): pickle.load can execute arbitrary code; only load files from
# a trusted source.
# NOTE(review): this path starts with '../data/' whereas the corpus is read
# from 'data/' elsewhere in this notebook — confirm which working directory
# is intended.
with open('../data/shakespeare_rhymes.pkl', 'rb') as f:
    rhymes = pickle.load(f)
    
# The 14 line-ending words for the sonnet to be generated (random per run).
end_words = choose_ending_words(rhymes)

In [9]:
import re
files = ['data/shakespeare.txt']

# Build the corpus: strip each line, drop every character that is not a word
# character, apostrophe, hyphen or whitespace, skip empty and digit-only
# lines (sonnet numbers), and lower-case the rest.
pieces = []
for filename in files:
    with open(filename) as f:
        for line in f:
            line = re.sub(r'[^\w\'\-\s]', '', line.strip())
            if len(line) > 0 and not line.isdigit():
                pieces.append(line.lower() + '\n')

# One join instead of growing a string line by line.
text = ''.join(pieces)

# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
print(chars)
print ("# unique chars: " + str(len(chars)))

# create mapping of characters to unique ids
dic_char_to_id = {c: i for i, c in enumerate(chars)}
dic_id_to_char = {i: c for i, c in enumerate(chars)}

['\n', ' ', "'", '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# unique chars: 30


In [10]:
# Length (in characters) of the one-hot window fed to the network
leng = 40

# Character-level network: two stacked 200-unit LSTMs, each followed by 30%
# dropout, and a softmax over the character vocabulary.
model = Sequential([
    LSTM(200, return_sequences=True, input_shape=(leng, len(chars))),
    Dropout(0.3),
    LSTM(200, return_sequences=False),
    Dropout(0.3),
    Dense(len(chars)),
    Activation('softmax'),
])

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [11]:
# Load previously trained weights into the architecture defined above; the
# layer layout must match the one the weights were saved from.
# NOTE(review): the generation loop feeds this model reversed text, so these
# weights presumably come from training on reversed lines — confirm.
model.load_weights('models/new_BACKW.h5')

In [12]:
def sample(preds, temperature = 1.0):
    ''' Helper function to sample an index from a probability array after
    temperature scaling.

    preds       -- array-like of probabilities (zeros are allowed).
    temperature -- values below 1 sharpen the distribution, values above 1
                   flatten it; 0 or less picks the most probable index
                   deterministically.

    Returns the index of the sampled entry. '''

    preds = np.asarray(preds).astype('float64')

    if temperature <= 0:
        # Limit of the scaled distribution as temperature -> 0 is a greedy
        # pick; dividing by zero would only produce NaNs.
        return np.argmax(preds)

    # log(0) = -inf is expected for zero-probability entries; exp() maps it
    # back to exactly 0, so silence the divide warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum so exp() cannot underflow every entry to 0 at
    # very low temperatures (which would make the normalisation 0/0).
    logits = logits - np.max(logits)

    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    return np.argmax(np.random.multinomial(1, preds, 1))

In [13]:
# Generate the sonnet one line at a time. Each line is grown right-to-left:
# it starts from its pre-chosen end word and the model predicts the character
# that precedes the text produced so far.
son = ''
temp = 0.25  # sampling temperature passed to sample()
for i in range(14):
    line_prod = ' ' + end_words[i]
    line = line_prod
    # Pad to the model's window length and reverse, so the network sees the
    # line backwards with the newline first.
    line_prod = line_prod.ljust(leng - 1) + '\n'
    seq = line_prod[::-1]

    while True:
        # One-hot encode the current window. (The original read an undefined
        # name `sequence` here instead of `seq`.)
        x = np.zeros((1, leng, len(chars)))
        for t, char in enumerate(seq):
            x[0, t, dic_char_to_id[char]] = 1.

        preds = model.predict(x, verbose=0)[0]
        next_char = dic_id_to_char[sample(preds, temp)]

        # Treat a predicted newline as a word boundary
        if next_char == '\n':
            next_char = ' '

        # At each word boundary, stop once the line has ten syllables.
        # split() without arguments skips the empty tokens produced by the
        # leading space and by consecutive spaces.
        if next_char == ' ':
            syls = sum(count_syl(str(w)) for w in line.split())
            if syls >= 10:
                break

        # Prepend to the visible line; slide the reversed window forward.
        line = next_char + line
        seq = seq[1:] + next_char

    # Full stop after each quatrain and after the final couplet, comma
    # everywhere else.
    if ((i + 1) % 4 == 0) or (i == 13):
        line += '.\n'
    else:
        line += ',\n'

    son += line
print(son)
        

to time and and fore and the invited,
and time no fore when time and time be dimmed,
and and and time no fore that delighted,
and the fore thes my loves the untrimmed.
eyes to the sence for that i loves the seem,
time and and no more my loves the passion,
and love and and and therefore when the deem,
and and and and and and in the fashion.
and that to make and o love and the score,
tome and more that me that whose and the charged,
and and and and and no love and the store,
that fore when in the praised and some enlarged.
fore when the fore when that more when the change,
that raceous and and that me when the strange.



In [14]:

and and and and and and o love the feel,
and and and and and that fore in the dross,
time and thence for that thence or wist the steel,
and and that that me and love and why cross.
love's caurest and and and and to lome knit,
time and of time and are me when the trim,
and that me to time and that my love sit,
and and with time and mine eyes to the him.
more therefore but whose and that fore the bark,
that he in the time in the resemble,
and that lace therefore and and and and mark,
and and o hate no fore thine assemble.
best and and and and that the lest the spring,
and time in thine eyes the prefiguring.

SyntaxError: invalid syntax (<ipython-input-14-5e2cd5b4d067>, line 2)