-
Notifications
You must be signed in to change notification settings - Fork 1
/
GeneratePoem.py
366 lines (346 loc) · 18.7 KB
/
GeneratePoem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# For Python 2.7 to work using Python 3 module
from future.standard_library import install_aliases
install_aliases()
import csv
import itertools
import collections
import nltk
import numpy
import operator
import sys
import random
import requests
import json
import pronouncing
from urllib.request import urlopen
from RhymingDictionary import RhymingDictionary
# Number of lines generated per poem. The rhyme-scheme helpers further down
# address line indices 0-13 directly, so they rely on this being 14.
number_lines = 14
# NOTE(review): not referenced anywhere in the visible part of this file.
line_length = [6, 7, 8, 9, 10]
# Sentinel tokens wrapped around every training line, plus the placeholder
# substituted for words that are left out of the vocabulary.
line_start_token = "line_start"
line_end_token = "line_end"
unknown_token = "unknown_token"
# List of words to never end a line with (the three special tokens included)
banned_end_words = ['a', 'ah', 'an', 'and', 'at', 'are', 'as', 'but', 'by', 'has', 'i', 'is', 'in', 'my', 'no', 'not', 'o', 'of', 'or', 's', 'than', 'the', 'their', 'then', 'to', 'was', 'with', 'your', line_start_token, line_end_token, unknown_token]
# The Big Huge Thesaurus API Key
# NOTE(review): hard-coded credential committed to source. The visible code
# queries Datamuse and never reads this value -- consider removing it or
# loading it from the environment.
bht_api = 'acf445020c3778040589b030b88031a1'
# Remove extension of words that are not found in the syllable guide, ie 'st, 'er
def remove_extension(word):
    """Strip a trailing ``'st``, ``'er`` or ``'ry`` contraction from ``word``.

    Only words longer than three characters are shortened, so a word that
    consists solely of one of the suffixes is returned unchanged.

    Args:
        word: The word to normalise.

    Returns:
        ``word`` without its final three characters when it ends in one of
        the suffixes, otherwise ``word`` itself.
    """
    contraction_suffixes = ("'st", "'er", "'ry")
    if len(word) > 3 and word.endswith(contraction_suffixes):
        return word[:-3]
    return word
# Generate the rhyming dictionary
def generate_rhyming_dictionary(training_data):
    """Build the rhyming model and its lookup tables from ``training_data``.

    Args:
        training_data: Passed unchanged to the ``RhymingDictionary``
            constructor.

    Returns:
        A 4-tuple ``(rhyming_model, syllable_count, syllable_pronounciation,
        rhyming_dictionary)``: the model, the two values produced by its
        ``create_syllable_guide()``, and the result of its
        ``create_rhyming_dictionary()``.
    """
    print("Generating rhyming dictionary...")
    model = RhymingDictionary(training_data)
    counts, pronunciations = model.create_syllable_guide()
    # The rhyming dictionary is derived from the pronunciation table.
    rhymes = model.create_rhyming_dictionary(pronunciations)
    return model, counts, pronunciations, rhymes
# Read and process the training data
def load_data(training_data, syllable_count):
    """Read the training CSV and turn it into index sequences.

    Every CSV cell is treated as one line of verse: it is lower-cased,
    wrapped in the line start/end tokens and tokenized with NLTK. A word is
    kept in the vocabulary only if its contraction-stripped, upper-cased
    form is a key of ``syllable_count``; all other words are replaced by the
    unknown token.

    Args:
        training_data: Path of the CSV file to read.
        syllable_count: Container of upper-cased words with known syllable
            counts (only membership is tested here).

    Returns:
        A 5-tuple ``(dictionary, dictionary_size, word_to_index, start_train,
        end_train)``. ``dictionary`` is the vocabulary list (the three
        special tokens first), ``word_to_index`` maps each word to its
        position in it, ``start_train`` holds each line's indices without the
        final token and ``end_train`` the same line without the first token.
    """
    print("Reading %s..." % training_data)
    with open(training_data, mode='r') as source:
        reader = csv.reader(source, skipinitialspace=True)
        # Flatten the rows and add the start and end token to each line.
        # The reader is consumed here, while the file is still open.
        lines = ["%s %s %s" % (line_start_token, cell.lower(), line_end_token)
                 for cell in itertools.chain.from_iterable(reader)]
    print("Parsed %d lines." % len(lines))

    # Tokenize each line into words
    tokenized_lines = [nltk.word_tokenize(line) for line in lines]

    # Vocabulary: special tokens first, then every distinct word that the
    # syllable guide knows about.
    distinct_words = collections.Counter(itertools.chain.from_iterable(tokenized_lines))
    dictionary = [line_start_token, line_end_token, unknown_token]
    dictionary.extend(word for word in distinct_words
                      if remove_extension(word).upper() in syllable_count)
    dictionary_size = len(dictionary)
    print("Using vocabulary size %d." % dictionary_size)

    word_to_index = {word: index for index, word in enumerate(dictionary)}

    # Replace all words not in the vocabulary with the unknown token.
    # Membership is tested against the dict rather than the list, so each
    # lookup is O(1) instead of a scan of the whole vocabulary.
    tokenized_lines = [[word if word in word_to_index else unknown_token for word in line]
                       for line in tokenized_lines]

    # Create training data: inputs drop the last token, targets drop the first.
    # NOTE(review): lines generally differ in length, so these are ragged
    # nested lists; recent NumPy releases refuse to build an array from them
    # without dtype=object -- confirm against the NumPy version in use.
    start_train = numpy.asarray([[word_to_index[word] for word in line[:-1]] for line in tokenized_lines])
    end_train = numpy.asarray([[word_to_index[word] for word in line[1:]] for line in tokenized_lines])
    return dictionary, dictionary_size, word_to_index, start_train, end_train
# Get the syllables of the given word
def get_syllables(word, dictionary, syllable_count):
    """Return the syllable count of the vocabulary entry at index ``word``.

    Args:
        word: Index into ``dictionary``.
        dictionary: Vocabulary list mapping indices to words.
        syllable_count: Mapping from upper-cased word to syllable count.

    Returns:
        0 when the contraction-stripped word is not in ``syllable_count``;
        otherwise its stored count, plus one if a contraction suffix had to
        be stripped to find it.
    """
    token = dictionary[word]
    stem = remove_extension(token)
    key = stem.upper()
    if key not in syllable_count:
        return 0
    base = syllable_count[key]
    # A stripped suffix ('st, 'er, 'ry) is counted as one extra syllable.
    return base if stem == token else base + 1
# Get the next word using forward propagation
# Try to get a word from the theme if possible
def get_next_word(model, line, syllables_remaining, dictionary, word_to_index, syllable_count, keywords_in_dictionary):
    """Sample the next word of a line so that it fits the syllable budget.

    Words are drawn from the last distribution returned by
    ``model.forward_propagation(line)`` until one has a non-zero syllable
    count that does not exceed ``syllables_remaining``. When a theme is
    given, a draw of the line-end or unknown token is replaced by a random
    theme keyword; without a theme such draws are rejected and re-sampled.

    Args:
        model: Object exposing ``forward_propagation(line)``.
        line: Word indices generated so far.
        syllables_remaining: Syllables still available on the line.
        dictionary: Vocabulary list mapping indices to words.
        word_to_index: Mapping from word to vocabulary index.
        syllable_count: Mapping from upper-cased word to syllable count.
        keywords_in_dictionary: Theme words present in the vocabulary, or
            ``None`` when no theme is used.

    Returns:
        ``(word_index, syllables)`` for the accepted word.

    Raises:
        ValueError: If ``syllables_remaining`` is not positive; no word
            could ever be accepted and the sampling loop would not end.
    """
    if syllables_remaining <= 0:
        raise ValueError("syllables_remaining must be positive")

    themed = keywords_in_dictionary is not None
    rejected_tokens = (line_end_token, unknown_token)
    next_word_probs = model.forward_propagation(line)

    while True:
        samples = numpy.random.multinomial(1, next_word_probs[-1])
        sampled_word = numpy.argmax(samples)
        is_rejected = dictionary[sampled_word] in rejected_tokens
        if themed and is_rejected:
            # A theme is given: use a theme word instead of re-sampling.
            sampled_word = word_to_index[random.choice(keywords_in_dictionary)]
            is_rejected = False
        sampled_word_syllables = get_syllables(sampled_word, dictionary, syllable_count)
        if not is_rejected and 0 < sampled_word_syllables <= syllables_remaining:
            return sampled_word, sampled_word_syllables
# A custom exception to be thrown when a rhyme could not be found
class NoRhymeError(Exception):
    """Raised when no rhyming word can be found for a given word."""
# Get a rhyming word given a word
def _fallback_rhyme(rhyme_word, syllables_required, rhyming_model, syllable_pronounciation):
    """Pick a rhyme from RhymeBrain, then from ``pronouncing``, else raise."""
    rhymes = rhyming_model.rhyme_brain(rhyme_word, syllables_required, syllable_pronounciation)
    if not rhymes:
        # No rhymes can be found from RhymeBrain, use pronouncing
        rhymes = pronouncing.rhymes(rhyme_word)
    if not rhymes:
        raise NoRhymeError("Cannot find a rhyming word.")
    return random.choice(rhymes)


def get_rhyme(rhyme_word, syllables_required, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary):
    """Return a lower-cased word that rhymes with ``rhyme_word``.

    When the contraction-stripped word has a known pronunciation, the part
    of that pronunciation starting at the phoneme carrying the last ``0`` or
    ``1`` stress digit is used to query ``rhyming_model.rhyme_pick``.
    Candidates that are banned line endings, or that equal ``rhyme_word``
    itself, are discarded. If nothing is left, or the word has no known
    pronunciation, ``rhyming_model.rhyme_brain`` and then
    ``pronouncing.rhymes`` are tried.

    Args:
        rhyme_word: Word to find a rhyme for.
        syllables_required: Syllable count forwarded to the rhyme lookups.
        rhyming_model: Object exposing ``rhyme_pick`` and ``rhyme_brain``.
        syllable_count: Forwarded to ``rhyme_pick``.
        syllable_pronounciation: Mapping from upper-cased word to its
            pronunciation string.
        rhyming_dictionary: Forwarded to ``rhyme_pick``.

    Returns:
        The chosen rhyme, lower-cased.

    Raises:
        NoRhymeError: If none of the three sources yields a rhyme.
    """
    lookup_key = remove_extension(rhyme_word).upper()
    if lookup_key in syllable_pronounciation:
        pronounciation = syllable_pronounciation[lookup_key]
        # Position of the last stress digit ("0" or "1"), whichever is later.
        last_stress = max(pronounciation.rfind("0"), pronounciation.rfind("1"))
        # The rhyming part starts at the phoneme that carries that digit.
        space_index = pronounciation[:last_stress].rfind(" ")
        last_syllable = pronounciation[space_index + 1:]

        candidates = rhyming_model.rhyme_pick(last_syllable, rhyming_dictionary, syllables_required, syllable_count, syllable_pronounciation)
        # Build a new list rather than removing while iterating: in-place
        # removal skips the element after each removed one and would also
        # alter the list handed out by rhyme_pick. The comparison ignores
        # case because banned_end_words is lower-case and the result is
        # lower-cased on return.
        own_word = rhyme_word.lower()
        rhymes = [candidate for candidate in candidates
                  if candidate.lower() not in banned_end_words
                  and candidate.lower() != own_word]
        if rhymes:
            return random.choice(rhymes).lower()

    # No rhymes can be found from the rhyming dictionary, use RhymeBrain
    return _fallback_rhyme(rhyme_word, syllables_required, rhyming_model, syllable_pronounciation).lower()
# Generate line
def generate_line(model, dictionary, word_to_index, syllable_count, keywords_in_dictionary=None):
    """Generate one line of verse with a budget of ten syllables.

    The line starts from the line-start token. With a theme, a random theme
    keyword is sometimes placed first; the fewer keywords there are, the
    less often this happens. Words are then sampled from the model until the
    syllable budget is used up. With a theme, each noun in the finished line
    is finally replaced, with probability 1/4, by a random theme keyword if
    that keyword has the same syllable count.

    Args:
        model: Object exposing ``forward_propagation(line)``.
        dictionary: Vocabulary list mapping indices to words.
        word_to_index: Mapping from word to vocabulary index.
        syllable_count: Mapping from upper-cased word to syllable count.
        keywords_in_dictionary: Theme words present in the vocabulary, or
            ``None`` for an unthemed line.

    Returns:
        The generated line as a list of words (without the start token).
    """
    themed = keywords_in_dictionary is not None

    # Start the line with the LINE START token
    new_line = [word_to_index[line_start_token]]
    syllables_remaining = 10

    # A theme is given, try to use a random given theme word as starting word
    if themed:
        keyword_total = len(keywords_in_dictionary)
        if keyword_total >= 8:
            draw_range = 18
        elif keyword_total >= 6:
            draw_range = 23
        elif keyword_total >= 3:
            draw_range = 35
        else:
            draw_range = 70
        # Six of the draw_range outcomes open the line with a theme word.
        if numpy.random.randint(draw_range) <= 5:
            theme_word = word_to_index[random.choice(keywords_in_dictionary)]
            new_line.append(theme_word)
            syllables_remaining -= get_syllables(theme_word, dictionary, syllable_count)

    # Sample the first model-generated word; it must have syllables and must
    # not be the line-end or unknown token.
    rejected_tokens = (line_end_token, unknown_token)
    next_word_probs = model.forward_propagation(new_line)
    while True:
        samples = numpy.random.multinomial(1, next_word_probs[-1])
        sampled_word = numpy.argmax(samples)
        sampled_word_syllables = get_syllables(sampled_word, dictionary, syllable_count)
        if sampled_word_syllables != 0 and dictionary[sampled_word] not in rejected_tokens:
            break
    syllables_remaining -= sampled_word_syllables
    new_line.append(sampled_word)

    while syllables_remaining > 0:
        next_word, next_word_syllables = get_next_word(model, new_line, syllables_remaining, dictionary, word_to_index, syllable_count, keywords_in_dictionary)
        new_line.append(next_word)
        syllables_remaining -= next_word_syllables

    # Convert vectors back to words using dictionary
    line = [dictionary[index] for index in new_line[1:]]

    # Randomly replaces nouns in the sentence if a theme is given
    if themed:
        # Tag the generated tokens directly. Re-tokenizing the joined string
        # can split words differently, which would misalign tag positions
        # with ``line`` and produce tokens missing from word_to_index.
        for position, (token, tag) in enumerate(nltk.pos_tag(line)):
            if "NN" not in tag:
                continue
            if numpy.random.randint(4) != 1:
                continue
            random_theme_word = random.choice(keywords_in_dictionary)
            # Only swap when the replacement keeps the syllable count intact.
            if get_syllables(word_to_index[token], dictionary, syllable_count) == get_syllables(word_to_index[random_theme_word], dictionary, syllable_count):
                line[position] = random_theme_word
    return line
# Ensure that the lines end with a rhyme (without theme)
def couplet(x, y, lines, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary):
    """Make ``lines[x]`` and ``lines[y]`` end on a rhyme (no theme).

    The two lines are modified in place:

    * If the end word of the first line is an acceptable ending, the end of
      the second line is replaced by a rhyme for it.
    * Otherwise, if the end word of the second line is acceptable, the end
      of the first line is replaced by a rhyme for it.
    * If both end words are banned, a random acceptable vocabulary word
      becomes the end of the first line and a rhyme for it the end of the
      second line.

    A replacement rhyme is requested with the syllable count of the word it
    replaces. Nothing is changed when the word to rhyme with has no known
    pronunciation.

    Args:
        x: Index of the first line in ``lines``.
        y: Index of the second line in ``lines``.
        lines: List of lines, each a list of words.
        dictionary: Vocabulary list mapping indices to words.
        word_to_index: Mapping from word to vocabulary index.
        rhyming_model: Forwarded to ``get_rhyme``.
        syllable_count: Mapping from upper-cased word to syllable count.
        syllable_pronounciation: Mapping from upper-cased word to its
            pronunciation.
        rhyming_dictionary: Forwarded to ``get_rhyme``.

    Returns:
        ``[lines[x], lines[y]]`` after modification.

    Raises:
        NoRhymeError: Propagated from ``get_rhyme``.
    """
    line_1 = lines[x]
    line_2 = lines[y]
    end_word_1 = line_1[-1]
    end_word_2 = line_2[-1]

    def has_pronunciation(word):
        return remove_extension(word).upper() in syllable_pronounciation

    def syllables_of(word):
        return get_syllables(word_to_index[word], dictionary, syllable_count)

    def rhyme_for(word, syllables_required):
        return get_rhyme(word, syllables_required, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary)

    if end_word_1 not in banned_end_words:
        # Replace the second end word with a word that rhymes with the first end word
        if has_pronunciation(end_word_1):
            line_2[-1] = rhyme_for(end_word_1, syllables_of(end_word_2))
    elif end_word_2 not in banned_end_words:
        # Replace the first end word with a word that rhymes with the second end word
        if has_pronunciation(end_word_2):
            line_1[-1] = rhyme_for(end_word_2, syllables_of(end_word_1))
    else:
        # Both end words are banned: select a fresh, acceptable end word for
        # the first line and rhyme the second line with it.
        candidates = [(word, get_syllables(index, dictionary, syllable_count))
                      for index, word in enumerate(dictionary)
                      if word not in banned_end_words and has_pronunciation(word)]
        if candidates:
            # Prefer a word with the same syllable count as the one being
            # replaced, so the line keeps its length.
            wanted = syllables_of(end_word_1)
            same_length = [word for word, count in candidates if count == wanted]
            new_end_word = random.choice(same_length or [word for word, _ in candidates])
            # Look the rhyme up first: if it fails, neither line is changed.
            rhyme = rhyme_for(new_end_word, syllables_of(end_word_2))
            line_1[-1] = new_end_word
            line_2[-1] = rhyme
    return [line_1, line_2]
# Ensure that the lines end with a rhyme
# Try to use a word from theme if possible
def couplet_theme(x, y, lines, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary, keywords):
    """Make ``lines[x]`` and ``lines[y]`` end on a rhyme, preferring theme words.

    The two lines are modified in place:

    * If the end word of the first line is banned, a random theme keyword
      replaces it and the end of the second line becomes a rhyme for that
      keyword.
    * Otherwise, if the end word of the second line is banned, a random
      theme keyword replaces it and the end of the first line becomes a
      rhyme for that keyword.
    * If neither is banned, the end of the second line is replaced by a
      rhyme for the end of the first line.

    A replacement rhyme is requested with the syllable count of the word it
    replaces. Nothing is changed when the word to rhyme with has no known
    pronunciation.

    Args:
        x: Index of the first line in ``lines``.
        y: Index of the second line in ``lines``.
        lines: List of lines, each a list of words.
        dictionary: Vocabulary list mapping indices to words.
        word_to_index: Mapping from word to vocabulary index.
        rhyming_model: Forwarded to ``get_rhyme``.
        syllable_count: Mapping from upper-cased word to syllable count.
        syllable_pronounciation: Mapping from upper-cased word to its
            pronunciation.
        rhyming_dictionary: Forwarded to ``get_rhyme``.
        keywords: Non-empty list of theme words to choose from.

    Returns:
        ``[lines[x], lines[y]]`` after modification.

    Raises:
        NoRhymeError: Propagated from ``get_rhyme``.
    """
    line_1 = lines[x]
    line_2 = lines[y]
    end_word_1 = line_1[-1]
    end_word_2 = line_2[-1]

    def has_pronunciation(word):
        return remove_extension(word).upper() in syllable_pronounciation

    def syllables_of(word):
        return get_syllables(word_to_index[word], dictionary, syllable_count)

    def rhyme_for(word, syllables_required):
        return get_rhyme(word, syllables_required, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary)

    # If either end word is in the banned end words list, replace it with a
    # theme word and rhyme the other line with that theme word.
    if end_word_1 in banned_end_words:
        theme_word = random.choice(keywords)
        if has_pronunciation(theme_word):
            # Look the rhyme up first: if it fails, neither line is changed.
            rhyme = rhyme_for(theme_word, syllables_of(end_word_2))
            # The theme word has to be written into the line as well;
            # otherwise the banned ending stays and the pair does not rhyme.
            line_1[-1] = theme_word
            line_2[-1] = rhyme
    elif end_word_2 in banned_end_words:
        theme_word = random.choice(keywords)
        if has_pronunciation(theme_word):
            rhyme = rhyme_for(theme_word, syllables_of(end_word_1))
            line_2[-1] = theme_word
            line_1[-1] = rhyme
    elif has_pronunciation(end_word_1):
        line_2[-1] = rhyme_for(end_word_1, syllables_of(end_word_2))
    return [line_1, line_2]
# Generate lines with the rhyming scheme ABABCDCDEFEFGG (without theme)
def generate_lines(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary):
    """Generate ``number_lines`` unthemed lines rhymed as ABABCDCDEFEFGG.

    Args:
        model: Forwarded to ``generate_line``.
        dictionary: Vocabulary list mapping indices to words.
        word_to_index: Mapping from word to vocabulary index.
        rhyming_model: Forwarded to ``couplet``.
        syllable_count: Mapping from upper-cased word to syllable count.
        syllable_pronounciation: Forwarded to ``couplet``.
        rhyming_dictionary: Forwarded to ``couplet``.

    Returns:
        The fourteen lines in poem order, each a list of words.
    """
    lines = [generate_line(model, dictionary, word_to_index, syllable_count)
             for _ in range(number_lines)]

    # Index pairs of the lines that have to rhyme with each other.
    rhyme_pairs = ((0, 2), (1, 3), (4, 6), (5, 7), (8, 10), (9, 11), (12, 13))
    placed = {}
    for first, second in rhyme_pairs:
        rhymed = couplet(first, second, lines, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary)
        placed[first] = rhymed[0]
        placed[second] = rhymed[1]
    return [placed[position] for position in sorted(placed)]
# Generate lines with the rhyming scheme ABABCDCDEFEFGG (with theme)
def generate_lines_theme(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary, keywords, keywords_in_dictionary):
    """Generate ``number_lines`` themed lines rhymed as ABABCDCDEFEFGG.

    Args:
        model: Forwarded to ``generate_line``.
        dictionary: Vocabulary list mapping indices to words.
        word_to_index: Mapping from word to vocabulary index.
        rhyming_model: Forwarded to ``couplet_theme``.
        syllable_count: Mapping from upper-cased word to syllable count.
        syllable_pronounciation: Forwarded to ``couplet_theme``.
        rhyming_dictionary: Forwarded to ``couplet_theme``.
        keywords: Theme words, forwarded to ``couplet_theme``.
        keywords_in_dictionary: Theme words present in the vocabulary,
            forwarded to ``generate_line``.

    Returns:
        The fourteen lines in poem order, each a list of words.
    """
    lines = [generate_line(model, dictionary, word_to_index, syllable_count, keywords_in_dictionary)
             for _ in range(number_lines)]

    # Index pairs of the lines that have to rhyme with each other.
    rhyme_pairs = ((0, 2), (1, 3), (4, 6), (5, 7), (8, 10), (9, 11), (12, 13))
    placed = {}
    for first, second in rhyme_pairs:
        rhymed = couplet_theme(first, second, lines, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary, keywords)
        placed[first] = rhymed[0]
        placed[second] = rhymed[1]
    return [placed[position] for position in sorted(placed)]
# Generate a poem with no theme
def generate_poem(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary):
    """Generate a poem without a theme.

    Thin wrapper that forwards every argument to ``generate_lines``.

    Returns:
        The poem as returned by ``generate_lines``: a list of lines, each a
        list of words.
    """
    return generate_lines(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary)
# Generate a poem with a given theme
# Find words related to the given theme using the Datamuse API
def _related_keywords(seed_word, max_keywords, dictionary, syllable_count):
    """Collect theme keywords for ``seed_word`` from the Datamuse API.

    Queries ``api.datamuse.com/words?ml=<seed_word>`` and considers the seed
    word itself followed by every returned word whose ``tags`` contain
    ``'n'``. A candidate becomes a keyword when its upper-cased form is in
    ``syllable_count``; scanning stops once more than ``max_keywords``
    keywords have been collected.

    NOTE(review): the indentation of the original counter increment was lost
    in this copy of the file; it is assumed here to count every accepted
    keyword -- confirm against the upstream source.

    Args:
        seed_word: Word to look up related words for.
        max_keywords: Scanning stops once this many keywords is exceeded.
        dictionary: Vocabulary list.
        syllable_count: Mapping from upper-cased word to syllable count.

    Returns:
        ``(keywords, keywords_in_dictionary)``; the second list holds the
        keywords that are also present in ``dictionary``.
    """
    # Local imports keep this block self-contained; ``urllib.parse`` is
    # available on Python 2 as well through ``install_aliases()``.
    from contextlib import closing
    from urllib.parse import quote_plus

    # Escape the seed word so spaces or reserved characters cannot break or
    # alter the query string.
    url = "http://api.datamuse.com/words?ml=" + quote_plus(seed_word)
    with closing(urlopen(url)) as response:
        data = json.load(response)

    candidates = [seed_word]
    # Entries without a 'tags' field are skipped instead of raising KeyError.
    candidates.extend(entry['word'] for entry in data if 'n' in entry.get('tags', ()))

    keywords = []
    keywords_in_dictionary = []
    for word in candidates:
        if word.upper() in syllable_count:
            keywords.append(word)
            if word in dictionary:
                keywords_in_dictionary.append(word)
        if len(keywords) > max_keywords:
            break
    return keywords, keywords_in_dictionary


def generate_poem_theme(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary, theme):
    """Generate a poem around ``theme``.

    Words related to the theme are fetched from the Datamuse API. If none of
    them is in the vocabulary, a message is printed and an unthemed poem is
    generated instead.

    Args:
        model: Forwarded to the line generators.
        dictionary: Vocabulary list mapping indices to words.
        word_to_index: Mapping from word to vocabulary index.
        rhyming_model: Forwarded to the line generators.
        syllable_count: Mapping from upper-cased word to syllable count.
        syllable_pronounciation: Forwarded to the line generators.
        rhyming_dictionary: Forwarded to the line generators.
        theme: The theme word.

    Returns:
        ``(poem, keywords)``: the poem as a list of lines and the theme
        keywords that were collected.
    """
    keywords, keywords_in_dictionary = _related_keywords(theme, 8, dictionary, syllable_count)
    if not keywords_in_dictionary:
        print("Unfortunately no poem could be generated using the theme \"" + theme + "\". Generating a standard poem instead...")
        poem = generate_lines(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary)
    else:
        poem = generate_lines_theme(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary, keywords, keywords_in_dictionary)
    return poem, keywords
# Generate a poem with a given image
# Find words related to each keyword obtained from the image using the Datamuse API
def generate_poem_image(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary, image_words):
    """Generate a poem around the keywords extracted from an image.

    Related words are fetched from the Datamuse API for every entry of
    ``image_words`` and pooled. If none of them is in the vocabulary, a
    message is printed and an unthemed poem is generated instead.

    Args:
        model: Forwarded to the line generators.
        dictionary: Vocabulary list mapping indices to words.
        word_to_index: Mapping from word to vocabulary index.
        rhyming_model: Forwarded to the line generators.
        syllable_count: Mapping from upper-cased word to syllable count.
        syllable_pronounciation: Forwarded to the line generators.
        rhyming_dictionary: Forwarded to the line generators.
        image_words: Iterable of keywords describing the image.

    Returns:
        ``(poem, keywords)``: the poem as a list of lines and the pooled
        theme keywords.
    """
    keywords = []
    keywords_in_dictionary = []
    for image_word in image_words:
        found, found_in_dictionary = _related_keywords(image_word, 6, dictionary, syllable_count)
        keywords.extend(found)
        keywords_in_dictionary.extend(found_in_dictionary)
    if not keywords_in_dictionary:
        print("Unfortunately no poem could be generated based on the image. Generating a standard poem instead...")
        poem = generate_lines(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary)
    else:
        poem = generate_lines_theme(model, dictionary, word_to_index, rhyming_model, syllable_count, syllable_pronounciation, rhyming_dictionary, keywords, keywords_in_dictionary)
    return poem, keywords
# Print out the poem
def print_poem(poem):
    """Print the poem, one line per row, with words separated by spaces.

    The poem itself is iterated, so every line it contains is printed and a
    poem shorter than the configured line count no longer raises
    ``IndexError``.

    Args:
        poem: Iterable of lines, each an iterable of word strings.
    """
    for line in poem:
        print(" ".join(line))