In [160]:
import numpy as np
import random
import models
from collections import Counter


In [138]:
def obs_map(tokens, vocab_list):
    """Translate a sequence of words into the integer ids the HMM consumes.

    Each token is replaced by its position in ``vocab_list``. A token that is
    not in the vocabulary raises ``ValueError`` (propagated from ``list.index``).
    """
    return [vocab_list.index(word) for word in tokens]


In [139]:
# Load the Shakespeare corpus as a list of poems; each poem is a list of
# lines and each line is ["<START>", word, ..., "<STOP>"] in lower case.
with open("data/shakespeare.txt", "r") as f:  # context manager closes the handle
    text = f.read()
poems = text.split("\n\n")
shakesphere_poems = []
for i, poem in enumerate(poems):
    # Surround punctuation (and newlines) with spaces so that every mark
    # becomes its own token after splitting on " ".
    for mark in (",", ":", "?", ";", "!", "\n", "."):
        poem = poem.replace(mark, " " + mark + " ")
    # Parentheses are dropped entirely rather than tokenised.
    poem = poem.replace("(", "").replace(")", "")

    lines = poem.split("\n")
    # NOTE(review): the first chunk drops one leading line and every later
    # chunk drops two — presumably the sonnet-number header (plus a blank line
    # left over from the split); confirm against data/shakespeare.txt.
    lines = lines[1:] if i == 0 else lines[2:]

    poem_lines = []
    for line in lines:
        # Empty fragments come from the padding above; purely numeric tokens
        # are discarded as well.
        tokens = [x.lower() for x in line.split(" ") if x != "" and not x.isdigit()]
        poem_lines.append(["<START>"] + tokens + ["<STOP>"])
    shakesphere_poems.append(poem_lines)


In [140]:
# Load the Spenser corpus in the same shape as the Shakespeare one: a list of
# poems, each a list of ["<START>", word, ..., "<STOP>"] lines in lower case.
with open("data/spenser.txt", "r") as f:  # context manager closes the handle
    text = f.read()
poems = text.split("\n\n")
spenser_poems = []
for i, poem in enumerate(poems):
    # NOTE(review): even-indexed chunks are skipped — presumably they hold the
    # poem headings rather than verse; confirm against data/spenser.txt.
    if i % 2 == 0:
        continue
    # Surround punctuation (and newlines) with spaces so that every mark
    # becomes its own token after splitting on " ".
    for mark in (",", ":", "?", ";", "!", "\n", "."):
        poem = poem.replace(mark, " " + mark + " ")
    # Parentheses are dropped entirely rather than tokenised.
    poem = poem.replace("(", "").replace(")", "")

    poem_lines = []
    for line in poem.split("\n"):
        # Empty fragments come from the padding above; purely numeric tokens
        # are discarded as well.
        tokens = [x.lower() for x in line.split(" ") if x != "" and not x.isdigit()]
        poem_lines.append(["<START>"] + tokens + ["<STOP>"])
    spenser_poems.append(poem_lines)


In [141]:
# Corpus used for training. Only the Shakespeare poems are enabled; the
# Spenser poems can be added by un-commenting the second extend.
all_poems = []
all_poems.extend(shakesphere_poems)
# all_poems.extend(spenser_poems)

# Same poems, but every line starts with its last word and reads backwards,
# so a model trained on them can be seeded with the desired end word.
all_poems_backwards = []
for poem in all_poems:
    reversed_lines = []
    for line in poem:
        words = line[1:-1]  # drop the <START>/<STOP> sentinels
        # Remove a single trailing punctuation token. One-letter words such
        # as "i", "a" or "o" are alphanumeric and are kept as end words.
        if words and len(words[-1]) == 1 and not words[-1].isalnum():
            words = words[:-1]
        # A blank (or punctuation-only) line has no end word to seed from;
        # skip it instead of failing on an empty list.
        if not words:
            continue
        reversed_lines.append(words[::-1])
    all_poems_backwards.append(reversed_lines)


In [142]:
# Flatten every reversed line into one long stream of words.
mega_list = [word for poem in all_poems_backwards for line in poem for word in line]
vocab_counts = Counter(mega_list)

# Vocabulary ordered from most to least frequent. The order is produced by a
# stable ascending sort that is then flipped (rather than reverse=True) so
# that words with equal counts keep exactly the same relative positions, and
# therefore the same integer ids.
ascending = sorted(vocab_counts.items(), key=lambda item: item[1])
vocab_list = [word for word, _count in ascending][::-1]

# One integer sequence per line, across all poems, for HMM training.
tokenized_poems = [
    obs_map(line, vocab_list) for poem in all_poems_backwards for line in poem
]


In [143]:
# Pairs of line indices whose end words are treated as rhyming with each
# other within a 14-line poem.
patterns = [[0, 2], [1, 3], [4, 6], [5, 7], [8, 10], [9, 11], [12, 13]]

# List of disjoint sets; every set collects end words observed to rhyme.
rhyme_dict = []

for poem in all_poems_backwards:
    # The index pairs above only make sense for poems of exactly 14 lines.
    if len(poem) != 14:
        continue
    for first, second in patterns:
        # Lines are stored reversed, so element 0 is the line's end word.
        merged = {poem[first][0], poem[second][0]}
        untouched = []
        for rhymes in rhyme_dict:
            # Any existing set sharing a word with the pair belongs to the
            # same rhyme class. Folding all such sets together keeps the
            # sets disjoint even when the pair bridges two earlier classes.
            if rhymes & merged:
                merged |= rhymes
            else:
                untouched.append(rhymes)
        rhyme_dict = untouched + [merged]


In [144]:
# Fit an unsupervised HMM on the integer-encoded, reversed lines.
# NOTE(review): 16 and 20 are presumably the number of hidden states and the
# number of training iterations (the captured log ends at "Iteration 20") —
# confirm against models.unsupervised_HMM.
model = models.unsupervised_HMM(tokenized_poems, 16, 20)


Unsupervised Learning Begins
Iteration 20  - And Input 2154

In [153]:
# Vocabulary with one trailing apostrophe stripped from each word, kept
# index-aligned with vocab_list so positions can be used as word ids.
modified_vocab_list = [
    vocab[:-1] if vocab.endswith("'") else vocab for vocab in vocab_list
]

# First position of every stripped word (same answer list.index would give),
# built once so the per-line lookups below are constant time.
vocab_index = {}
for position, vocab in enumerate(modified_vocab_list):
    vocab_index.setdefault(vocab, position)

with open("data/Syllable_dictionary.txt", "r") as f:  # closes the handle
    text = f.read()
lines = text.split("\n")

word_to_syllables = dict()  # word id -> syllable count
syllables_to_word = dict()  # syllable count -> list of word ids
for line in lines:
    words = line.split(" ")
    # Ignore dictionary entries (and blank lines) that are not in the vocabulary.
    if words[0] not in vocab_index:
        continue
    word = vocab_index[words[0]]
    # Use the last field as the count unless it carries an "E" marker, in
    # which case fall back to the field before it.
    # NOTE(review): "E" presumably marks an end-of-line-only syllable count —
    # confirm against the dictionary's documentation.
    count = words[-1]
    if "E" in count:
        count = words[-2]
    word_to_syllables[word] = int(count)
    syllables_to_word.setdefault(word_to_syllables[word], []).append(word)


In [201]:
def reverse_obs_map(tokens, vocab_list):
    """Turn integer ids back into words by indexing into ``vocab_list``."""
    return [vocab_list[index] for index in tokens]


def _pick_filler_words(missing):
    """Pick random word ids whose syllable counts add up to ``missing``.

    Reads the module-level ``syllables_to_word`` table. When a word with
    exactly ``missing`` syllables exists, a single such word is drawn;
    otherwise the largest available count that still fits is used repeatedly.
    Returns fewer syllables than requested (possibly an empty list) when the
    table has nothing that fits.
    """
    filler = []
    while missing > 0:
        usable = [
            count
            for count, candidates in syllables_to_word.items()
            if 0 < count <= missing and candidates
        ]
        if not usable:
            break
        count = missing if missing in usable else max(usable)
        filler.append(random.choice(syllables_to_word[count]))
        missing -= count
    return filler


def sample_line(hmm, vocab_list, end_word, end_punctuation="", max_words=10,
                target_syllables=10):
    """Generate one line of verse that ends with ``end_word``.

    The HMM is seeded with the end word and its emission is reversed, so the
    line reads forwards and finishes on ``end_word``. The line is then trimmed
    from the front to ``target_syllables`` syllables, using the module-level
    ``word_to_syllables`` table (words without an entry count as 0 syllables).
    If trimming leaves the line short, random filler words are prepended.

    Args:
        hmm: object whose ``generate_emission(max_words, start_token)``
            returns an ``(emission, states)`` pair.
        vocab_list: vocabulary; a word's position is its integer id.
        end_word: word the line must end with; must be in ``vocab_list``.
        end_punctuation: text appended verbatim to the finished line.
        max_words: passed through to ``hmm.generate_emission``.
        target_syllables: desired syllable total for the line.

    Returns:
        The formatted line as a string.

    Raises:
        ValueError: if ``end_word`` is not in ``vocab_list``.
    """
    start_token = obs_map([end_word], vocab_list)[0]
    emission, _states = hmm.generate_emission(max_words, start_token)
    sentence = list(emission[::-1])
    sentence.append(start_token)

    syllables = [word_to_syllables.get(word, 0) for word in sentence]
    # sums[i] is the syllable total of sentence[i:].
    sums = [sum(syllables[i:]) for i in range(len(syllables))]
    for i in range(len(sums)):
        # A word contributing 0 syllables (e.g. punctuation) should not open
        # the line, so step past it. The bounds check keeps the look-ahead
        # from running off the end at the last position.
        if i + 1 < len(sums) and sums[i] == sums[i + 1]:
            i += 1
        if sums[i] == target_syllables:
            sentence = sentence[i:]
            break
        if sums[i] < target_syllables:
            # Too short: pad at the front with words that supply the
            # missing syllables.
            sentence = _pick_filler_words(target_syllables - sums[i]) + sentence[i:]
            break

    words = reverse_obs_map(sentence, vocab_list)
    output = " ".join(words).capitalize()
    # Capitalise the pronoun wherever it stands alone, including at the very
    # end of the line where it has no trailing space.
    output = " ".join("I" if word == "i" else word for word in output.split(" "))
    # Re-attach punctuation tokens to the word before them.
    for mark in (".", "?", ",", ":", ";", "!"):
        output = output.replace(" " + mark, mark)
    return output + end_punctuation


In [212]:
# Only rhyme sets with at least two words can supply two different end words.
# Sorting makes the candidate order independent of set iteration order.
rhyme_groups = [sorted(group) for group in rhyme_dict if len(group) >= 2]
if not rhyme_groups:
    raise ValueError("rhyme_dict contains no set with two or more words")

# Print 20 generated 14-line poems.
for _ in range(20):
    sample_poem = [""] * 14

    for first, second in patterns:
        # Two distinct words drawn from one randomly chosen rhyme set.
        end_word1, end_word2 = random.sample(random.choice(rhyme_groups), 2)

        end_punc1 = random.choice(["", ",", ":", "."])
        end_punc2 = random.choice(["", ",", ":", "."])
        # The final line of the poem always closes with a full stop.
        if second == 13:
            end_punc2 = "."

        sample_poem[first] = sample_line(model, vocab_list, end_word1, end_punc1)
        sample_poem[second] = sample_line(model, vocab_list, end_word2, end_punc2)

    print("\n".join(sample_poem))
    print()


Him thinking your pace rough is and be sort:
And yet so wilt roses I woe complain.
Hid you canker I burthen whose me sport,
'scaped I ugly thus grace come bastard reign,
The of and it can when know of half thief.
Sighs great which but praise is in suited sea:
Which thou niggard which say o back dost chief:
Look he you, when I no thee one bud plea:
Are to true might none stone her who for fair
Many me rigour a three living head
Thou not me oft thou with fierce to far heir:
In the and love thought turns is have this bed,
Of so against for full up as bevel.
Grant of mine fist dost my clouds do level.

In by all absence tendered thy be were.
If beweep it not shall now away hope
With her mine to knows mayst too well appear
Low buried wet the vex strength it urge scope:
Sorrows and I and they trenches away.
Nurseth older flowers thy made lover,
Lacked an twofold, then spur art shalt decay,
Years life's when at of am admire cover.
Thy another heart learn disabled:
Long star to shall the ah if