In [1]:
import numpy as np
import csv
import pandas as pd
import re
import os
import requests
from collections import Counter
from num2words import num2words
from nltk.corpus import cmudict
cmudict = cmudict.dict()
import copy
import torch
import json
import TextAnalyzer as ta

In [2]:
def gethaikus(text):
    haikus = pd.Series(text.split("\n\n"))
    return haikus


source_root = "Haikus"

# Source 1
text = open(source_root + "/haikuzao.txt", "r").read()
text = text.lower()
haikus = gethaikus(text)
# Source 2
gutenberg = pd.read_csv(source_root + "/gutenberg.csv")
haikus = haikus.append(
    pd.Series(gutenberg["haiku"]).apply(lambda x: x.lower())
)
# Source 3
modern_renaissance = pd.read_csv(source_root + "/modern_renaissance.csv")
# make lower case and ensure that the new line notation is the same
modern_renaissance = pd.Series(modern_renaissance["content"]).apply(
    lambda x: x.lower().replace("\r\n", "\n")
)
haikus = haikus.append(modern_renaissance)
# Source 4
sballas = pd.read_csv(source_root + "/sballas8.csv", header=None)
haikus = haikus.append(pd.Series(sballas[0]))
# Source 5
temps = pd.read_csv(source_root + "/tempslibres.csv", encoding="ISO-8859-1")
# Only English
temps = temps[temps["lang"] == "en"]
# make lower case and ensure that the new line notation is the same
haikus = haikus.append(
    pd.Series(temps["haiku"]).apply(lambda x: x.lower().replace("\r\n", "\n"))
)
# Source 6
hjson = pd.read_json(source_root + "/unim_poem.json")
haikus = haikus.append(pd.Series(hjson["poem"])).reset_index(drop=True)

In [3]:
# Clean words with text format class
formatter = ta.TextFormat()

def cleanwords(text):
    text = text.replace("-", " ")
    text = text.replace("'", "")
    text = formatter.numtotext(text)
    return text


haikus = haikus.apply(cleanwords)

In [4]:
# Import CMUDICT and use it to instantiate a transformer class
ptransformer = ta.PhonemeTransform(cmudict)
# Get Full text and list of Words
text = formatter.arraytotext(haikus)
words = formatter.getwords(text)
# Find unknownwords
unknowns = ptransformer.getunknowns(words)
# Split unknown words to find two words within one; add to dictionary
ptransformer.addsplitwords(unknowns)

In [5]:
# takes in haikus and separates haikus that have words not in the CMUDICT
def splithaikus(haikus, pdict):

    bad_haikus = []
    good_haikus = []

    for haiku in haikus:
        words = formatter.getwords(haiku)
        if all(word in pdict.keys() for word in words):
            good_haikus.append(haiku)
        else:
            bad_haikus.append(haiku)

    return bad_haikus, good_haikus


bad_haikus, valid_haikus = splithaikus(haikus, ptransformer.phodict)

In [6]:
haikus_transformed = pd.Series(map(ptransformer.transform, valid_haikus))
# Convert all syllables to 0
haikus_transformed = pd.Series(
    map(ptransformer.convertsyllables, haikus_transformed)
)

In [7]:
# Create Inverted Phoneme Dictionary
idict = ptransformer.invertdictionary()
# Use Inverted Phoneme Dictionary and Regular Dictionary to instantiate class
rtransformer = ta.PhonemeReverse(ptransformer.phodict, idict)

In [8]:
# Return Haikus Back to Text
haiku_text = formatter.arraytotext(haikus_transformed)
# Get all words
words = formatter.getwords(haiku_text)
# Get syllables for each word
syllables = words.apply(ptransformer.syllablecount)
# zip lists together
syll_list = [*zip(words, syllables)]

In [9]:
# Convert list of words into 5-7-5 snippets
haikus17 = []
while len(syll_list) > 17:
    cum = 0
    haiku = []
    
    for i,tup in enumerate(syll_list):
        cum = cum + tup[1]
        if cum < 17:      
            haiku.append(syll_list.pop(i)[0])

        elif cum == 17:
            haiku.append(syll_list.pop(i)[0])
            haiku.append("\n")
            haikus17.append(haiku)
            break
            
        else:
            cum = cum - tup[1]
            continue

In [10]:
# Check to see if Haikus are 17 syllables structure
haikus17[len(haikus17) - 1]

['OW0NLIY0',
 'DAY0VER0Z',
 'DEH0SAH0LAH0T',
 'DHAH0',
 'DHAH0',
 'RAY0T',
 'JHEH0NTLIY0',
 'TUW0',
 'IH0KWEY0TER0',
 'NAY0TS',
 '\n']

In [11]:
rtransformer.getenglish(
    formatter.arraytotext(haikus17[len(haikus17) - 1]), runs=2
)

['only',
 'divers',
 'desolate',
 'the',
 'the',
 'right',
 'gently',
 'to',
 'equator',
 'knights']

In [13]:
# Output Haikus to CSV so transformations do not have to be ran again
haiku_series = pd.Series(haikus17)
haiku_series = haiku_series.apply(formatter.arraytotext)
haiku_series.to_csv(
    "Haikus/PhonemeHaikusStructured.csv", index=False, header=False
)
# Also output dictionaries so they do not have to be recreated when testing
# CMUDICT with added split words
with open("pdict.json", "w") as fp:
    json.dump(ptransformer.phodict, fp)

# Inverse of CMUDICt with added split words
with open("idict.json", "w") as fp:
    json.dump(idict, fp)