In [1]:
import numpy as np
import csv
import pandas as pd
import re
import os
import requests
from collections import Counter
from num2words import num2words
from nltk.corpus import cmudict
import copy
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
#%load_ext line_profiler
cmudict = cmudict.dict()
# Load pre-trained BERT model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create Functions to Assist in Analysis

In [2]:
# Split text into words
def getwords(text):
    words = pd.Series(re.findall(r"[\w']+", text))
    return words
# Split text into individual haikus
def gethaikus(text):
    haikus = pd.Series(text.split("\n\n"))
    return haikus
# Convert an array of words to a single string
def arraytotext(arr):
    text = ' '.join(arr)
    return text
# Find words that are not located in the cmudict phoenetic dictionary
def getunknowns(words):
    unknown_words = np.array([word for word in words if word.lower() not in cmudict.keys()])
    return unknown_words
# Gets frequency of word use in a given list of words
def wordcount(unknown_words):
    prob_word_freq = pd.Series(Counter(unknown_words)).sort_values(ascending=False)
    return prob_word_freq

# Import All Haikus in the Same Format

In [3]:
source_root = '../Haikus'

# Source 1
text = open(source_root+'/haikuzao.txt', 'r').read()
text = text.lower()
haikus = gethaikus(text)
# Source 2
gutenberg = pd.read_csv(source_root+'/gutenberg.csv')
haikus = haikus.append(pd.Series(gutenberg['haiku']).apply(lambda x: x.lower()))
# Source 3
modern_renaissance = pd.read_csv(source_root+'/modern_renaissance.csv')
# make lower case and ensure that the new line notation is the same
modern_renaissance = pd.Series(modern_renaissance['content']).apply(lambda x: x.lower().replace("\r\n", "\n"))
haikus = haikus.append(modern_renaissance)
# Source 4
sballas = pd.read_csv(source_root+'/sballas8.csv', header=None)
haikus = haikus.append(pd.Series(sballas[0]))
# Source 5
temps = pd.read_csv(source_root+'/tempslibres.csv', encoding = "ISO-8859-1")
# Only English
temps = temps[temps['lang']=='en']
# make lower case and ensure that the new line notation is the same
haikus = haikus.append(pd.Series(temps['haiku']).apply(lambda x: x.lower().replace("\r\n", "\n")))
# Source 6
hjson = pd.read_json(source_root+'/unim_poem.json')
haikus = haikus.append(pd.Series(hjson['poem'])).reset_index(drop=True)

In [36]:
# Get words from haikus and determine whih ones do not exist in 
text = arraytotext(haikus)
unknown_words = getunknowns(getwords(text))
wordcount(unknown_words).head(30)

youll         466
didnt         418
iii           417
_             378
nbsp          358
doesnt        279
couldnt       258
aught         248
isnt          235
colours       185
heavn         184
wasnt         181
theyll        181
aint          173
twixt         168
wouldnt       144
whateer       123
beauteous     121
splendour     120
gie           116
loveliness    113
evry          112
theyd         100
instr          96
whereer        94
wretch         91
lovd           89
theyve         87
neighbours     87
nokomis        86
dtype: int64

In [42]:
unknown_words

array(['cuisses', 'lespace', 'morceau', ..., 'doesnt', 'didnt', 'wifi'],
      dtype='<U127')

# Clean Words

In [5]:
# Remove unwanted characters
def cleanwords(text):    
    #Clean dashes
    text = text.replace('-'," ")
    #Clean apostrophe
    text = text.replace('\'',"")
    # Clean numbers
    words = getwords(text)
    for word in words:
        if word.isdigit():
            text = re.sub(rf'\b{word}\b',num2words(word),text)
    
    return text

In [6]:
# Clean Haikus
haikus = haikus.apply(cleanwords)

  This is separate from the ipykernel package so we can avoid doing imports until


# Convert Words to Phonemes

In [46]:
# Convert plain text to Phoenetic
def getphoneme(word):
    phoneme = ''.join(cmudict[word][0]) # always use first pronuciations at index 0
    return phoneme

# Finds unknown words that can be split into two words
def findwordsplits(unknown_words):
    splitwords = {}
    for word in unknown_words:
        wordlength = len(word)
        for i in range(wordlength):
            split1 = word[0:i+1]
            split2 = word[i+1:wordlength]

            if split1 in cmudict.keys():
                if split2 in cmudict.keys():
                    splitwords[word] = [split1,split2]      
    return splitwords
splitwords = findwordsplits(set(unknown_words))

In [45]:
# inspect what words have single letters as vowels to determine appropriate sounds
vowels = ['a', 'e', 'i','o', 'u', 'y']
splitvowlels = []
for i in splitwords:
    if (len(splitwords[i][0]) ==1 or len(splitwords[i][1]) ==1):
        if(splitwords[i][0] in vowels or splitwords[i][1] in vowels):
            splitvowlels.append(splitwords[i])
splitvowlels[0:20]

[['oo', 'o'],
 ['atter', 'y'],
 ['singul', 'o'],
 ['git', 'e'],
 ['pianger', 'e'],
 ['vii', 'i'],
 ['e', 'letto'],
 ['hetend', 'e'],
 ['sublunar', 'y'],
 ['indi', 'o'],
 ['wilf', 'u'],
 ['voler', 'i'],
 ['secg', 'e'],
 ['tov', 'e'],
 ['yes', 'e'],
 ['villan', 'y'],
 ['u', 'ensis'],
 ['y', 'cleane'],
 ['bursat', 'i'],
 ['u', 'omo']]

In [9]:
# handle words with single letters separately
# Note: this may not always be accruacte but should approximate well enough for the intended goal
def convertsplitwords(splitwords):
    worddict = copy.deepcopy(splitwords)
    special_cases = { 
        'a':'AH0',
        'i': 'IY0',
        'o': 'OW0',
        'u' : 'UW0',
        'y':'IY0',
        's': 'Z'
    }
    
    for i in worddict:
        
        firstword = worddict[i][0]
        secondword = worddict[i][1]

        # If the first word is a single letter then make it all caps
        if (len(firstword)==1):
            firstword = firstword.upper()
            
        # Otherwise use CMUDICT to convert
        else:
            firstword = getphoneme(firstword)

        # If the second word is a single letter then apply the special cases where applicable
        if (len(secondword)==1):
            
            # If letter is a special case replace it
            if secondword in special_cases.keys():
                secondword = pd.Series(worddict[i]).replace(special_cases)[1]
                
            # Otherwise make it all caps
            else:
                secondword = secondword.upper()
                
       # Otherwise use CMUDICT to convert
        else:
            secondword = getphoneme(secondword)
            
        # added nested list to match format of CMUDICT
        worddict[i] = [[firstword,secondword]]   
        
    return worddict

# takes in haikus and separates haikus that have words not in the CMUDICT
def splithaikus(haikus):
    
    bad_haikus = []
    good_haikus = []
    
    for haiku in haikus:
        words = getwords(haiku)
        if all(word in cmudict.keys() for word in words):
            good_haikus.append(haiku)
        else:
            bad_haikus.append(haiku)
                        
    return bad_haikus, good_haikus

# Takes in a haiku and transforms it into the equivalent phoneme version
def haikutransform(haiku):
    words = getwords(haiku)
    try:
        phonemes = [* map(getphoneme,words)]
        phonemes = ' '.join(phonemes)
    except:
        raise ValueError('A word in the Haiku was not in the CMUDICT.' \
        ' Make sure only valid haikus are used for this function.')
        return
        
    return phonemes



In [10]:
# add new word phoenetics to cmudict
cmudict = {**cmudict ,**convertsplitwords(splitwords)}
# split haikus into a usuable set and a set that can be further inspected for transformations
bad_haikus, valid_haikus = splithaikus(haikus)
# transform the good haikus into phonemes
haikus_transformed = pd.Series(map(haikutransform,valid_haikus))
# Convert all syllables to 0
haikus_transformed = pd.Series(map(convertsyllables,haikus_transformed))

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# Check to see if we didnt lose any haikus
len(bad_haikus) + len(haikus_transformed) == len(haikus)

True

# Create Function to Convert Back to Regular English

In [12]:
# Invert the cmudict so that we can transform from a phoneme to english
def invertdictionary(cmudict):
    
    idict = {}
    
    for word in cmudict:
        # Create a list to hold all of the possible words associated with a phoneme
        p_list = []
        
        phoneme = ''.join(cmudict[word][0]) # always use first phoneme for a word
        phoneme = convertsyllables(phoneme)
        
        # if the phoneme already exists add it to that list
        if phoneme in idict.keys():
            p_list = idict[phoneme]
            p_list.append(word)
            
        # Otherwise create a new list
        else:
            p_list.append(word)
           
        idict[phoneme] = p_list
    return idict

In [13]:
# Create Inverted Phoneme Dictionary
idict = invertdictionary(copy.deepcopy(cmudict))

In [14]:
def syllablecount(phoneme):
    count = 0
    for letter in phoneme:
        if letter.isdigit():
            count = count + 1
    return count

def getwordarray(phoneme):
    try:
        words = idict[phoneme]
    # if there are no words in the dictionary then create a flag to handle later
    except:
        print("BadPhoneme")
        syllables = syllablecount(phoneme)
        words = ['',syllables]
    return words

In [15]:
# A function that takes in haikus and predicts what words should be used in the list. The algorithm works as such:
# 1) Create a sentence using a "naive" prediction which simply uses the first word in the array
# 2) Iterate through the phonemes in the sentence, if it only has one option than use it, else move to step 3
# 3) Use BERT a bi-directional NLP package to create a similarity matrix of most likely words in the sentence
# 4) Join array of similar words with possible words associated with the phoneme
# 5) If the array is empty default to the first word, else pick the highest ranked word
# 6) Add the word to the list and move to the next phoneme 

def getenglish(haiku):
    english_haiku = []
    # use getwords to get phonemes
    phonemes = getwords(haiku)
    # get 2D array of lists of possibilities for each word
    wordsarray = phonemes.apply(getwordarray)
    # Get a baseline to use predictions off of by taking the first word for each list
    baseline ='[CLS] '
    baseline = baseline + ' '.join(wordsarray.apply(lambda x: x[0])) # Use first word
    baseline = baseline + ' [SEP]'
    for index, word in enumerate(wordsarray):
        if len(word)>1 :
            try:                
                    #Replace word with mask
                sentence = re.sub(word[0],'[MASK]',baseline)

                tokenized_text = tokenizer.tokenize(sentence)
                indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

                # Create the segments tensors.
                segments_ids = [0] * len(indexed_tokens)

                # Convert inputs to PyTorch tensors
                tokens_tensor = torch.tensor([indexed_tokens])
                segments_tensors = torch.tensor([segments_ids])

                # Predict all tokens
                with torch.no_grad():
                    predictions = model(tokens_tensor, segments_tensors)

                predicted_words = pd.DataFrame(predictions[0,index])
                predicted_words['Word'] = [tokenizer.convert_ids_to_tokens([x])[0] for x in range(len(predicted_words))]
                # If its an actual word then merget the predicted words with the word array and grab the
                # highest ranked word
                if type(word[1]) != int :
                    # Create a word dataframe to merge with
                    wordlist = pd.DataFrame(word,columns=['Word'])
                    best_word = pd.merge(predicted_words,wordlist, on ='Word').sort_values(0, ascending= False).loc[0,'Word']

                # If the word is a number then that means there was are no english words associated with the phoneme
                # So instead use the highest ranked word with the same syllable count
                else:
                    # The second value is the syllable count created by the get word array function
                    syllabes = word[1]
                    #Sort Predicted words
                    predicted_words.sort_values(0, ascending= False, inplace = True)
                    #Loop through and find the first word with the same syllable count
                    for word in predicted_words['Word']:
                        # Try to get syllable count of the word
                        try:
                            pred_syllable = syllablecount(getphoneme(word))
                            if pred_syllable == syllabes:
                                best_word = word
                                break
                        # If the word doesnt exist then skip that word
                        except:
                            continue

                    # If the word still wasnt found then grab the best word
                    # Noted: This will create an error in the syllable co
                    if type(word[0]) == int:
                        best_word = predicted_words.loc[0,'Word']
                # If BERT is unable to predict word then use the first word
            except:
                best_word=word[0]
                
        else:
            best_word=word[0]
            
        english_haiku.append(best_word)
    return english_haiku

In [22]:
# Check to see if the function correctly tranforms phonemes back to english correctly
print(arraytotext(getenglish(haikus_transformed[0])))
print(valid_haikus[0])

a skein of birds twines across the sky the northbound train departs
a skein of birds
twines across the sky
the northbound train departs


# Format Haikus w/ Correct Syllable Structure

In [17]:
# Count Syllables 
df_syll = pd.DataFrame(zip(haikus_transformed,
    haikus_transformed.apply(syllablecount)))

In [18]:
# We need poems with 17 syllables (5 + 7 + 5)
df_17_syll = df_syll[df_syll[1]==17]
print(len(df_17_syll),len(arraytotext(df_17_syll[0])))

1816 172055


This is an extremley dissapointing result. 1890 haikus wont even scratch the surface of the amount of text needed for an RNN to learn a 5-7-5 syllable structure. 

However, the goal of the project is to create an RNN model that can understand syllables and learn enlgish as phoenemes. Although some context will be loss, these goals can be met by treating all haikus as one single text and then split into several 17 syllables poems. If the model can learn 

In [25]:
# Return Haikus Back to Text
haiku_text = arraytotext(haikus_transformed)
# Get all words
words = getwords(haiku_text)
# Get syllables for each word
syllables = words.apply(syllablecount)
# zip lists together
syll_list = [*zip(words,syllables)]     
len(syll_list)

2449369

In [30]:
# Convert list of words into 5-7-5 snippets
haikus17 = []
while len(syll_list) > 17:
    cum = 0
    haiku = []
    
    for i,tup in enumerate(syll_list):
        cum = cum + tup[1]
        if cum < 17:      
            haiku.append(syll_list.pop(i)[0])

        elif cum == 17:
            haiku.append(syll_list.pop(i)[0])
            haiku.append('\n')
            haikus17.append(haiku)
            break
            
        else:
            cum = cum - tup[1]
            continue

In [27]:
# Check to see if Haikus are 17 syllables structure
haikus17[len(haikus17)-1]

['AH0',
 'STIH0L',
 'BAY0',
 'DAA0NGKIY0Z',
 'PEY0N',
 'AA0N',
 'PAE0N',
 'GAA0D',
 'DAA0RKAH0NIH0NG',
 'AH0',
 'AH0V',
 'BRAY0TLIY0',
 'MUW0N',
 '\n']

In [28]:
getenglish(arraytotext(haikus17[len(haikus17)-1]))

['a',
 'still',
 'by',
 'donkeys',
 'pain',
 'on',
 'pan',
 'god',
 'darkening',
 'a',
 'of',
 'brightly',
 'moon']

In [35]:
# Text looks good; Output it to CSV so transformations do not have to be ran again
haiku_series = pd.Series(haikus17)
haiku_series = haiku_series.apply(arraytotext)
haiku_series.to_csv('../Haikus/PhonemeHaikusStructured.csv',index=False, header=False)

In [33]:
len(haiku_series)

185880