# Identifying Iambic Pentameter

## Creating Functions

In [3]:
import nltk 
from nltk.corpus import cmudict
import string
import numpy as np
import pandas as pd
from pprint import pprint
import re
from math import ceil
from random import randint

After parsing the words I decode the text, remove unnecessary punctuation, and break up hyphenated words so it can be read by the CMU dictionary

In [4]:
def clean_words(words, verbose=True):
    """Normalise raw tokens so they can be looked up in the CMU dictionary.

    Each token is reduced to ASCII, stripped of punctuation, has a trailing
    "'s", "'re" or "'d" removed together with any remaining apostrophes, and
    is split on hyphens into separate words.

    Args:
        words: iterable of tokens (normally the result of ``line.split()``).
        verbose: unused; kept so existing callers keep working.

    Returns:
        A new list with the cleaned, non-empty words in their original order.
    """
    junk = ',.:?!"_'                            # punctuation that is simply dropped
    contractions = ("'re", "'s", "'d")          # endings that add no syllable
    new_words = []
    for word in words:
        word = _to_ascii(word).strip()
        # Punctuation goes first so that a token such as "summer's," is still
        # recognised as ending in "'s".
        for j in junk:
            word = word.replace(j, '')
        # Only the trailing contraction is removed, never an inner occurrence.
        for suffix in contractions:
            if word.endswith(suffix):
                word = word[:-len(suffix)]
                break
        word = word.replace("'", '')
        # Splitting on '-' handles single hyphens, multi-part compounds and
        # '--' dashes alike; empty fragments are discarded.
        for part in word.split('-'):
            part = part.strip()
            if part:
                new_words.append(part)
    return new_words


def _to_ascii(word):
    """Return ``word`` as a native string with every non-ASCII character removed."""
    if not isinstance(word, (bytes, type(u''))):
        word = str(word)
    if isinstance(word, bytes):
        word = word.decode('utf-8', 'ignore')
    ascii_bytes = word.encode('ascii', 'ignore')
    if isinstance(ascii_bytes, str):            # Python 2: bytes is already str
        return ascii_bytes
    return ascii_bytes.decode('ascii')

In [58]:
# EXAMPLE
# Reads one line of verse from the keyboard, splits it on whitespace and cleans
# the tokens. `line`, `words` and `cleaned_words` are reused by later example cells.

line = str(raw_input('enter iambic: '))
# Non-interactive alternative:
# line = 'From fairest creatures we desire increase,'
words = line.split()
cleaned_words = clean_words(words)
print cleaned_words

enter iambic: But as the riper should by time decease,
['But', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease']


This is a representation of what is in the CMU dictionary and how I extract meter

In [7]:
# !! Not in use !!

def iambic_meter(new_words, pron_dict=None):
    """Print and collect the stress digits of every pronunciation of each word.

    Args:
        new_words: list of cleaned words.
        pron_dict: optional mapping of lower-case word -> list of
            pronunciations, each a list of phoneme strings whose vowels end in
            a stress digit. Defaults to ``cmudict.dict()``.

    Returns:
        A flat list of the stress digits found, as ints, in the order seen.
        Words with several pronunciations contribute the digits of all of them.
    """
    if pron_dict is None:
        pron_dict = cmudict.dict()              # load the CMU pronouncing dictionary
    meter = []                                  # collected stress digits
    for word in new_words:
        try:
            pronunciations = pron_dict[word.lower()]
        except KeyError:                        # only a missing word is expected here
            print(word + " ---> NOT IN CMU DICT <---")
            continue
        for pron in pronunciations:
            count = 0                           # stress digits seen in this pronunciation
            for phoneme in pron:
                for char in phoneme:
                    if char.isdigit():
                        count += 1
                        meter.append(int(char))
                        print('%s %s %s %s' % (char, word, pron, count))
    return meter

In [59]:
# EXAMPLE
# Prints one line per stress digit found (or a NOT IN CMU DICT notice) and
# displays the flat list of digits; relies on `cleaned_words` from the cell above.

iambic_meter(cleaned_words)

1 But [u'B', u'AH1', u'T'] 1
1 as [u'AE1', u'Z'] 1
1 as [u'EH1', u'Z'] 1
0 the [u'DH', u'AH0'] 1
1 the [u'DH', u'AH1'] 1
0 the [u'DH', u'IY0'] 1
riper ---> NOT IN CMU DICT <---
1 should [u'SH', u'UH1', u'D'] 1
1 by [u'B', u'AY1'] 1
1 time [u'T', u'AY1', u'M'] 1
0 decease [u'D', u'IH0', u'S', u'IY1', u'S'] 1
1 decease [u'D', u'IH0', u'S', u'IY1', u'S'] 2


[1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1]

My old function that counted syllables based on 3 rules. 1) Count the number of vowels in each word. 2) Subtract one for each diphthong. 3) Subtract one for any silent vowels.

In [9]:
# !! Not in use !!

def count_syllable(new_words, verbose=True):
    """Estimate the syllables of each word with three spelling rules.

    Rule 1: every run of consecutive vowels (a, e, i, o, u, y) counts once, so
    a diphthong is a single syllable (Rule 2). Rule 3: one syllable is
    subtracted for an ending that is treated as silent ("le"/"les",
    a trailing "e"/"es", or an "ere" inside the word). Every word is credited
    with at least one syllable.

    Args:
        new_words: list of cleaned words.
        verbose: when true, print the total syllable count of the list.

    Returns:
        A list with one syllable count per word.
    """
    vowels = 'aeiouy'
    syllable = []
    for word in new_words:
        word = word.lower()                     # the vowel test must not depend on case
        count = 0
        prev_is_vowel = False
        for char in word:
            is_vowel = char in vowels
            if is_vowel and not prev_is_vowel:  # first vowel of a vowel run
                count += 1
            prev_is_vowel = is_vowel
        if word[-2:] == 'le' or word[-3:] == 'les':
            count -= 1
        elif word[-1:] == 'e' or word[-2:] == 'es':
            # word[-1:] is the last letter; word[:-1] would be everything but it
            count -= 1
        elif 'ere' in word:
            count -= 1
        if count < 1:
            count = 1                           # a word has at least one syllable
        syllable.append(count)
    sum_syl = sum(syllable)

    if verbose:
        print(sum_syl)

    return syllable

In [10]:
# Example
# count_syllable prints the line total itself (verbose defaults to True);
# the per-word counts are printed afterwards.

line_syllables = count_syllable(cleaned_words)
print line_syllables

12
[1, 2, 2, 1, 3, 3]


I realized I was missing a number of common words. So I looked around and found that M. Emre Aydın wrote a similar function, but with common exceptions. I added my original function to his exceptions to get a much better function. Thank you M. Emre Aydın!

In [11]:
# The exception cases were taken from 'http://eayd.in/?p=232'. I found, with the modifications, this model to be more 
# effective for known words than the function I originally created.

def sylco(new_words, verbose=False):
    """Estimate the number of syllables of each word from its spelling.

    The count is ``vowels - discarded + added`` where the adjustments come
    from the numbered rules below. Words of three letters or fewer always
    count as one syllable.

    Args:
        new_words: list of words (any case).
        verbose: unused; kept so existing callers keep working.

    Returns:
        A list with exactly one syllable count (at least 1) per input word, so
        the result can be zipped or indexed in parallel with ``new_words``.
    """
    vowels = 'eaoui'

    # Word lists used by the rules; they never change, so build them once.
    exception_add = ['serious', 'crucial']              # words that need an extra syllable
    exception_del = ['fortunately', 'unfortunately']    # words that need one syllable less
    co_one = ['cool', 'coach', 'coat', 'coal', 'count', 'coin', 'coarse', 'coup',
              'coif', 'cook', 'coign', 'coiffe', 'coof', 'court']
    co_two = ['coapt', 'coed', 'coinci']
    pre_one = ['preach']
    le_except = ['whole', 'mobile', 'pole', 'male', 'female', 'hale', 'pale',
                 'tale', 'sale', 'aisle', 'whale', 'while']
    negative = ["doesn't", "isn't", "shouldn't", "couldn't", "wouldn't"]
    kept_endings = ('ted', 'tes', 'ses', 'ied', 'ies')

    syllables = []
    for word in new_words:
        word = word.lower()

        # 1) three letters or fewer: one syllable
        if len(word) <= 3:
            syllables.append(1)
            continue

        syls = 0                                # syllables to add
        disc = 0                                # syllables to discard

        # 2) a final "es"/"ed" is silent unless the ending is one of
        #    kept_endings, provided the word has more than one vowel group
        if word[-2:] in ('es', 'ed'):
            vowel_pairs = len(re.findall(r'[eaoui][eaoui]', word))
            vowel_consonant = len(re.findall(r'[eaoui][^eaoui]', word))
            if vowel_pairs > 1 or vowel_consonant > 1:
                if word[-3:] not in kept_endings:
                    disc += 1

        # 3) discard a trailing "e", except in a "-le" ending that is not
        #    listed in le_except
        if word[-1:] == 'e':
            if not (word[-2:] == 'le' and word not in le_except):
                disc += 1

        # 4) consecutive vowels, pairs or triplets, count as one
        doubleAndtripple = len(re.findall(r'[eaoui][eaoui]', word))
        tripple = len(re.findall(r'[eaoui][eaoui][eaoui]', word))
        disc += doubleAndtripple + tripple

        # 5) count the vowels in the word
        numVowels = len(re.findall(r'[eaoui]', word))

        # 6) add one if the word starts with "mc"
        if word[:2] == 'mc':
            syls += 1

        # 7) add one if the word ends with "y" not preceded by a vowel
        if word[-1:] == 'y' and word[-2] not in vowels:
            syls += 1

        # 8) add one for every inner "y" with a non-vowel on both sides
        for i, letter in enumerate(word):
            if letter == 'y' and 0 < i < len(word) - 1:
                if word[i - 1] not in vowels and word[i + 1] not in vowels:
                    syls += 1

        # 9) "tri"/"bi" followed by a vowel adds one
        if word[:3] == 'tri' and word[3] in vowels:
            syls += 1
        if word[:2] == 'bi' and word[2] in vowels:
            syls += 1

        # 10) "-ian" adds one, except "-cian" and "-tian"
        if word[-3:] == 'ian' and word[-4:] not in ('cian', 'tian'):
            syls += 1

        # 11) "co" + vowel adds one unless the prefix is a known
        #     single-syllable one (co_two takes precedence over co_one)
        if word[:2] == 'co' and word[2] in vowels:
            prefixes = (word[:4], word[:5], word[:6])
            in_two = any(p in co_two for p in prefixes)
            in_one = any(p in co_one for p in prefixes)
            if in_two or not in_one:
                syls += 1

        # 12) "pre" + vowel adds one unless it is a known single-syllable prefix
        if word[:3] == 'pre' and word[3] in vowels:
            if word[:6] not in pre_one:
                syls += 1

        # 13) the listed "-n't" contractions add one
        if word[-3:] == "n't" and word in negative:
            syls += 1

        # 14) exceptional words
        if word in exception_del:
            disc += 1
        if word in exception_add:
            syls += 1

        # Every word is pronounced with at least one syllable. Clamping instead
        # of dropping the entry keeps the result aligned with new_words.
        syllables.append(max(1, numVowels - disc + syls))
    return syllables

In [60]:
# EXAMPLE
# Per-word syllable estimates for the cleaned example line; `line_syllables`
# is reused by the find_best example further down.

line_syllables = sylco(cleaned_words)
print line_syllables

[1, 1, 1, 2, 1, 1, 1, 2]


This function collects all the different versions of each word from the CMU dictionary since I need to use the correct version of each word in order to get the 'true' meter. 

In [13]:
def compile_meter_list(new_words, verbose=True, pron_dict=None):
    """Collect every known stress pattern for each word.

    Args:
        new_words: list of cleaned words.
        verbose: unused; kept so existing callers keep working.
        pron_dict: optional mapping of lower-case word -> list of
            pronunciations, each a list of phoneme strings whose vowels end in
            a stress digit. Defaults to ``cmudict.dict()``; pass a dictionary
            loaded once to avoid reloading it on every call.

    Returns:
        A list with one entry per word of the form
        ``[word, [[word + '0', meter0], [word + '1', meter1], ...]]`` where
        each meter is the list of stress digits of one pronunciation.
        A word missing from the dictionary gets a single version whose meter
        is ``-1`` repeated once per syllable estimated by ``sylco``; if
        ``sylco`` returns no estimate for it, the word is left out.
    """
    if pron_dict is None:
        pron_dict = cmudict.dict()              # load the CMU pronouncing dictionary
    big_list = []
    for word in new_words:
        try:
            pronunciations = pron_dict[word.lower()]
        except KeyError:                        # word is not in the dictionary
            syl_num = sylco([word])             # only needed for unknown words
            if len(syl_num) != 1:
                continue
            # -1 marks a syllable whose stress is unknown
            versions_list = [[word + str(0), [-1] * syl_num[0]]]
        else:
            versions_list = []
            for n, pron in enumerate(pronunciations):
                # the stress of a syllable is the digit ending its vowel phoneme
                meter_list = [int(char) for phoneme in pron
                              for char in phoneme if char.isdigit()]
                versions_list.append([word + str(n), meter_list])
        big_list.append([word, versions_list])
    return big_list

In [61]:
# EXAMPLE
# Build the table once and display it. Calling compile_meter_list a second time
# just to show the result would reload the CMU dictionary for no benefit.

word_listed_list = compile_meter_list(cleaned_words)
word_listed_list

[['But', [['But0', [1]]]],
 ['as', [['as0', [1]], ['as1', [1]]]],
 ['the', [['the0', [0]], ['the1', [1]], ['the2', [0]]]],
 ['riper', [['riper0', [-1, -1]]]],
 ['should', [['should0', [1]]]],
 ['by', [['by0', [1]]]],
 ['time', [['time0', [1]]]],
 ['decease', [['decease0', [0, 1]]]]]

This function uses the compiled list of meters and a pre-set optimal version of the meter to find which version of each word best fits this line. Keifer helped me write how to pick the version that got the best 'score' compared with the optimal. Thanks Keifer!

In [15]:
# received help from keifer

def find_best(line, intended_syllables, optimal=None, verbose=True):
    """Pick, for every word, the stress pattern that best fits a target meter.

    Args:
        line: list of ``[word, [[version_name, meter], ...]]`` entries as
            produced by ``compile_meter_list``.
        intended_syllables: syllable count per word, parallel to ``line``.
        optimal: target stress pattern for the whole line; defaults to twelve
            alternating syllables ``0, 1, 0, 1, ...``.
        verbose: when true, print the progress of the search.

    Returns:
        ``(optimal_line, optimal_meter)`` where ``optimal_line`` holds
        ``[word, [word, best_meter]]`` per word and ``optimal_meter`` holds
        just the chosen meters.
    """
    if optimal is None:
        # built per call so the default is never a shared mutable object
        optimal = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

    optimal_line = []
    optimal_meter = []

    syllable_index = 0                          # position of the current word in `optimal`

    for syllables, word_list in zip(intended_syllables, line):

        if verbose:
            print('Current syllable index: %s' % (syllable_index,))
            print('Current word key: %s' % (word_list,))

        best_score = float('Inf')
        best_inflections = []

        for word, inflection_option in word_list[1]:

            if len(inflection_option) == 0:     # nothing to score
                continue

            # Trust the syllable estimate over the dictionary when the
            # dictionary lists more syllables: cut off the extra ones.
            if len(inflection_option) > syllables:
                inflection_option = inflection_option[:syllables]

            # the slice of the target meter this word has to fit
            current_optimal = optimal[syllable_index:syllable_index + syllables]
            # Distance to the target; a 2 in the option is never penalised.
            score = sum([0 if i == 2 else abs(i - o)
                         for i, o in zip(inflection_option, current_optimal)])

            if verbose:
                print('Word inflections option: %s score: %s' % (inflection_option, score))

            if score < best_score:
                best_inflections = inflection_option
                best_score = score
            elif score == 10 and syllables == 10:   # the score points to an inverse meter
                best_inflections = inflection_option
                best_score = score

        optimal_line.append([word_list[0], [word_list[0], best_inflections]])
        optimal_meter.append(best_inflections)
        syllable_index = syllable_index + syllables

        if verbose:
            print('Current optimal line:')
            pprint(optimal_line)
            print('-----------------------------------------\n')

    return optimal_line, optimal_meter

In [62]:
# EXAMPLE
# Chooses one stress pattern per word for the example line, using the table and
# the syllable counts computed in the cells above.

optimal_line, optimal_meter = find_best(word_listed_list, line_syllables, verbose=False)
pprint(optimal_line)

[['But', ['But', [1]]],
 ['as', ['as', [1]]],
 ['the', ['the', [0]]],
 ['riper', ['riper', [-1, -1]]],
 ['should', ['should', [1]]],
 ['by', ['by', [1]]],
 ['time', ['time', [1]]],
 ['decease', ['decease', [0, 1]]]]


This simply uses TextBlob to get basic sentiment. Polarity ranges from -1 (negative) to 1 (positive). Subjectivity ranges from 0 (very objective) to 1 (very subjective).

In [17]:
from textblob import TextBlob

def get_sentiment(new_words, verbose=False):
    """Return the TextBlob sentiment of the words joined into one sentence.

    Args:
        new_words: list of words; they are joined with single spaces.
        verbose: unused.

    Returns:
        The ``sentiment`` attribute of the TextBlob built from the sentence
        (callers read ``.polarity`` and ``.subjectivity`` from it).
    """
    sentence = str(' '.join(new_words))
    return TextBlob(sentence).sentiment

In [63]:
# EXAMPLE
# Sentiment of the cleaned example line: polarity first, then subjectivity.

feels = get_sentiment(cleaned_words)
print feels.polarity
print feels.subjectivity

0.0
0.0


Fixes lines to fit the DataFrame or removes lines that had errors in them. It also changes the numeric format of the meter to 'stress', 'unstress', and 'missing'.

In [19]:
#received help from keifer

def parse_sonnet_lines(sonnet_lines, author, verbose=1):
    """Turn raw sonnet lines into rows of syllable-stress labels plus metadata.

    Each kept row is ``[line_index, syllable_count, label_1, ..., label_12]``
    (14 columns) where every label is 'stress', 'unstress' or 'missing'.

    Args:
        sonnet_lines: sequence of text lines; missing values (None/NaN) and
            lines with fewer than two words (blank lines, sonnet-number
            headings) are skipped.
        author: author name stored with every row. For 'shakespeare' /
            'Shakespeare' a sonnet is assumed to span 16 input lines,
            otherwise 14.
        verbose: 1 prints progress and a final summary, 2 prints the
            processing stage of each line, 0 prints only the unconditional
            status messages.

    Returns:
        ``(syllable_inflection_columns, word_list_column, sonnet_num_list,
        author_list, polarity_list, subjectivity_list)``; the first item is a
        numpy array of the kept rows, the others are parallel lists.
    """
    expected_width = 14                         # line index + syllable count + 12 labels
    lines_per_sonnet = 16 if author in ('shakespeare', 'Shakespeare') else 14

    syllable_inflection_columns = []
    word_list_column = []
    sonnet_num_list = []
    author_list = []
    polarity_list = []
    subjectivity_list = []

    completed_lines = 0                         # rows that were kept
    skipped_lines = 0                           # rows that were dropped for a bad width

    for line_index, line in enumerate(sonnet_lines):

        # Progress information; the processing is slow.
        if verbose == 1 and line_index % 10 == 0:
            print('sonnet line: %s complete inflections: %s'
                  % (line_index + 1, len(syllable_inflection_columns)))

        # NaN never compares equal to itself, so test for it explicitly.
        if pd.isnull(line):
            continue

        cleaned_words = clean_words(line.split(), verbose=False)

        # Empty lines and single-word lines (sonnet-number headings) carry no meter.
        if len(cleaned_words) < 2:
            continue

        if verbose == 2:
            print('counting syllables')
        line_syllables = sylco(cleaned_words, verbose=False)

        # Columns 1 and 2: index of the line in sonnet_lines, syllable count.
        syllable_inflection_row = [line_index, sum(line_syllables)]

        feels = get_sentiment(cleaned_words)

        if verbose == 2:
            print('compiling meter list')
        word_listed_list = compile_meter_list(cleaned_words, verbose=False)

        if verbose == 2:
            print('finding optimal meter')
        optimal_line, optimal_meter = find_best(word_listed_list, line_syllables,
                                                verbose=False)
        optimal_meter_compress = [i for sublist in optimal_meter for i in sublist]

        # Lines shorter than 12 syllables are padded with -1 ('missing').
        optimal_meter_compress += [-1] * (12 - len(optimal_meter_compress))

        syllable_inflection_row.extend(_inflection_labels(optimal_meter_compress))

        # Make sure the row has exactly 14 columns, or building the array /
        # DataFrame later will break.
        row_len = len(syllable_inflection_row)
        keep_row = False
        if row_len == expected_width - 1:
            if verbose:
                print('Missing columns in row!! %s' % row_len)
            syllable_inflection_row.append('missing')
            print('Fixed it!! %s' % len(syllable_inflection_row))
            keep_row = True
        elif row_len < expected_width:
            if verbose:
                print('Missing columns in row !! %s' % row_len)
                print('LEAVING THIS LINE OUT!!')
        elif row_len == expected_width + 1 and syllable_inflection_row[-1] == 'missing':
            del syllable_inflection_row[-1]
            print('Fixed it !! %s' % len(syllable_inflection_row))
            keep_row = True
        elif row_len > expected_width:
            if verbose:
                print('Too many columns in row !! %s' % row_len)
                print('LEAVING THIS LINE OUT!!')
        else:
            keep_row = True

        if keep_row:
            syllable_inflection_columns.append(syllable_inflection_row)
            word_list_column.append(cleaned_words)
            # 1-based sonnet number from the position of the line in the input
            sonnet_num_list.append(line_index // lines_per_sonnet + 1)
            author_list.append(author)
            polarity_list.append(feels.polarity)
            subjectivity_list.append(feels.subjectivity)
            completed_lines += 1
        else:
            skipped_lines += 1

        print('sonnet number %s' % line_index)

    if verbose == 1:
        print('completed_lines: %s skipped lines: %s' % (completed_lines, skipped_lines))

    # Turn the list of rows into a numpy array of shape (kept lines x 14).
    syllable_inflection_columns = np.array(syllable_inflection_columns)

    print("FINISHED !!!")
    return syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list


def _inflection_labels(meter):
    """Map stress digits to the labels 'stress', 'unstress' and 'missing'.

    1 -> 'stress', 0 -> 'unstress', -1 -> 'missing'. A 2 takes the opposite of
    the previous label: 'stress' after 'unstress', otherwise 'unstress'.
    Any other value produces no label.
    """
    labels = []
    for inflection in meter:
        if inflection == 1:
            labels.append('stress')
        elif inflection == 0:
            labels.append('unstress')
        elif inflection == -1:
            labels.append('missing')
        elif inflection == 2:
            if labels and labels[-1] == 'unstress':
                labels.append('stress')
            else:
                labels.append('unstress')
    return labels

Cuts off each sentence at a specified length, sets the text's optimal meter, and fixes/removes errored lines.

In [41]:
def text_line_parser(noise_list):
    """Cut prose sentences to verse length and keep those with a flat meter.

    Every sentence is cleaned and truncated to a randomly chosen prefix whose
    estimated syllable total lies between 9 and 12. The prefix is kept when
    its best-fitting meter matches an all-stressed or an all-unstressed
    pattern with at most ``error`` (3) mismatching syllables; a prefix that
    fits both patterns is appended once for each.

    Args:
        noise_list: iterable of sentence strings.

    Returns:
        ``(text_list, meter_list)``: the kept word lists and, in parallel,
        their flattened meters.
    """
    error = 3                                   # tolerated mismatching syllables
    bad_optimal = [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

    text_list = []
    meter_list = []

    for sind, sentence in enumerate(noise_list):
        cleaned_words = clean_words(sentence.split())
        line_syllables = sylco(cleaned_words)

        # Collect every prefix whose running syllable total is within 9..12.
        line_cutoff = 0
        cutoff_sentences = []
        for ind, ls in enumerate(line_syllables):
            line_cutoff += ls
            if 9 <= line_cutoff <= 12:
                cutoff_sentences.append(cleaned_words[0:ind + 1])
                if line_cutoff == 12:
                    print(cutoff_sentences[-1])
            elif line_cutoff > 12:
                break

        if len(cutoff_sentences) == 0:
            continue

        if len(cutoff_sentences) > 1:
            # Draw an index rather than an element: the candidates have
            # different lengths and cannot be turned into a regular array.
            cutoff_sentence = cutoff_sentences[np.random.randint(len(cutoff_sentences))]
        else:
            cutoff_sentence = cutoff_sentences[0]

        print('viable sentence index: %s' % sind)

        word_listed_list = compile_meter_list(cutoff_sentence)

        for optimal in bad_optimal:
            # line_syllables covers the whole sentence; find_best zips it with
            # the shorter word list, so only the prefix is used.
            optimal_line, optimal_meter = find_best(word_listed_list, line_syllables,
                                                    optimal=optimal, verbose=False)
            omc = [item for sublist in optimal_meter for item in sublist]

            # syllables that equal the pattern, or are 2, count as matches
            matched_length = len([x for x, o in zip(omc, optimal[0:len(omc)]) if x in [o, 2]])

            if matched_length >= len(omc) - error:
                print('actually optimal: %s' % sind)
                text_list.append(cutoff_sentence)
                meter_list.append(omc)
    return text_list, meter_list

# import cPickle
#
# saving:
# filepath = open('filename.p', 'w')
# cPickle.dump(current_text_lines, filepath)
# filepath.close()
#
# loading:
# filepath = open('filename.p', 'r')
# loaded_text_lines = cPickle.load(filepath)
# filepath.close()


# running a script:
# first, add the function to the class
# change the functions it needs to self.whatever()
# at the bottom of the script your class is in:
#

# this goes at the very bottom at the script:
# if __name__ == '__main__':
    
#     austen_text_file = 'some_file_path'
#     pp = PoetryParser()
    
#     austen_text = pp.parse_austen_text()
    
#     pickle_filepath = 'pickle_filepath.p'
    
#     # your new function which is part of PoetryParser now:
#     pp.get_invalid_lines(austen_text_file, pickle_filepath)

# now, to run the script:
# > python poetry_parser.py

In [21]:
# EXAMPLE
# Shows the result of find_best and the same meter flattened into one list.
# NOTE(review): the saved output below is for 'From fairest creatures ...', not
# the line entered in the earlier example, so this cell appears to have been
# run in a different session - confirm by re-running the notebook top to bottom.

print optimal_line
print ""
print optimal_meter
print ""
opt_meter_compress = [item for sublist in optimal_meter for item in sublist] # one flat list of stress digits
print opt_meter_compress

[['From', ['From', [0]]], ['fairest', ['fairest', [1, 0]]], ['creatures', ['creatures', [1, 0]]], ['we', ['we', [1]]], ['desire', ['desire', [0, 1]]], ['increase', ['increase', [0, 1]]]]

[[0], [1, 0], [1, 0], [1], [0, 1], [0, 1]]

[0, 1, 0, 1, 0, 1, 0, 1, 0, 1]


Organize all categories into lists in order to get them in a dataframe

In [48]:
def _meter_to_labels(meter, width=12):
    """Translate a flat meter list into per-syllable label strings.

    Codes: 1 -> 'stress', 0 -> 'unstress', -1 -> 'missing'. A 2 takes the
    opposite of the previous label ('stress' after 'unstress', otherwise
    'unstress', including when it is the first syllable). Any other value is
    skipped, which leaves the result shorter than the input.

    Meters shorter than `width` are padded with -1; longer meters are passed
    through unchanged so the caller can detect the over-long row.
    """
    inflections = list(meter)
    if len(inflections) < width:
        inflections += [-1] * (width - len(inflections))

    labels = []
    for inflection in inflections:
        if inflection == 1:
            labels.append('stress')
        elif inflection == 0:
            labels.append('unstress')
        elif inflection == -1:
            labels.append('missing')
        elif inflection == 2:
            # Compare against the label strings already emitted (the previous
            # code compared them with the ints 1/0, which never matched).
            if labels and labels[-1] == 'unstress':
                labels.append('stress')
            else:
                labels.append('unstress')
    return labels


def text_to_df(text_list, meter_list, author, verbose=True):
    """Build the parallel column lists that create_dataframe() consumes.

    Parameters
    ----------
    text_list : sequence of word lists, one per line.
    meter_list : sequence of flat meter lists (values 1, 0, -1, 2), paired
        with text_list by position.
    author : value stored in the author column for every kept line.
    verbose : when True (default) print the same progress messages as before.

    Returns
    -------
    tuple of (syllable_inflection_columns, word_list_column, sonnet_num_list,
    author_list, polarity_list, subjectivity_list). The first item is a numpy
    array with one 14-entry row per kept line: line index, syllable count and
    12 inflection labels. The other five are plain lists aligned with it.

    Relies on sylco() and get_sentiment() defined elsewhere in this notebook.
    NOTE(review): get_sentiment() is only assumed to return an object with
    .polarity and .subjectivity attributes -- confirm against its definition.
    """
    row_width = 14  # line index + syllable count + 12 inflection labels

    syllable_inflection_columns = []
    word_list_column = []
    sonnet_num_list = []
    author_list = []
    polarity_list = []
    subjectivity_list = []

    # Lines that made it into the output vs. lines dropped for a bad row width.
    completed_lines = 0
    skipped_lines = 0

    for line_index, (t, m) in enumerate(zip(text_list, meter_list)):
        if verbose:
            print('%s %s' % (t, m))
            print('counting syllables')
        line_syllables = sylco(t, verbose=False)
        sum_line_syllables = sum(line_syllables)
        if verbose:
            print(sum_line_syllables)

        feels = get_sentiment(t)

        # Columns 1 and 2 are the line index and syllable count; the rest are
        # the inflection labels. Meters longer than 12 are no longer replaced
        # by a stale value from the previous line (or a NameError).
        syllable_inflection_row = [line_index, sum_line_syllables]
        syllable_inflection_row.extend(_meter_to_labels(m))

        # Every row must be exactly row_width long or the DataFrame creation
        # later will break; repair off-by-one rows, drop the rest.
        keep_row = True
        if len(syllable_inflection_row) == row_width - 1:
            if verbose:
                print('Missing columns in row!! %s' % len(syllable_inflection_row))
            syllable_inflection_row.append('missing')
            if verbose:
                print('Fixed it!! %s' % len(syllable_inflection_row))
        elif len(syllable_inflection_row) < row_width:
            if verbose:
                print('Missing columns in row !! %s' % len(syllable_inflection_row))
                print('LEAVING THIS LINE OUT!!')
            keep_row = False
        elif (len(syllable_inflection_row) == row_width + 1
              and syllable_inflection_row[-1] == 'missing'):
            del syllable_inflection_row[-1]
            if verbose:
                print('Fixed it !! %s' % len(syllable_inflection_row))
        elif len(syllable_inflection_row) > row_width:
            if verbose:
                print('Too many columns in row !! %s' % len(syllable_inflection_row))
                print('LEAVING THIS LINE OUT!!')
            keep_row = False

        if keep_row:
            syllable_inflection_columns.append(syllable_inflection_row)
            word_list_column.append(t)
            sonnet_num_list.append(line_index)
            author_list.append(author)
            polarity_list.append(feels.polarity)
            subjectivity_list.append(feels.subjectivity)
            completed_lines += 1
        else:
            skipped_lines += 1

        if verbose:
            print('sonnet number %s' % line_index)

    if verbose:
        print('completed_lines: %s skipped lines: %s' % (completed_lines, skipped_lines))

    # Matrix of shape (kept lines x 14); mixing ints and labels in one array
    # means numpy stores every cell as text.
    syllable_inflection_columns = np.array(syllable_inflection_columns)

    if verbose:
        print("FINISHED !!!")
    return syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list

Create a uniform DataFrame from the parsed lists, with the same columns for every author.

In [23]:
def create_dataframe(syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list):
    """Assemble the parsed lists into a single DataFrame.

    syllable_inflection_columns is a (lines x 14) matrix: line index,
    syllable count, then twelve inflection labels. The remaining arguments are
    per-line sequences that become the word_list, sonnet_num, author,
    polarity and subjectivity columns, in that order.
    """
    header = ['sonnet_index', 'syllables',
              's1', 's2', 's3', 's4', 's5', 's6',
              's7', 's8', 's9', 's10', 's11', 's12']

    # The matrix supplies the first fourteen columns.
    frame = pd.DataFrame(syllable_inflection_columns, columns=header)

    # Attach the per-line extras one at a time, keeping the original order.
    extras = (
        ('word_list', word_list_column),
        ('sonnet_num', sonnet_num_list),
        ('author', author_list),
        ('polarity', polarity_list),
        ('subjectivity', subjectivity_list),
    )
    for label, values in extras:
        frame[label] = values

    return frame

## Example

In [None]:
# Parse the single example `line`, wrapped in a one-element list to match the
# list-of-sentences input used for the Austen and Doyle texts below.
text_list, meter_list = text_line_parser([line])

In [None]:
# Turn the parsed example into the six parallel lists that create_dataframe takes.
author = 'Example'
syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list = text_to_df(text_list, meter_list, author)

In [None]:
# Assemble the lists into a DataFrame and preview its first rows.
example_text_df = create_dataframe(syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list)

example_text_df.head()

In [None]:
# Write the example frame to CSV; with the default arguments pandas also
# writes the row index as an extra leading column.
example_text_df.to_csv('./assets/example_text_df.csv')

## Shakespeare Sonnets

In [3]:
# Download the Project Gutenberg sonnets text (ebook #1041) as one string.
import urllib                                   # Python 2 API; the Python 3 equivalent lives in urllib.request
sonnetsUrl = "http://www.gutenberg.org/cache/epub/1041/pg1041.txt"
sonnetsString = urllib.urlopen(sonnetsUrl).read()
# Show the size of the download as the cell output.
len(sonnetsString)

122777

In [4]:
# Trim the download to the sonnets themselves.
# NOTE(review): str.find returns -1 when a marker is absent, which would
# silently produce a wrong slice instead of raising.
filteredSonnetsStart = sonnetsString.find("  I\r\n") # heading of the first sonnet
filteredSonnetsEnd = sonnetsString.find("End of Project Gutenberg's") # Gutenberg end-of-text marker
filteredSonnetsString = sonnetsString[filteredSonnetsStart:filteredSonnetsEnd].rstrip() # drop trailing whitespace
print(filteredSonnetsString[:665])              # first 665 characters: heading plus sonnet I

  I

  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou, contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.



In [45]:
# Split the filtered sonnets into lines and parse every line.
# The previous hard-coded [:122777] slice was the length of one particular
# download; it did nothing for that text and would silently truncate a
# longer one, so the whole filtered string is used instead.
sonnet_lines = filteredSonnetsString.split("\n")
author = 'Shakespeare'
syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list = parse_sonnet_lines(sonnet_lines, author)

sonnet line: 1 complete inflections: 0
sonnet number 2
sonnet number 3
sonnet number 4
sonnet number 5
sonnet number 6
sonnet number 7
sonnet number 8
sonnet number 9
sonnet line: 11 complete inflections: 8
sonnet number 10
sonnet number 11
sonnet number 12
sonnet number 13
sonnet number 14
sonnet number 15
sonnet number 19
sonnet line: 21 complete inflections: 15
sonnet number 20
sonnet number 21
sonnet number 22
sonnet number 23
sonnet number 24
sonnet number 25
sonnet number 26
sonnet number 27
sonnet number 28
sonnet number 29
sonnet line: 31 complete inflections: 25
sonnet number 30
sonnet number 31
sonnet number 32
sonnet number 36
sonnet number 37
sonnet number 38
sonnet number 39
sonnet line: 41 complete inflections: 32
sonnet number 40
sonnet number 41
sonnet number 42
sonnet number 43
sonnet number 44
sonnet number 45
sonnet number 46
sonnet number 47
sonnet number 48
sonnet number 49
sonnet line: 51 complete inflections: 42
sonnet number 53
sonnet number 54
sonnet number 55


In [46]:
# Sanity check: the six parallel lists should all report the same length.
for parsed_column in (syllable_inflection_columns, author_list, word_list_column,
                      sonnet_num_list, polarity_list, subjectivity_list):
    print(len(parsed_column))

2151
2151
2151
2151
2151
2151


In [47]:
# Combine the parsed Shakespeare lists into one DataFrame.
sonnet_df = create_dataframe(syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list)

In [48]:
# Preview the first ten rows rather than dumping the whole frame.
sonnet_df.head(10)

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity
0,2,10,unstress,stress,unstress,stress,unstress,stress,unstress,stress,unstress,stress,missing,missing,"[From, fairest, creatures, we, desire, increase]",1,Shakespeare,0.000000,0.000000
1,3,11,unstress,stress,stress,stress,unstress,stress,stress,stress,unstress,stress,missing,missing,"[That, thereby, beauty, rose, might, never, die]",1,Shakespeare,0.600000,0.950000
2,4,10,stress,stress,unstress,missing,missing,stress,stress,stress,unstress,stress,missing,missing,"[But, as, the, riper, should, by, time, decease]",1,Shakespeare,0.000000,0.000000
3,5,10,unstress,stress,unstress,stress,stress,stress,unstress,stress,unstress,unstress,missing,missing,"[His, tender, heir, might, bear, his, memory]",1,Shakespeare,0.000000,0.000000
4,6,10,stress,stress,stress,unstress,unstress,stress,stress,stress,stress,stress,missing,missing,"[But, thou, contracted, to, thine, own, bright...",1,Shakespeare,0.650000,0.900000
5,7,10,missing,stress,stress,stress,unstress,stress,unstress,stress,unstress,stress,missing,missing,"[Feedst, thy, light, flame, with, self, substa...",1,Shakespeare,0.400000,0.700000
6,8,10,stress,unstress,unstress,stress,unstress,stress,unstress,stress,unstress,stress,missing,missing,"[Making, a, famine, where, abundance, lies]",1,Shakespeare,0.000000,0.000000
7,9,10,stress,stress,stress,stress,unstress,stress,stress,stress,stress,stress,missing,missing,"[Thy, self, thy, foe, to, thy, sweet, self, to...",1,Shakespeare,-0.325000,0.825000
8,10,10,stress,stress,stress,stress,unstress,stress,stress,stress,unstress,unstress,missing,missing,"[Thou, that, art, now, the, world, fresh, orna...",1,Shakespeare,0.300000,0.500000
9,11,10,unstress,stress,unstress,stress,unstress,stress,unstress,stress,unstress,stress,missing,missing,"[And, only, herald, to, the, gaudy, spring]",1,Shakespeare,0.000000,1.000000


In [49]:
# Write the Shakespeare frame to CSV under ./assets.
sonnet_df.to_csv('./assets/shake_sonnet_df.csv')

## Keats Sonnets

In [50]:
# Load the Keats lines from the cleaned CSV and look at the first two rows.
keat_df = pd.read_csv('./assets/clean_keat_sonnets.csv')
keat_df.head(2)

Unnamed: 0.1,Unnamed: 0,text
0,0,
1,1,Bright star! would I were steadfast as thou art


In [51]:
# Keep only the verse text and drop empty rows.
# Selecting the column by name (instead of renaming columns in place and then
# dropping one) lets this cell be re-run without a length-mismatch error.
keat_df = keat_df[['text']].dropna(how='any')
keat_df.head()

Unnamed: 0,text
1,Bright star! would I were steadfast as thou art
2,"Not in lone splendour hung aloft the night,"
3,"And watching, with eternal lids apart,"
4,"Like Nature’s patient sleepless Eremite,"
5,The moving waters at their priestlike task


In [52]:
# First ten Keats lines, selected by position.
keat_df['text'].iloc[:10]

1     Bright star! would I were steadfast as thou art
2         Not in lone splendour hung aloft the night,
3              And watching, with eternal lids apart,
4            Like Nature’s patient sleepless Eremite,
5          The moving waters at their priestlike task
6        Of pure ablution round earth’s human shores,
7               Or gazing on the new soft fallen mask
8            Of snow upon the mountains and the moors
9          Noyet still steadfast, still unchangeable,
10      Pillow’d upon my fair love’s ripening breast,
Name: text, dtype: object

In [53]:
# Parse every Keats line into the six parallel lists.
keat_sonnet_lines = list(keat_df['text'])
author = 'Keat'
(syllable_inflection_columns,
 word_list_column,
 sonnet_num_list,
 author_list,
 polarity_list,
 subjectivity_list) = parse_sonnet_lines(keat_sonnet_lines, author)

sonnet line: 1 complete inflections: 0
Missing columns in row!! 13
Fixed it!! 14
sonnet number 0
sonnet number 1
sonnet number 2
sonnet number 3
sonnet number 4
sonnet number 5
sonnet number 6
sonnet number 7
Missing columns in row!! 13
Fixed it!! 14
sonnet number 8
sonnet number 9
sonnet line: 11 complete inflections: 10
sonnet number 10
sonnet number 11
sonnet number 12
sonnet number 13
sonnet number 14
sonnet number 15
sonnet number 16
sonnet number 17
Missing columns in row!! 13
Fixed it!! 14
sonnet number 18
sonnet number 19
sonnet line: 21 complete inflections: 20
sonnet number 20
sonnet number 21
sonnet number 22
sonnet number 23
Missing columns in row!! 13
Fixed it!! 14
sonnet number 24
sonnet number 25
sonnet number 26
sonnet number 27
sonnet number 28
sonnet number 29
sonnet line: 31 complete inflections: 30
sonnet number 30
sonnet number 31
Missing columns in row!! 13
Fixed it!! 14
sonnet number 32
sonnet number 33
sonnet number 34
sonnet number 35
sonnet number 36
sonnet nu

In [54]:
# Combine the parsed Keats lists into a DataFrame and preview it.
keat_sonnet_df = create_dataframe(syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list)
keat_sonnet_df.head()

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity
0,0,10,stress,stress,stress,stress,unstress,stress,stress,stress,stress,missing,missing,missing,"[Bright, star, would, I, were, steadfast, as, ...",1,Keat,0.55,0.8
1,1,10,stress,stress,stress,missing,missing,stress,unstress,stress,unstress,stress,missing,missing,"[Not, in, lone, splendour, hung, aloft, the, n...",1,Keat,0.0,0.0
2,2,10,unstress,stress,unstress,stress,unstress,stress,unstress,stress,unstress,stress,missing,missing,"[And, watching, with, eternal, lids, apart]",1,Keat,0.0,0.0
3,3,10,stress,stress,unstress,stress,unstress,stress,unstress,missing,missing,missing,missing,missing,"[Like, Natures, patient, sleepless, Eremite]",1,Keat,0.0,0.0
4,4,10,unstress,stress,unstress,stress,unstress,stress,stress,missing,missing,stress,missing,missing,"[The, moving, waters, at, their, priestlike, t...",1,Keat,0.0,0.0


In [55]:
# Write the Keats frame to CSV under ./assets.
keat_sonnet_df.to_csv('./assets/keat_sonnet_df.csv')

## Frost Sonnets

In [17]:
# Load the Frost lines from the cleaned CSV and look at the first two rows.
frost_df = pd.read_csv('./assets/clean_frost_sonnets.csv')
frost_df.head(2)

Unnamed: 0.1,Unnamed: 0,author,text
0,0,Frost,"One of my wishes is that those dark trees,"
1,1,Frost,"So old and firm they scarcely show the breeze,"


In [18]:
# Keep only the author and text columns.
# Selecting by name (instead of renaming columns in place and then dropping
# one) lets this cell be re-run without a length-mismatch error.
frost_df = frost_df[['author', 'text']]
frost_df.head()

Unnamed: 0,author,text
0,Frost,"One of my wishes is that those dark trees,"
1,Frost,"So old and firm they scarcely show the breeze,"
2,Frost,"Were not, as 'twere, the merest mask of gloom,"
3,Frost,But stretched away unto the edge of doom.
4,Frost,I should not be withheld but that some day


In [42]:
# Parse every Frost line into the six parallel lists.
frost_sonnet_lines = list(frost_df['text'])
author = 'Frost'
(syllable_inflection_columns,
 word_list_column,
 sonnet_num_list,
 author_list,
 polarity_list,
 subjectivity_list) = parse_sonnet_lines(frost_sonnet_lines, author)

sonnet line: 1 complete inflections: 0
sonnet number 0
sonnet number 1
sonnet number 2
sonnet number 3
sonnet number 4
sonnet number 5
sonnet number 6
Missing columns in row!! 13
Fixed it!! 14
sonnet number 7
sonnet number 8
sonnet number 9
sonnet line: 11 complete inflections: 10
Missing columns in row!! 13
Fixed it!! 14
sonnet number 10
sonnet number 11
sonnet number 12
sonnet number 13
sonnet number 14
sonnet number 15
sonnet number 16
sonnet number 17
sonnet number 18
sonnet number 19
sonnet line: 21 complete inflections: 20
Missing columns in row!! 13
Fixed it!! 14
sonnet number 20
sonnet number 21
sonnet number 22
sonnet number 23
sonnet number 24
sonnet number 25
sonnet number 26
sonnet number 27
sonnet number 28
sonnet number 29
sonnet line: 31 complete inflections: 30
sonnet number 30
sonnet number 31
Missing columns in row!! 13
Fixed it!! 14
sonnet number 32
sonnet number 33
sonnet number 34
sonnet number 35
sonnet number 36
sonnet number 37
Missing columns in row !! 12
LEAVI

In [43]:
# Combine the parsed Frost lists into a DataFrame and preview it.
frost_sonnet_df = create_dataframe(syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list)
frost_sonnet_df.head()

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity
0,0,9,stress,stress,stress,stress,unstress,stress,stress,stress,stress,missing,missing,missing,"[One, of, my, wishes, is, that, those, dark, t...",1,Frost,-0.15,0.4
1,1,11,stress,stress,unstress,stress,stress,stress,unstress,stress,stress,stress,missing,missing,"[So, old, and, firm, they, scarcely, show, the...",1,Frost,-0.05,0.3
2,2,10,unstress,stress,stress,missing,unstress,stress,unstress,stress,unstress,stress,missing,missing,"[Were, not, as, twere, the, merest, mask, of, ...",1,Frost,-0.133333,0.133333
3,3,10,stress,stress,unstress,stress,stress,unstress,unstress,stress,unstress,stress,missing,missing,"[But, stretched, away, unto, the, edge, of, doom]",1,Frost,-0.05,0.0
4,4,10,stress,stress,stress,stress,unstress,stress,stress,stress,stress,stress,missing,missing,"[I, should, not, be, withheld, but, that, some...",1,Frost,0.0,0.0


In [44]:
# Write the Frost frame to CSV under ./assets.
frost_sonnet_df.to_csv('./assets/frost_sonnet_df.csv')

## Various Sonnets

In [56]:
# Load the mixed-author lines from the cleaned CSV and look at the first two rows.
var_df = pd.read_csv('./assets/clean_var_sonnets.csv')
var_df.head(2)

Unnamed: 0.1,Unnamed: 0,author,text
0,0,Helen Hunt Jackson,Some flowers are withered and some joys have d...
1,1,Helen Hunt Jackson,The garden reeks with an East Indian scent


In [57]:
# Keep only the author and text columns and drop rows missing either.
# Selecting by name (instead of renaming columns in place and then dropping
# one) lets this cell be re-run without a length-mismatch error.
var_df = var_df[['author', 'text']].dropna(how='any')
var_df.head()

Unnamed: 0,author,text
0,Helen Hunt Jackson,Some flowers are withered and some joys have d...
1,Helen Hunt Jackson,The garden reeks with an East Indian scent
2,Helen Hunt Jackson,From beds where gillyflowers stand weak and sp...
3,Helen Hunt Jackson,The white heat pales the skies from side to si...
4,Helen Hunt Jackson,"But in still lakes and rivers, cool, content,"


In [58]:
# Ten lines by position; the printed labels run ahead of the positions
# because dropna left gaps in the index.
var_df['text'].iloc[215:225]

220        Was caught up into love, and taught the whole
221             Of life in a new rhythm. The cup of dole
222            God gave for baptism, I am fain to drink,
223    And praise its sweetness, Sweet, with thee anear.
224       The names of country, heaven, are changed away
225       For where thou art or shalt be, there or here;
226    And this . . . this lute and song . . . loved ...
227              (The singing angels know) are only dear
228       Because thy name moves right in what they say.
229                 Is it indeed so? If I lay here dead,
Name: text, dtype: object

In [59]:
# Parse every mixed-author line into the six parallel lists.
var_sonnet_lines = list(var_df['text'])
author = 'Various'
(syllable_inflection_columns,
 word_list_column,
 sonnet_num_list,
 author_list,
 polarity_list,
 subjectivity_list) = parse_sonnet_lines(var_sonnet_lines, author)

sonnet line: 1 complete inflections: 0
sonnet number 0
sonnet number 1
sonnet number 2
sonnet number 3
sonnet number 4
sonnet number 5
sonnet number 6
sonnet number 7
sonnet number 8
sonnet number 9
sonnet line: 11 complete inflections: 10
sonnet number 10
sonnet number 11
sonnet number 12
sonnet number 13
sonnet number 14
sonnet number 15
sonnet number 16
sonnet number 17
sonnet number 18
sonnet number 19
sonnet line: 21 complete inflections: 20
sonnet number 20
sonnet number 21
sonnet number 22
sonnet number 23
sonnet number 24
Missing columns in row!! 13
Fixed it!! 14
sonnet number 25
sonnet number 26
sonnet number 27
sonnet number 28
sonnet number 29
sonnet line: 31 complete inflections: 30
sonnet number 30
sonnet number 31
sonnet number 32
sonnet number 33
sonnet number 34
sonnet number 35
sonnet number 36
sonnet number 37
sonnet number 38
sonnet number 39
sonnet line: 41 complete inflections: 40
sonnet number 40
sonnet number 41
sonnet number 42
sonnet number 43
sonnet number 44


In [60]:
# Combine the parsed lists, then replace the placeholder author with the real
# poet of each line.
var_sonnet_df = create_dataframe(syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list)

# Assigning var_df['author'] directly aligns on index labels, but var_df has
# gaps after dropna and the parser may have skipped lines, so labels do not
# line up. Look the author up by each kept line's position instead.
# NOTE(review): assumes sonnet_index is the line's position in
# var_sonnet_lines, as the parser outputs above suggest -- confirm against
# parse_sonnet_lines. The column is cast to int because the frame is built
# from a numpy array, which stores every cell as text.
line_positions = var_sonnet_df['sonnet_index'].astype(int)
var_sonnet_df['author'] = var_df['author'].iloc[line_positions.values].values
var_sonnet_df.head()

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity
0,0,11,stress,stress,unstress,stress,stress,unstress,unstress,stress,stress,stress,missing,missing,"[Some, flowers, are, withered, and, some, joys...",1,Helen Hunt Jackson,0.0,0.0
1,1,11,unstress,stress,unstress,stress,unstress,stress,stress,stress,unstress,unstress,stress,missing,"[The, garden, reeks, with, an, East, Indian, s...",1,Helen Hunt Jackson,0.0,0.0
2,2,11,unstress,stress,stress,missing,missing,missing,missing,stress,stress,stress,missing,missing,"[From, beds, where, gillyflowers, stand, weak,...",1,Helen Hunt Jackson,-0.2375,0.3625
3,3,11,unstress,stress,stress,stress,unstress,stress,unstress,stress,unstress,missing,missing,missing,"[The, white, heat, pales, the, skies, from, si...",1,Helen Hunt Jackson,0.0,0.0
4,4,10,stress,stress,stress,stress,unstress,stress,unstress,stress,unstress,stress,missing,missing,"[But, in, still, lakes, and, rivers, cool, con...",1,Helen Hunt Jackson,0.35,0.65


In [61]:
# Write the mixed-author frame to CSV under ./assets.
var_sonnet_df.to_csv('./assets/var_sonnet_df.csv')

## Austen Text

In [24]:
# Download Project Gutenberg ebook #1342 and keep the text between the
# "By Jane Austen" byline and the Gutenberg end marker.
# NOTE(review): str.find returns -1 when a marker is absent, which would
# silently produce a wrong slice instead of raising.
import urllib
noiseUrl = "http://www.gutenberg.org/cache/epub/1342/pg1342.txt"
NoiseString = urllib.urlopen(noiseUrl).read()
filterednoiseStart = NoiseString.find("By Jane Austen") # first occurrence of the author byline
filterednoiseEnd = NoiseString.find("End of Project Gutenberg's") # Gutenberg end-of-text marker
filterednoiseString = NoiseString[filterednoiseStart:filterednoiseEnd].rstrip() # drop trailing whitespace
len(filterednoiseString)

716917

In [25]:
# Flatten the text to one string and split it into rough sentences.
# The whole filtered string is used: the old [:716917] slice was just the
# length of one particular download and would truncate a longer text.
noise_text = filterednoiseString.rstrip()
sentence_fixes = [
    ('\r\n\r\n\r\n\r\n', '. '),   # large gaps become sentence breaks
    ('\r\n\r\n\r\n', '. '),
    ('\r\n', ' '),                # remaining line breaks become spaces
    ('Mr.', 'Mr'),                # keep titles from ending a sentence
    ('Mrs.', 'Mrs'),
    ('Dr.', 'Dr'),
]
for old, new in sentence_fixes:
    noise_text = noise_text.replace(old, new)
noise_list = noise_text.split('. ')
print(len(noise_list))

4257


In [46]:
# Run the line parser over the Austen sentences; it returns parallel lists of
# kept texts and their flattened meters.
text_list, meter_list = text_line_parser(noise_list)


# import cPickle
#
# saving:
# filepath = open('filename.p', 'w')
# cPickle.dump(current_text_lines, filepath)
# filepath.close()
#
# loading:
# filepath = open('filename.p', 'r')
# loaded_text_lines = cPickle.load(filepath)
# filepath.close()


# running a script:
# first, add the function to the class
# change the functions it needs to self.whatever()
# at the bottom of the script your class is in:
#

# this goes at the very bottom of the script:
# if __name__ == '__main__':
    
#     austen_text_file = 'some_file_path'
#     pp = PoetryParser()
    
#     austen_text = pp.parse_austen_text()
    
#     pickle_filepath = 'pickle_filepath.p'
    
#     # your new function which is part of PoetryParser now:
#     pp.get_invalid_lines(austen_text_file, pickle_filepath)

# now, to run the script:
# > python poetry_parser.py

['It', 'is', 'a', 'truth', 'universally', 'acknowledged']
viable sentence index: 2
actually optimal: 2
['However', 'little', 'known', 'the', 'feelings', 'or', 'views', 'of']
viable sentence index: 3
actually optimal: 3
['My', 'dear', 'Mr', 'Bennet', 'said', 'his', 'lady', 'to', 'him', 'one']
viable sentence index: 4
actually optimal: 4
['But', 'it', 'is', 'returned', 'she;', 'for', 'Mrs', 'Long', 'has', 'just', 'been']
viable sentence index: 5
actually optimal: 5
['Do', 'you', 'not', 'want', 'to', 'know', 'who', 'has', 'taken', 'it', 'cried']
viable sentence index: 6
actually optimal: 6
['You', 'want', 'to', 'tell', 'me', 'and', 'I', 'have', 'no', 'objection']
viable sentence index: 7
actually optimal: 7
viable sentence index: 8
actually optimal: 8
['What', 'a', 'fine', 'thing', 'for', 'our', 'girls', 'How', 'so', 'How', 'can', 'it']
viable sentence index: 9
actually optimal: 9
['You', 'and', 'the', 'girls', 'may', 'go', 'or', 'you', 'may', 'send', 'them', 'by']
viable sentence index: 

In [49]:
# Turn the parsed Austen lines into the six parallel lists that create_dataframe takes.
author = 'Austen'
syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list = text_to_df(text_list, meter_list, author)

['It', 'is', 'a', 'truth', 'universally', 'acknowledged'] [0, 0, 0, 1, 2, 0, 1, 0, 0, 0, 1, 0]
counting syllables
12
sonnet number 0
['However', 'little', 'known', 'the', 'feelings', 'or'] [2, 1, 0, 1, 0, 1, 1, 1, 0, 1]
counting syllables
10
sonnet number 1
['My', 'dear', 'Mr', 'Bennet', 'said', 'his', 'lady', 'to'] [1, 1, 1, 1, 0, 1, 1, 1, 0, 1]
counting syllables
10
sonnet number 2
['But', 'it', 'is', 'returned', 'she;', 'for', 'Mrs', 'Long'] [1, 1, 1, 0, 1, -1, 1, 1, 1]
counting syllables
9
sonnet number 3
['Do', 'you', 'not', 'want', 'to', 'know', 'who', 'has', 'taken', 'it', 'cried'] [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1]
counting syllables
12
sonnet number 4
['You', 'want', 'to', 'tell', 'me', 'and', 'I', 'have', 'no', 'objection'] [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0]
counting syllables
12
sonnet number 5
['Why', 'my', 'dear', 'you', 'must', 'know', 'Mrs', 'Long', 'says'] [1, 1, 1, 1, 1, 1, 1, 1, 1]
counting syllables
9
sonnet number 6
['What', 'a', 'fine', 'thing', 'for', 'our', '

In [50]:
# Combine the parsed Austen lists into a DataFrame and preview it.
austen_text_df = create_dataframe(syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list)

austen_text_df.head()

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity
0,0,12,unstress,unstress,unstress,stress,unstress,unstress,stress,unstress,unstress,unstress,stress,unstress,"[It, is, a, truth, universally, acknowledged]",0,Austen,0.0,0.0
1,1,10,unstress,stress,unstress,stress,unstress,stress,stress,stress,unstress,stress,missing,missing,"[However, little, known, the, feelings, or]",1,Austen,-0.1875,0.5
2,2,10,stress,stress,stress,stress,unstress,stress,stress,stress,unstress,stress,missing,missing,"[My, dear, Mr, Bennet, said, his, lady, to]",2,Austen,0.0,0.0
3,3,9,stress,stress,stress,unstress,stress,missing,stress,stress,stress,missing,missing,missing,"[But, it, is, returned, she;, for, Mrs, Long]",3,Austen,-0.05,0.4
4,4,12,stress,stress,stress,stress,stress,stress,stress,stress,stress,unstress,stress,stress,"[Do, you, not, want, to, know, who, has, taken...",4,Austen,0.0,0.0


In [51]:
# Write the Austen frame to CSV under ./assets.
austen_text_df.to_csv('./assets/austen_text_df.csv')

## Doyle Text

In [52]:
# Download Project Gutenberg ebook #1661 and keep the text between the first
# "XII. The Adventure of the Copper Beeches" match and the Gutenberg end marker.
# NOTE(review): str.find returns -1 when a marker is absent, which would
# silently produce a wrong slice instead of raising.
import urllib
noiseUrl = "http://www.gutenberg.org/cache/epub/1661/pg1661.txt"
NoiseString = urllib.urlopen(noiseUrl).read()
filterednoiseStart = NoiseString.find("XII. The Adventure of the Copper Beeches") # first match; presumably the contents entry, since "A SCANDAL IN BOHEMIA" follows in the output below
filterednoiseEnd = NoiseString.find("End of Project Gutenberg's") # Gutenberg end-of-text marker
filterednoiseString = NoiseString[filterednoiseStart:filterednoiseEnd].rstrip() # drop trailing whitespace
len(filterednoiseString)

593765

In [53]:
# Flatten the text to one string and split it into rough sentences.
# The whole filtered string is used: the old [:593765] slice was just the
# length of one particular download and would truncate a longer text.
noise_text = filterednoiseString.rstrip()
sentence_fixes = [
    ('\r\n\r\n\r\n\r\n', '. '),   # large gaps become sentence breaks
    ('\r\n\r\n\r\n', '. '),
    ('\r\n', ' '),                # remaining line breaks become spaces
    ('Mr.', 'Mr'),                # keep titles from ending a sentence
    ('Mrs.', 'Mrs'),
    ('Dr.', 'Dr'),
]
for old, new in sentence_fixes:
    noise_text = noise_text.replace(old, new)
noise_list = noise_text.split('. ')
# Spot-check one early entry, then report how many sentences were produced.
print(noise_list[3].split())
print(len(noise_list))

['A', 'SCANDAL', 'IN', 'BOHEMIA', 'I']
4621


In [54]:
# Run the line parser over the Doyle sentences; it returns parallel lists of
# kept texts and their flattened meters.
text_list, meter_list = text_line_parser(noise_list)

viable sentence index: 1
actually optimal: 1
viable sentence index: 4
actually optimal: 4
['I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any']
viable sentence index: 5
actually optimal: 5
['It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin']
viable sentence index: 7
actually optimal: 7
['All', 'emotions', 'and', 'that', 'one', 'particularly']
viable sentence index: 8
['He', 'was', 'I', 'take', 'it', 'the', 'most', 'perfect', 'reasoning']
viable sentence index: 9
actually optimal: 9
['He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', 'save', 'with']
viable sentence index: 10
actually optimal: 10
['They', 'were', 'admirable', 'things', 'for', 'the', 'for', 'drawing']
viable sentence index: 11
actually optimal: 11
actually optimal: 11
viable sentence index: 12
actually optimal: 12
['Grit', 'in', 'a', 'sensitive', 'instrument', 'or', 'a', 'crack']
viable sentence index: 13
actually optimal: 13
['And', 'yet', 'there', 'was', 'but', 'one', 'woman',

In [55]:
# Turn the parsed Doyle lines into the six parallel lists that create_dataframe takes.
author = 'Doyle'
syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list = text_to_df(text_list, meter_list, author)

['The', 'Adventure', 'of', 'the', 'Copper', 'Beeches'] [0, 0, 1, 0, 0, 0, 1, 0, -1]
counting syllables
9
sonnet number 0
['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE'] [1, 1, 2, 1, 1, 1, 1, 2, 1]
counting syllables
9
sonnet number 1
['I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under'] [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0]
counting syllables
11
sonnet number 2
['It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion'] [1, 1, 1, 1, 1, 1, 1, 0, 1, 0]
counting syllables
10
sonnet number 3
['He', 'was', 'I', 'take', 'it', 'the', 'most', 'perfect', 'reasoning'] [1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 0]
counting syllables
12
sonnet number 4
['He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', 'save', 'with'] [1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1]
counting syllables
12
sonnet number 5
['They', 'were', 'admirable', 'things', 'for', 'the'] [1, 1, 1, 0, 0, 1, 1, 1]
counting syllables
9
sonnet number 6
['They', 'were', 'admirable', 'things', 'for', 'the'] [1, 0, 1, 0, 0, 0,

In [56]:
# Combine the parsed Doyle lists into a DataFrame and preview it.
doyle_text_df = create_dataframe(syllable_inflection_columns, word_list_column, sonnet_num_list, author_list, polarity_list, subjectivity_list)

doyle_text_df.head()

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity
0,0,9,unstress,unstress,stress,unstress,unstress,unstress,stress,unstress,missing,missing,missing,missing,"[The, Adventure, of, the, Copper, Beeches]",0,Doyle,0.0,0.0
1,1,9,stress,stress,unstress,stress,stress,stress,stress,unstress,stress,missing,missing,missing,"[To, Sherlock, Holmes, she, is, always, THE]",1,Doyle,0.0,0.0
2,2,11,stress,stress,stress,unstress,stress,stress,stress,unstress,stress,stress,unstress,missing,"[I, have, seldom, heard, him, mention, her, un...",2,Doyle,0.0,0.0
3,3,10,stress,stress,stress,stress,stress,stress,stress,unstress,stress,unstress,missing,missing,"[It, was, not, that, he, felt, any, emotion]",3,Doyle,0.0,0.0
4,4,12,stress,stress,stress,stress,stress,stress,stress,stress,unstress,stress,unstress,unstress,"[He, was, I, take, it, the, most, perfect, rea...",4,Doyle,0.75,0.75


In [57]:
# Write the Doyle frame to CSV under ./assets.
doyle_text_df.to_csv('./assets/doyle_text_df.csv')