# Identification of the author of poems from their "the" and "a" word counts

In [1]:
import toolz
import re, itertools
from glob import iglob

In [2]:
def word_ratio(d):
    """Return the ratio of "a" occurrences to "the" occurrences.

    Args:
        d: Mapping from a word to the number of times it occurs.

    Returns:
        The count stored under "a" (0 if absent) divided by the count stored
        under "the", as a float. When "the" is absent or its count is zero,
        a divisor of 0.0001 is used instead, so the result is a very large
        number rather than an error.
    """
    a_count = float(d.get("a", 0))
    # Apply the tiny fallback divisor to a missing *or* zero "the" count; a
    # plain .get() default would still divide by zero on an explicit 0.
    the_count = float(d.get("the", 0)) or 0.0001
    return a_count / the_count
    

In [3]:
class PoemCleaner:
    """Turn a poem file into a list of lowercase, punctuation-free words."""

    def __init__(self):
        # Characters deleted from the text before splitting into words.
        # Question marks, quotes and parentheses are included so that tokens
        # such as 'the?' or '"a' reduce to the bare word.
        self.r = re.compile(r'[.,;:!?"\'()-]')

    def clean_poem(self, fp):
        """Read the poem at ``fp`` and return its words in original order.

        Args:
            fp: Path of the text file to read.

        Returns:
            A list of lowercase words, obtained by deleting the punctuation
            characters matched by ``self.r`` and splitting on whitespace.
        """
        with open(fp) as poem:
            text = poem.read()
        # Punctuation is deleted (not replaced by a space), so hyphenated
        # words are joined into a single token.
        stripped = self.r.sub("", text)
        return stripped.lower().split()

In [4]:
def word_is_desired(w):
    """Return True when ``w`` is exactly "a" or "the", otherwise False.

    Intended as the predicate for ``filter`` so that a sequence of words is
    reduced to just the definite and indefinite articles. The comparison is
    case-sensitive, so callers should lowercase words first.

    Args:
        w: The word to test.

    Returns:
        bool: Whether ``w`` equals "a" or "the".
    """
    return w in ("a", "the")

In [5]:
def analyze_poems(poems, cleaner):
    """Compute the "a"-to-"the" ratio over a collection of poems.

    Args:
        poems: Iterable of values accepted by ``cleaner.clean_poem`` (the
            script passes file paths).
        cleaner: Object exposing ``clean_poem(fp)``, which returns the words
            of one poem.

    Returns:
        The result of ``word_ratio`` applied to the combined frequency counts
        of the words accepted by ``word_is_desired`` across all poems.
    """
    # chain.from_iterable pulls poems one at a time, instead of unpacking
    # (and therefore cleaning) every poem before counting starts.
    words = itertools.chain.from_iterable(map(cleaner.clean_poem, poems))
    articles = filter(word_is_desired, words)
    return word_ratio(toolz.frequencies(articles))

## main function

In [6]:
if __name__ == "__main__":
    # A single cleaner instance is reused for both authors.
    Cleaner = PoemCleaner()

    # Each author's poems are the .txt files in that author's directory;
    # iglob yields the matching paths lazily.
    author_a_poems = iglob("./author_a/*.txt")
    author_a_ratio = analyze_poems(author_a_poems, Cleaner)

    author_b_poems = iglob("./author_b/*.txt")
    author_b_ratio = analyze_poems(author_b_poems, Cleaner)

    # The original poem's ratio is a fixed figure written into the report;
    # the two author ratios are formatted to two decimal places.
    report = """
    Original_Poem:  0.3
    Author A:     {:.2f}
    Author B:     {:.2f}    
    """
    print(report.format(author_a_ratio, author_b_ratio))


    Original_Poem:  0.3
    Author A:     0.41
    Author B:     0.22    
    


In [18]:
# Print every length-9 sequence of the characters '0' and '1' as a list, then
# report how many were printed.
# NOTE(review): "pictures" presumably refers to 3x3 binary images - confirm.
i = 0  # stays 0 if the product yields nothing
for i, row in enumerate(itertools.product('01', repeat=9), start=1):
    print(list(row))

# i now holds the number of rows printed (no manual off-by-one correction).
print("pictures generated: {}".format(i))

['0', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '0', '0', '0', '0', '0', '0', '0', '1']
['0', '0', '0', '0', '0', '0', '0', '1', '0']
['0', '0', '0', '0', '0', '0', '0', '1', '1']
['0', '0', '0', '0', '0', '0', '1', '0', '0']
['0', '0', '0', '0', '0', '0', '1', '0', '1']
['0', '0', '0', '0', '0', '0', '1', '1', '0']
['0', '0', '0', '0', '0', '0', '1', '1', '1']
['0', '0', '0', '0', '0', '1', '0', '0', '0']
['0', '0', '0', '0', '0', '1', '0', '0', '1']
['0', '0', '0', '0', '0', '1', '0', '1', '0']
['0', '0', '0', '0', '0', '1', '0', '1', '1']
['0', '0', '0', '0', '0', '1', '1', '0', '0']
['0', '0', '0', '0', '0', '1', '1', '0', '1']
['0', '0', '0', '0', '0', '1', '1', '1', '0']
['0', '0', '0', '0', '0', '1', '1', '1', '1']
['0', '0', '0', '0', '1', '0', '0', '0', '0']
['0', '0', '0', '0', '1', '0', '0', '0', '1']
['0', '0', '0', '0', '1', '0', '0', '1', '0']
['0', '0', '0', '0', '1', '0', '0', '1', '1']
['0', '0', '0', '0', '1', '0', '1', '0', '0']
['0', '0', '0', '0', '1', '0', '1'