# Parsing gendered text

Implement the `parse_gender` function as described on pp. 10-12 of the textbook. Run the function over the three texts indicated below and comment (briefly) on the results.

Starter code is included below. When finished, commit your code and issue a pull request to me.

In [10]:
# Imports
import nltk
import os
from   collections import Counter

# Variables
text_dir = os.path.join('..', 'data', 'texts') # Where are the texts?

In [2]:
# Word lists
MALE = 'male'
FEMALE = 'female'
UNKNOWN = 'unknown'
BOTH = 'both'

MALE_WORDS = set([
    'guy','spokesman','chairman',"men's",'men','him',"he's",'his',
    'boy','boyfriend','boyfriends','boys','brother','brothers','dad',
    'dads','dude','father','fathers','fiance','gentleman','gentlemen',
    'god','grandfather','grandpa','grandson','groom','he','himself',
    'husband','husbands','king','male','man','mr','nephew','nephews',
    'priest','prince','son','sons','uncle','uncles','waiter','widower',
    'widowers'
])

FEMALE_WORDS = set([
    'heroine','spokeswoman','chairwoman',"women's",'actress','women',
    "she's",'her','aunt','aunts','bride','daughter','daughters','female',
    'fiancee','girl','girlfriend','girlfriends','girls','goddess',
    'granddaughter','grandma','grandmother','herself','ladies','lady',
    'lady','mom','moms','mother','mothers','mrs','ms','niece','nieces',
    'priestess','princess','queens','she','sister','sisters','waitress',
    'widow','widows','wife','wives','woman'
])

In [21]:
# Your code here ...
'''
You might want to create your own short text sample for use in developing your code.
To be clear, it's fine to copy the textbook code. This exercise is mostly a shakedown to
check that your environment is working and that the GitHub Classroom submission system
works as intended.
'''
def genderize(words):

    mwlen = len(MALE_WORDS.intersection(words))
    fwlen = len(FEMALE_WORDS.intersection(words))

    if mwlen > 0 and fwlen == 0:
        return MALE
    elif mwlen == 0 and fwlen > 0:
        return FEMALE
    elif mwlen > 0 and fwlen > 0:
        return BOTH
    else:
        return UNKNOWN


def count_gender(sentences):

    sents = Counter() # Counters are like dictionaries, 
    words = Counter() # but handle missing elements better

    for sentence in sentences:
        gender = genderize(sentence)
        sents[gender] += 1             # Number of sentences per gender
        words[gender] += len(sentence) # Number of words in the sentence
                                       # Note ALL words in sentence assigned to one gender

    return sents, words


def parse_gender(text):

    # List of lists. Inner items are tokenized words. Outer items are sentences.
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values()) # Total text wordcount

    pct_male = 0
    pct_female = 0
    gender_dict = {}
    
    for gender, count in sorted(words.items()): # Each item is one gender
        pcent = (count / total) * 100
        nsents = sents[gender]
        #print(
        #    "{:0.1f}% {} ({} sentences)".format(pcent, gender, nsents)
        #)
        gender_dict[gender] = pcent
    # Female/male ratio
    #print("{} female/male ratio".format(round(words[FEMALE]/words[MALE],2)))
    return gender_dict

In [17]:
path = "../data/texts/"
dirListing = os.listdir(path)
texts = []
for item in dirListing:
    if ".txt" in item:
        texts.append(item)
        
print(len(texts))

16372


In [30]:
%%time
gender_opts = {"female":[],"male":[],"unknown":[],"both":[]}
# Run and examine the output
for text in texts[0:10]: # Loop over texts in corpus directory
    #print(text)
    with open(os.path.join(text_dir, text), 'r') as f: # Open each text in turn
        results = parse_gender(f.read()) # Run the gender-parsing function
        for opt in gender_opts:
            if opt in results:
                gender_opts[opt].append(results[opt])
            else:
                gender_opts[opt].append(0)
    #print('\n**********\n')

for opt in gender_opts:
    gen_avg = sum(gender_opts[opt])/len(gender_opts[opt])
    print("{}, {}".format(opt, gen_avg))

unknown, 51.91504495521268
male, 45.36297101060396
female, 0.6994132384080392
both, 2.022570795775324
CPU times: user 141 ms, sys: 0 ns, total: 141 ms
Wall time: 150 ms
