In [1]:
#meta 2/7/2022 Linguistic Features
#book: Applied Text Analysis with Python
#authors: Benjamin Bengfort, Rebecca Bilbro, Tony Ojeda

#infra: run on-prem 
#compute: my trainbox
#existing default env: Python 3.7.6, nltk 3.4.5, networkx 2.4

In [2]:
import nltk
from collections import Counter

Refer to https://www.nltk.org/data.html

NLTK comes with many corpora, toy grammars, trained models, etc. A complete list is posted at: https://www.nltk.org/nltk_data/

To install the data, first install NLTK (see https://www.nltk.org/install.html), then use NLTK’s data downloader.

$error: Resource punkt not found.  
Please use the NLTK Downloader to obtain the resource:  
> import nltk  
> nltk.download('punkt')

In [3]:
#prerequisite
#nltk.download('punkt')

# book `Applied Text Analysis with Python`
Enabling language-aware data products with ML.  
repo https://github.com/anyaconda/atap

# Ch.1 Language and Computation
Basic hypothesis behind applied ML on text: text is predictable.


## Linguistic Features
A simple model that uses linguistic features to identify the predominant gender in a piece of text.  Refer to Neal Caren blog post (2013).  He applied a gender-based analysis of text to New York Times articles and determined that in fact male and female words appeared in starkly different contexts, potentially reinforcing gender biases.


In [4]:
#import nltk
#nltk.download('wordnet')

In [5]:
# build sets of words that differentiate sentences about gender

# 4 possible states
MALE = 'male'
FEMALE = 'female'
UNKNOWN = 'unknown'
BOTH = 'both'

MALE_WORDS = set([
    'guy','spokesman','chairman',"men's",'men','him',"he's",'his',
    'boy','boyfriend','boyfriends','boys','brother','brothers','dad',
    'dads','dude','father','fathers','fiance','gentleman','gentlemen',
    'god','grandfather','grandpa','grandson','groom','he','himself',
    'husband','husbands','king','male','man','mr','nephew','nephews',
    'priest','prince','son','sons','uncle','uncles','waiter','widower',
    'widowers'
])

FEMALE_WORDS = set([
    'heroine','spokeswoman','chairwoman',"women's",'actress','women',
    "she's",'her','aunt','aunts','bride','daughter','daughters','female',
    'fiancee','girl','girlfriend','girlfriends','girls','goddess',
    'granddaughter','grandma','grandmother','herself','ladies','lady',
    'mom','moms','mother','mothers','mrs','ms','niece','nieces',
    'priestess','princess','queens','she','sister','sisters','waitress',
    'widow','widows','wife','wives','woman'
])

# method to assign gender to a sentence
# desc: examines # of words in a sentence that appear in 2 gendered lists
# return: classification of the sentence: 4 cats
def genderize(words):

    mwlen = len(MALE_WORDS.intersection(words))
    fwlen = len(FEMALE_WORDS.intersection(words))

    if mwlen > 0 and fwlen == 0:
        return MALE
    elif mwlen == 0 and fwlen > 0:
        return FEMALE
    elif mwlen > 0 and fwlen > 0:
        return BOTH
    else:
        return UNKNOWN

# method to count frequency of gendered words and sentences within the complete text of an article
# desc: evaluate the total # of gendered words and gendered sentences
def count_gender(sentences):

    sents = Counter()
    words = Counter()

    for sentence in sentences:
        gender = genderize(sentence)
        sents[gender] += 1
        words[gender] += len(sentence)

    return sents, words

# to parse raw text of the articles into component sentences and words
# desc: break paragraphs into sentences
def parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )


In [6]:
# print a doc's % of 4 cats
with open('data/ballet.txt', 'r', newline='\n', encoding='UTF-8') as f:
    parse_gender(f.read())

39.269% unknown (48 sentences)
52.994% female (38 sentences)
4.393% both (2 sentences)
3.344% male (3 sentences)


## My edits

In [7]:
#$my
# method to assign gender to a sentence
# desc: examines # of words in a sentence that appear in 2 gendered lists
# return: classification of the sentence: 4 cats
def my_genderize(words):
    #$my see gendered words per sentence
    print("male words? ", MALE_WORDS.intersection(words))
    print("female words? ", FEMALE_WORDS.intersection(words))

    mwlen = len(MALE_WORDS.intersection(words))
    fwlen = len(FEMALE_WORDS.intersection(words))

    if mwlen > 0 and fwlen == 0:
        return MALE
    elif mwlen == 0 and fwlen > 0:
        return FEMALE
    elif mwlen > 0 and fwlen > 0:
        return BOTH
    else:
        return UNKNOWN
    
# method to count frequency of gendered words and sentences within the complete text of an article
# desc: evaluate the total # of gendered words and gendered sentences
#$note unchanged
#def count_gender(sentences):


# to parse raw text of the articles into component sentences and words
# desc: break paragraphs into sentences
def my_parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]
    
    #my: total count of sentences
    print("Total sentences: {:d} ".format(len(sentences)))
    
    print(sentences[4])
    
    #my: limit to only a few sentences
    sents, words = count_gender(sentences[:5])
    total = sum(words.values())
    
    print("\n")

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]
        
        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )


In [8]:
# print a doc's % of 4 cats
with open('data/ballet.txt', 'r', newline='\n', encoding='UTF-8') as f:
    my_parse_gender(f.read())

Total sentences: 91 
['spending', 'a', 'week', 'shadowing', 'ms.', 'kretzschmar', 'was', 'exhausting', '—', 'she', 'gave', 'new', 'meaning', 'to', 'the', 'idea', 'of', 'being', 'on', 'your', 'feet', 'all', 'day', '.']


67.176% unknown (3 sentences)
32.824% female (2 sentences)
