In [3]:
from typing import Dict, Iterable, List, Tuple
from convokit import Corpus, Speaker, Utterance, download
from collections import defaultdict, Counter

In [5]:
corpus = Corpus(filename="../supreme_processed")

# Filter out speakers without gender signal
corpus = corpus.filter_utterances_by(
    lambda utt: utt.get_speaker().retrieve_meta("gender_signal") != None
) 

In [11]:
# Our goal is to build a table with each word token in the vocabulary for the Supreme Court corpus, 
# listing the number of male and female speakers as well as some percentages.
counts = defaultdict(Counter)
# keys: tokens in the vocabulary
        # values: Counter dictionaries 
            #keys: gender signals
            #values: how many speakers of given gender have said the token

"""Flattens the "tokens" dictionary of an Utterance into a list."""
def get_tokens(utt: Utterance) -> List[str]:
    return [tok["tok"]
            for sent in utt.retrieve_meta("tokens")
            for tok in sent["toks"]]

for utt in corpus.iter_utterances():
    tokens = get_tokens(utt)
    gender = utt.get_speaker().retrieve_meta("gender_signal")
    for token in tokens:
        counts[token][gender] += 1

# Add percentages in a second run because this is a bit neater.
table = defaultdict(defaultdict)
# keys: tokens in the vocabulary
        # values: defaultdicts 
            #keys: either "counts" or "ratio"
            #values: data for male and female speakers

for token in counts: 
    table[token]["counts"] = counts[token]
    table[token]["ratio"] = defaultdict()
    
    total_speakers = table[token]["counts"]["M"] + table[token]["counts"]["F"]
    table[token]["ratio"]["M"] = table[token]["counts"]["M"] / total_speakers
    table[token]["ratio"]["F"] = table[token]["counts"]["F"] / total_speakers
    table[token]["ratio"]["F - M"] = table[token]["ratio"]["F"] - table[token]["ratio"]["M"]

table

defaultdict(collections.defaultdict,
            {'mr': defaultdict(None,
                         {'counts': Counter({'M': 353, 'F': 52}),
                          'ratio': defaultdict(None,
                                      {'M': 0.8716049382716049,
                                       'F': 0.12839506172839507,
                                       'F - M': -0.7432098765432098})}),
             '.': defaultdict(None,
                         {'counts': Counter({'M': 14717, 'F': 2273}),
                          'ratio': defaultdict(None,
                                      {'M': 0.8662154208357857,
                                       'F': 0.13378457916421424,
                                       'F - M': -0.7324308416715715})}),
             'chief': defaultdict(None,
                         {'counts': Counter({'M': 245, 'F': 40}),
                          'ratio': defaultdict(None,
                                      {'M': 0.8596491228070176,
                     