In [3]:
from typing import Dict, Iterable, List, Tuple
from convokit import Corpus, Speaker, Utterance, download
from collections import defaultdict, Counter

In [5]:
# Load the corpus stored next to this notebook (path is relative to the notebook's
# working directory).
corpus = Corpus(filename="../supreme_processed")

# Keep only utterances whose speaker carries a "gender_signal" metadata value.
# `is not None` is an identity test: unlike `!= None` it cannot be affected by a
# metadata value that customises equality, and it is the idiomatic singleton check.
corpus = corpus.filter_utterances_by(
    lambda utt: utt.get_speaker().retrieve_meta("gender_signal") is not None
)

In [17]:
# Our goal is to build a table with one entry per word token in the vocabulary of the
# Supreme Court corpus, listing how often the token is used by male and by female
# speakers, as well as some percentages.
counts = defaultdict(Counter)
# keys: tokens in the vocabulary
#     values: Counter dictionaries
#         keys: gender signals
#         values: number of occurrences of the token in utterances whose speaker has
#                 that gender signal (token occurrences, not distinct speakers)

def get_tokens(utt: Utterance) -> List[str]:
    """Flatten the "tokens" metadata of an Utterance into a single list.

    The "tokens" metadata is read as an iterable of sentence entries; each entry's
    "toks" value is an iterable of token entries, and the "tok" value of every token
    entry is collected in order.

    Args:
        utt: The utterance whose "tokens" metadata should be flattened.

    Returns:
        The "tok" values of all token entries, in sentence order and then token order.

    Raises:
        TypeError: If the utterance has no "tokens" metadata (``retrieve_meta``
            returned ``None``), since ``None`` cannot be iterated.
    """
    return [tok["tok"]
            for sent in utt.retrieve_meta("tokens")
            for tok in sent["toks"]]

# Tally every token occurrence under the gender signal of the utterance's speaker.
for utt in corpus.iter_utterances():
    gender = utt.get_speaker().retrieve_meta("gender_signal")
    tokens = get_tokens(utt)
    for token in tokens:
        # Counter.update with a one-element iterable adds exactly one occurrence.
        counts[token].update([gender])

# Add totals and percentages in a second pass over the raw counts; keeping this apart
# from the counting loop is a bit neater.
table = defaultdict(dict)
# keys: tokens in the vocabulary
#     values: plain dicts with the keys
#         "counts": the Counter for the token (the same object as counts[token])
#         "ratio":  {"M": ..., "F": ..., "F - M": ...}, shares of the M + F total
#         "total":  combined number of "M" and "F" occurrences of the token

for token, token_counts in counts.items():
    # A Counter yields 0 for a missing key, so tokens used by only one gender are fine.
    male = token_counts["M"]
    female = token_counts["F"]
    total = male + female

    if total == 0:
        # Only gender signals other than "M"/"F" contributed to this token, so the
        # M/F ratios are undefined; skip it instead of raising ZeroDivisionError.
        continue

    male_ratio = male / total
    female_ratio = female / total
    # Key order ("counts", "ratio", "total") matches the order the entries were
    # previously inserted in, so the displayed structure is unchanged.
    table[token] = {
        "counts": token_counts,
        "ratio": {
            "M": male_ratio,
            "F": female_ratio,
            "F - M": female_ratio - male_ratio,
        },
        "total": total,
    }

table

defaultdict(dict,
            {'mr': {'counts': Counter({'M': 353, 'F': 52}),
              'ratio': {'M': 0.8716049382716049,
               'F': 0.12839506172839507,
               'F - M': -0.7432098765432098},
              'total': 405},
             '.': {'counts': Counter({'M': 14717, 'F': 2273}),
              'ratio': {'M': 0.8662154208357857,
               'F': 0.13378457916421424,
               'F - M': -0.7324308416715715},
              'total': 16990},
             'chief': {'counts': Counter({'M': 245, 'F': 40}),
              'ratio': {'M': 0.8596491228070176,
               'F': 0.14035087719298245,
               'F - M': -0.7192982456140351},
              'total': 285},
             'justice': {'counts': Counter({'M': 962, 'F': 149}),
              'ratio': {'M': 0.8658865886588659,
               'F': 0.1341134113411341,
               'F - M': -0.7317731773177318},
              'total': 1111},
             ',': {'counts': Counter({'M': 19757, 'F': 2859}),
     