In [3]:
from typing import Dict, Iterable, List, Tuple
from convokit import Corpus, Speaker, Utterance, download
from collections import defaultdict, Counter

In [4]:
corpus = Corpus(filename="supreme_processed")

# Filter out speakers without gender signal
corpus = corpus.filter_utterances_by(
    lambda utt: utt.get_speaker().retrieve_meta("gender_signal") != None
) 

In [6]:
# Our goal is to build a table with each word token in the vocabulary for the Supreme Court corpus, 
# listing the number of male and female speakers as well as some percentages.
table = defaultdict(Counter)
# keys: tokens in the vocabulary
        # values: Counter dictionaries 
            #keys: gender signals
            #values: how many speakers of given gender have said the token

"""Flattens the "tokens" dictionary of an Utterance into a list."""
def get_tokens(utt: Utterance) -> List[str]:
    return [tok["tok"]
            for sent in utt.retrieve_meta("tokens")
            for tok in sent["toks"]]

for utt in corpus.iter_utterances():
    tokens = get_tokens(utt)
    gender = utt.get_speaker().retrieve_meta("gender_signal")
    for token in tokens:
        table[token][gender] += 1

table

defaultdict(collections.Counter,
            {'mr': Counter({'M': 353, 'F': 52}),
             '.': Counter({'M': 14717, 'F': 2273}),
             'chief': Counter({'M': 245, 'F': 40}),
             'justice': Counter({'M': 962, 'F': 149}),
             ',': Counter({'M': 19757, 'F': 2859}),
             'and': Counter({'M': 8012, 'F': 1228}),
             'may': Counter({'M': 363, 'F': 58}),
             'it': Counter({'M': 5769, 'F': 906}),
             'please': Counter({'M': 133, 'F': 21}),
             'the': Counter({'M': 22049, 'F': 2563}),
             'court': Counter({'M': 2401, 'F': 349}),
             ':': Counter({'M': 268, 'F': 42}),
             'when': Counter({'M': 673, 'F': 96}),
             'states': Counter({'M': 371, 'F': 32}),
             'infringe': Counter({'M': 9, 'F': 1}),
             'exclusive': Counter({'M': 19}),
             'federal': Counter({'M': 418, 'F': 29}),
             'rights': Counter({'M': 147, 'F': 9}),
             'that': Counter({'M': 1