In [2]:
import numpy as np
import pandas as pd
from convokit import Corpus, download

In [3]:
# Resolve the 'supreme-corpus' dataset via download(), then load the corpus
# from the location that call returns.
corpus_location = download('supreme-corpus')
corpus = Corpus(filename=corpus_location)

Dataset already exists at /Users/fvescia/.convokit/downloads/supreme-corpus


In [4]:
# PREPROCESSING

# Load every case record (the file holds one JSON object per line)
cases = pd.read_json('data/cases.jsonl', lines=True)

# Keep only the cases whose winning side is coded 0 or 1
has_clear_winner = cases['win_side'].isin([0, 1])
df = cases.loc[has_clear_winner]

# Of those, keep the cases labelled as Roberts Court
is_roberts = df['court'] == 'Roberts Court'
roberts = df.loc[is_roberts, :]

# Every utterance in the corpus, as a data frame
all_utts = corpus.get_utterances_dataframe()

# Utterances whose case ID matches one of the Roberts Court cases kept above
roberts_ids = roberts['id'].unique()
in_roberts_case = all_utts['meta.case_id'].isin(roberts_ids)
utts = all_utts.loc[in_roberts_case]

In [5]:
# FINAL SUBSET

# Roberts court cases with clear winners and utterance data.
# The distinct case IDs found in utts are the Roberts court cases the corpus
# has utterance data for.
subset_ids = utts['meta.case_id'].unique()

# Restrict the roberts data frame to those IDs, which excludes any case
# without utterance data.
has_utterances = roberts['id'].isin(subset_ids)
subset = roberts.loc[has_utterances]
n_cases = len(subset)
print(n_cases, 'cases')

# win_side was restricted to 0/1 during preprocessing, so its mean is the
# share of cases coded 1; the message below reports that share as the
# petitioner's win rate.
petitioner_wins = subset['win_side'].mean()
petitioner_pct = round(petitioner_wins * 100, 2)
print(petitioner_pct,
      'percent of cases were decided favorably for the petitioner')

1024 cases
65.33 percent of cases were decided favorably for the petitioner


In [51]:
# Utterances per case
# Group once and reuse the per-case sizes for all three statistics instead of
# repeating the same groupby for each print.
utts_per_case = utts.groupby(['meta.case_id']).size()
print('Min:', utts_per_case.min())
# NOTE: the minimum printed here depends on whether the later cell that drops
# single-utterance cases has already reassigned `utts`; run from a fresh
# kernel, top to bottom, to see the unfiltered minimum.
# TODO: What case(s) have just 1 utterance?
print('Max:', utts_per_case.max())
print('Mean', utts_per_case.mean())

Min: 66
Max: 1235
Mean 237.62658846529814


In [52]:
# Per-case utterance totals as a two-column frame: the case ID
# ('meta.case_id') and the number of utterance rows for it ('utt_counts').
utt_counts = (
    utts.groupby(['meta.case_id'])
    .size()
    .reset_index(name='utt_counts')
)
print(utt_counts)


       meta.case_id  utt_counts
0      2005_03-1238         289
1      2005_04-1034         337
2     2005_04-10566         375
3      2005_04-1067         329
4      2005_04-1084         357
...             ...         ...
1018    2019_19-631         187
1019    2019_19-635         286
1020     2019_19-67         214
1021      2019_19-7         232
1022    2019_19-715         256

[1023 rows x 2 columns]


In [53]:
# Attach each case's utterance count to every utterance row, then discard the
# rows belonging to cases that have exactly one utterance.
#
# The join key is spelled out (rather than left to "all shared columns") and
# validated as many-to-one, so a duplicated case ID in utt_counts raises
# instead of silently multiplying utterance rows. Any 'utt_counts' column left
# over from a previous run of this cell is dropped first so a re-run does not
# produce suffixed duplicate columns.
#
# NOTE(review): a column-on-column merge returns a freshly numbered index, so
# whatever index `utts` carried before this cell is not preserved -- confirm
# nothing downstream needs it.
# NOTE(review): `subset` was derived from `utts` before this filter and is not
# refreshed here; re-derive it if the two must stay in step.
utts = pd.merge(
    utts.drop(columns='utt_counts', errors='ignore'),
    utt_counts,
    how='left',
    on='meta.case_id',
    validate='many_to_one',
)
utts = utts.loc[utts.loc[:, 'utt_counts'] != 1, :]

# Recount after filtering; computed once and reused for the print and the
# cell's displayed value.
remaining_case_sizes = utts.groupby(['meta.case_id']).size()
print('Min:', remaining_case_sizes.min())
remaining_case_sizes

Min: 66


meta.case_id
2005_03-1238     289
2005_04-1034     337
2005_04-10566    375
2005_04-1067     329
2005_04-1084     357
                ... 
2019_19-631      187
2019_19-635      286
2019_19-67       214
2019_19-7        232
2019_19-715      256
Length: 1023, dtype: int64