In [1]:
import numpy as np
import pandas as pd
from convokit import Corpus, download

**Team:** Amari Bauer, Matt Ryan, Francesca Vescia

**Data:** We will train our model on transcripts from the 1024 Roberts court cases with clear winners and available utterance data. We chose to focus on Roberts court cases because we hope to eventually use our model to predict the outcomes of new cases before rulings are issued. John Roberts is the current Chief Justice of the United States, so upcoming Supreme Court cases will be argued before his court. We expect a model trained exclusively on past Roberts court cases will have higher predictive accuracy for these cases than a model trained on the full corpus, which includes transcripts from older cases heard before other courts in notably different times. Our goal is to predict whether or not a case will be decided favorably for a petitioner, so we exclude cases where information about the winning side is unclear or altogether unavailable. We will use features of case utterances to predict case outcomes, so we exclude cases with no available utterance data.

In [2]:
# Resolve the ConvoKit 'supreme-corpus' dataset to a path via download(),
# then load that path into a Corpus object.
corpus_path = download('supreme-corpus')
corpus = Corpus(filename=corpus_path)

Dataset already exists at /Users/amaribauer/.convokit/downloads/supreme-corpus


In [3]:
# PREPROCESSING

# Case-level table: one JSON record per line of data/cases.jsonl
cases = pd.read_json(path_or_buf='data/cases.jsonl', lines=True)

# Keep cases with a clear winner, i.e. win_side recorded as 0 or 1
has_clear_winner = cases['win_side'].isin([0, 1])
df = cases.loc[has_clear_winner]

# Of those, keep the cases heard by the Roberts Court
is_roberts_court = df['court'] == 'Roberts Court'
roberts = df.loc[is_roberts_court, :]

# Utterance-level table for the whole corpus
all_utts = corpus.get_utterances_dataframe()

# Restrict the utterances to the retained Roberts court cases
roberts_ids = roberts['id'].unique()
in_roberts_case = all_utts['meta.case_id'].isin(roberts_ids)
utts = all_utts.loc[in_roberts_case]

In [4]:
# FINAL SUBSET

# Roberts court cases with clear winners AND utterance data.
# The distinct case IDs appearing in utts are the Roberts court cases the
# corpus actually has utterances for.
subset_ids = utts['meta.case_id'].unique()
# Drop the roberts rows whose id never shows up among the utterances
has_utterances = roberts['id'].isin(subset_ids)
subset = roberts.loc[has_utterances]
print(len(subset), 'cases')
# win_side is 0/1 here, so its mean is the share of rows with win_side == 1
petitioner_wins = subset['win_side'].mean()
print(round(petitioner_wins * 100, 2),
      'percent of cases were decided favorably for the petitioner')

1024 cases
65.33 percent of cases were decided favorably for the petitioner


In [5]:
# Utterances per case: count the rows of utts for each case id once,
# then summarise that single Series.
utts_per_case = utts.groupby(['meta.case_id']).size()
print('Min:', utts_per_case.min())
# TODO: What case(s) have just 1 utterance?
print('Max:', utts_per_case.max())
print('Mean', utts_per_case.mean())

Min: 1
Max: 1235
Mean 237.3955078125


In [6]:
# Conversations per case: number of distinct conversation_id values
# observed for each case id, computed once and then summarised.
convs_per_case = (utts[['meta.case_id', 'conversation_id']]
                  .groupby(['meta.case_id'])
                  .nunique())
print('Min:', convs_per_case.min())
print('Max:', convs_per_case.max())
print('Mean:', convs_per_case.mean())

Min: conversation_id    1
dtype: int64
Max: conversation_id    4
dtype: int64
Mean: conversation_id    1.016602
dtype: float64


In [7]:
# Speakers per case: number of distinct speaker values observed for each
# case id, computed once; min, max and mean are printed in that order.
speakers_per_case = (utts[['meta.case_id', 'speaker']]
                     .groupby(['meta.case_id'])
                     .nunique())
print(speakers_per_case.min())
# TODO: What case(s) have just 1 speaker?
print(speakers_per_case.max())
print(speakers_per_case.mean())

speaker    1
dtype: int64
speaker    15
dtype: int64
speaker    10.225586
dtype: float64


In [8]:
# Speaker type breakdown over all Roberts court utterances.
# These are shares of the pooled utterances, not a per-case average.
# utts was already restricted to roberts_ids when it was built, so it is
# grouped directly rather than filtered a second time.
speaker_types = utts.groupby(['meta.speaker_type']).size()
print(speaker_types)
# Derive the shares from the counts themselves so they cannot drift out of
# sync with the data (previously the counts were hand-copied literals).
# .get(..., 0) keeps the cell running if one speaker type is absent.
speaker_total = speaker_types.sum()
print('Advocates (lawyers):', speaker_types.get('A', 0) / speaker_total)
print('Justices:', speaker_types.get('J', 0) / speaker_total)

meta.speaker_type
A    116108
J    125594
dtype: int64
Advocates (lawyers): 0.4803766621707723
Justices: 0.5196233378292278


In [9]:
# Side breakdown over all Roberts court utterances that carry a side value.
# These are shares of the pooled utterances, not a per-case average.
# utts was already restricted to roberts_ids when it was built, so it is
# grouped directly rather than filtered a second time.
sides = utts.groupby(['meta.side']).size()
print(sides)
# meta.side codes and the labels this notebook reports for them.
# NOTE(review): assumes the codes are stored as the integers 0-3, as the
# printed index suggests; confirm against the corpus documentation.
side_labels = {0: 'Respondent', 1: 'Petitioners', 2: 'Amicus', 3: 'Unknown'}
# Derive the shares from the counts themselves so they cannot drift out of
# sync with the data (previously the counts were hand-copied literals).
# .get(..., 0) keeps the cell running if a side code is absent.
side_total = sides.sum()
for side_code, side_label in side_labels.items():
    print(f'{side_label}:', sides.get(side_code, 0) / side_total)

meta.side
0    52498
1    53130
2     8043
3     2437
dtype: int64
Respondent: 0.4521480001378027
Petitioners: 0.4575912081854825
Amicus: 0.06927171254349399
Unknown: 0.020989079133220793


In [10]:
# Persist the Roberts court utterances (index included) for later use
utts.to_csv(path_or_buf='data/utts.csv')