In [1]:
import numpy as np
import pandas as pd
import re
from convokit import Corpus, download
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import gender_guesser.detector as gender

In [2]:
corpus = Corpus(filename=download('supreme-corpus'))

Dataset already exists at /Users/fvescia/.convokit/downloads/supreme-corpus


In [3]:
# All cases
cases = pd.read_json(path_or_buf='data/cases.jsonl', lines=True)

# Cases with clear winners
df = cases.loc[cases.loc[:, 'win_side'].isin([1, 0])]

# Roberts court cases with clear winners
roberts = df.loc[df.loc[:, 'court'] == 'Roberts Court', :]

# All utterances
all_utts = corpus.get_utterances_dataframe()

# Roberts court case utterances
roberts_ids = roberts.loc[:, 'id'].unique()
utts = all_utts.loc[all_utts.loc[:, 'meta.case_id'].isin(roberts_ids)]

# Roberts court cases with clearn winners and utterance data
    # Unique case IDs from utts indicate the number of Roberts court cases
    # the corpus has utterance data for
subset_ids = utts.loc[:, 'meta.case_id'].unique()
    # Use these ids to subset the roberts data frame
    # (exclude cases without utterance data)
roberts = roberts.loc[roberts.loc[:, 'id'].isin(subset_ids)]
print(len(roberts), 'cases')
petitioner_wins = roberts.loc[:, 'win_side'].mean()
print(round(petitioner_wins * 100, 2),
      'percent of cases were decided favorably for the petitioner')

1024 cases
65.33 percent of cases were decided favorably for the petitioner


In [53]:
# Utterances per case
print('Min:', utts.groupby(['meta.case_id']).size().min()) 
print('Max:', utts.groupby(['meta.case_id']).size().max())
print('Mean', utts.groupby(['meta.case_id']).size().mean())

Min: 66
Max: 1235
Mean 237.62658846529814


In [54]:
# Drop single-utterance cases
utt_counts = pd.DataFrame(utts.groupby(['meta.case_id']).size())
utt_counts = utt_counts.reset_index()
utt_counts = utt_counts.rename(columns={0: 'utt_counts'})
utts = pd.merge(utts, utt_counts, how = 'left')
utts = utts.loc[utts.loc[:, 'utt_counts'] != 1, :]
print('1 case dropped')
print('New min:', utts.groupby(['meta.case_id']).size().min()) 
print(len(utts.groupby(['meta.case_id'])), 'cases')

# Store utts for use in other notebooks
%store utts

1 case dropped
New min: 66
1023 cases
Stored 'utts' (DataFrame)


In [55]:
# Drop single-sided cases

# Case IDs with 0 or 100% petitioner_advocate_utt_share
%store -r advocates
single_sided = (advocates.loc[
    ((advocates.loc[:, 'petitioner_advocate_utt_share'] == 0.0) |
     (advocates.loc[:, 'petitioner_advocate_utt_share'] == 1.0)), :])
single_sided = single_sided.reset_index()

# advocates has one row per case ID --> case_ids are already unique
iffy_ids = single_sided.loc[:, 'meta.case_id']
utts = utts.loc[~utts.loc[:, 'meta.case_id'].isin(iffy_ids)]
print(len(utts.groupby(['meta.case_id'])), 'cases')
print(utts)

1023 cases
       timestamp                                               text   
0           None  We'll hear argument first this morning in -, A...  \
1           None  Thank you, Mr. Chief Justice, and may it pleas...   
2           None  Well, isn't there something different here?\nB...   
3           None  I don't agree, Justice Souter, and here's why....   
4           None  Sure, but they suffered the harm because the f...   
...          ...                                                ...   
243087      None  -- has all sorts of meaning that you're not en...   
243088      None                                  No, Your Honor --   
243089      None                                     -- altogether?   
243090      None  -- we are using the principles of complicity a...   
243091      None        Thank you, counsel.\nThe case is submitted.   

                     speaker      reply_to conversation_id meta.case_id   
0       j__john_g_roberts_jr          None           22620  2

In [56]:
# Compute justice utterance share
utts['justice'] = utts.loc[:, 'meta.speaker_type'] == 'J'
df = utts.groupby('meta.case_id').agg({'justice': ['sum', 'count']})
df.columns = [ 'justice_utts', 'tot_utts']
df['justice_utt_share'] = df.loc[:, 'justice_utts'] / df.loc[:, 'tot_utts']
justices = df.loc[:, ['justice_utt_share']]
print('Min:', justices.loc[:, 'justice_utt_share'].min())
print('Max:', justices.loc[:, 'justice_utt_share'].max())
print('Mean:', justices.loc[:, 'justice_utt_share'].mean())

Min: 0.48
Max: 0.7347826086956522
Mean: 0.5158600540883759


In [57]:
# Compute petitioner advocate utterance share
advocates = utts.loc[utts.loc[:, 'meta.speaker_type'] == 'A', :].copy()
advocates['petitioner_advocate'] = advocates.loc[:, 'meta.side'] == 1
advocates = advocates.groupby('meta.case_id').agg({'petitioner_advocate': ['sum', 'count']})
advocates.columns = [ 'petitioner_advocate_utts', 'total_advocate_utts']
advocates['petitioner_advocate_utt_share'] = advocates.loc[:, 'petitioner_advocate_utts'] / advocates.loc[:, 'total_advocate_utts']
advocates = advocates.loc[:, [ 'petitioner_advocate_utt_share']]
print('Min', advocates.loc[:, 'petitioner_advocate_utt_share'].min())
print('Max', advocates.loc[:, 'petitioner_advocate_utt_share'].max())
print('Mean', advocates.loc[:, 'petitioner_advocate_utt_share'].mean())
%store advocates

Min 0.0
Max 1.0
Mean 0.45755947906632044
Stored 'advocates' (DataFrame)


In [58]:
# Compute female utterance share
d = gender.Detector()

def guess_gender(name):
    name = re.sub('j__', '', name)
    name = re.sub('_\S*', '', name)
    name = name.capitalize()
    guess = d.get_gender(name, 'usa')
    return guess

# Guess gender
gendr = utts.loc[:, ['meta.case_id', 'speaker' ]].copy()
gendr.loc[:, 'gender'] = gendr.loc[:, 'speaker'].map(guess_gender)
print('Before subsetting:')
print(gendr.loc[:, 'gender'].unique())
print(gendr.groupby('gender').agg({'gender': ['count']}).droplevel)

# Restrict to confident guesses
gendr = gendr.loc[gendr.loc[:, 'gender'].isin(['female', 'mostly_female', 
                                               'mostly_male', 'male'])]
print('\nAfter subsetting:')
print(gendr.loc[:, 'gender'].unique())
print(gendr.groupby('gender').agg({'gender': ['count']}).droplevel)

# Compute female_utt_share 
    # = (N female + N mostly_female) / 
    # (N female + N mostly_female + N male + N mostly_male)
gendr.loc[:, 'gender_num'] = gendr.loc[:, 'gender'].isin(['female', 
                                                          'mostly_female'])
gendr = gendr.groupby('meta.case_id').agg({'gender_num': ['sum', 'count']})
gendr.columns = ['female_utts', 'total_utts']
gendr.loc[:, 'female_utt_share'] = (gendr.loc[:, 'female_utts'] /
                                    gendr.loc[:, 'total_utts'])
gendr = gendr.loc[:, ['female_utt_share']]

Before subsetting:
['male' 'female' 'andy' 'mostly_male' 'unknown' 'mostly_female']
<bound method NDFrame.droplevel of                gender
                count
gender               
andy            29443
female          41616
male           158444
mostly_female    1926
mostly_male      3099
unknown          8564>

After subsetting:
['male' 'female' 'mostly_male' 'mostly_female']
<bound method NDFrame.droplevel of                gender
                count
gender               
female          41616
male           158444
mostly_female    1926
mostly_male      3099>


In [59]:
# Combine utterance shares
utt_shares = pd.merge(justices, advocates, how = 'left', left_index = True, right_index = True)
utt_shares = pd.merge(utt_shares, gendr, how = 'left', left_index = True, right_index = True)
utt_shares.head()

Unnamed: 0_level_0,justice_utt_share,petitioner_advocate_utt_share,female_utt_share
meta.case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005_03-1238,0.515571,1.0,0.145098
2005_04-1034,0.513353,0.603659,0.06639
2005_04-10566,0.525333,0.533708,0.14
2005_04-1067,0.534954,0.48366,0.275081
2005_04-1084,0.526611,0.514793,0.358025


In [60]:
#Code to create conversations/case feature
utts['text'].replace('\d+', '', regex=True, inplace = True) 
convo_case = pd.DataFrame(utts.groupby('meta.case_id')['conversation_id'].nunique())
convo_case.rename(columns = {'conversation_id':'convo_count'}, inplace = True)
print(convo_case.loc[:,'convo_count'].describe())

"""PROPORTION OF UTTERANCES BY JUSTICES APPOINTED BY REPUBLICAN PRESIDENTS
Does not consider length of utterance, which also may be of interest"""
just_dict = {'j__john_g_roberts_jr':0, 'j__david_h_souter':0,
       'j__anthony_m_kennedy':0, 'j__ruth_bader_ginsburg':1,
       'j__john_paul_stevens':0, 'j__samuel_a_alito_jr':0,
       'j__antonin_scalia':0, 'j__stephen_g_breyer':1,
       'j__sandra_day_oconnor':0, 'j__clarence_thomas': 0 ,
       'j__sonia_sotomayor':1, 'j__elena_kagan':1, 'j__neil_gorsuch': 0,
       'j__brett_m_kavanaugh':0} #based on being appointed by cons/liberal president
j_utts = utts.copy()
j_utts = j_utts.loc[utts.loc[:,'justice'] == True]
j_utts.loc[:,'cons_just']= j_utts.loc[:,'speaker'].map(just_dict)
cons_just = j_utts.groupby(['meta.case_id']).agg({'cons_just':'mean'}) 
print(cons_just.loc[:,'cons_just'].describe())

count    1023.000000
mean        1.016618
std         0.149092
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         4.000000
Name: convo_count, dtype: float64
count    1023.000000
mean        0.435337
std         0.154456
min         0.045455
25%         0.310643
50%         0.446809
75%         0.552912
max         0.915789
Name: cons_just, dtype: float64


In [61]:
#PROPORTION OF REPUBLICAN APPOINTED JUSTICES ON THE COURT AT THE TIME OF THE DECISION
roberts.loc[:,'decided_date'] = pd.to_datetime(roberts.loc[:,'decided_date'])
roberts.loc[roberts.loc[:,'id'] == '2019_17-1268','decided_date'] = pd.to_datetime('2020-06-19')
roberts.loc[roberts.loc[:,'decided_date'].between(pd.to_datetime('2005-10-29'), pd.to_datetime('2009-08-07')), 'prop_cons'] = 7/9
roberts.loc[roberts.loc[:,'decided_date'].between(pd.to_datetime('2009-08-08'), pd.to_datetime('2010-08-06')), 'prop_cons'] = 6/9
roberts.loc[roberts.loc[:,'decided_date'].between(pd.to_datetime('2010-08-07'), pd.to_datetime('2016-02-12')), 'prop_cons'] = 5/9
roberts.loc[roberts.loc[:,'decided_date'].between(pd.to_datetime('2016-02-13'), pd.to_datetime('2017-04-09')), 'prop_cons'] = 4/8
roberts.loc[roberts.loc[:,'decided_date'].between(pd.to_datetime('2017-04-10'), pd.to_datetime('2020-07-10')), 'prop_cons'] = 5/9

prop_cons = roberts.loc[:,['id','prop_cons']].set_index('id')
print(roberts.loc[:,'prop_cons'].describe())

count    1024.000000
mean        0.623210
std         0.103381
min         0.500000
25%         0.555556
50%         0.555556
75%         0.777778
max         0.777778
Name: prop_cons, dtype: float64


In [62]:
group_utts = utts.groupby('meta.case_id')['text'].apply(' '.join)
df = pd.merge(group_utts,roberts[['id','win_side']], how = 'left', 
              left_on = 'meta.case_id', right_on = 'id')
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:,df.columns != 'win_side'],df['win_side'], test_size = 0.3, random_state = 0)

In [63]:
#Initialize Vectorizer and vectorize train data
count_vect = CountVectorizer(ngram_range = (1,1),min_df = 4,
            stop_words = 'english', token_pattern = r'\b[a-zA-Z]{3,}\b') 
count_df = count_vect.fit_transform(X_train['text'])
count_array = count_df.toarray()
count_df = pd.DataFrame(count_array,columns = count_vect.get_feature_names_out(), index = X_train['id'])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(count_df).toarray()
X_train_tfidf = pd.DataFrame(X_train_tfidf, columns = count_vect.get_feature_names_out(), index = X_train['id'])

X_train_all = X_train_tfidf.copy()
X_train_all = pd.merge(X_train_all,convo_case, how = 'left', left_index = True, right_index = True)
X_train_all = pd.merge(X_train_all,utt_shares, how = 'left', left_index = True, right_index = True)
X_train_all = pd.merge(X_train_all,cons_just, how = 'left', left_index = True, right_index = True)
X_train_all = pd.merge(X_train_all,prop_cons, how = 'left', left_index = True, right_index = True)
#MERGING IN ADDITIONAL FEATURES HERE
X_train_all

Unnamed: 0_level_0,aaa,aba,abandon,abandoned,abandoning,abandonment,abandons,abate,abated,abbott,...,zip,zone,zones,zoning,convo_count,justice_utt_share,petitioner_advocate_utt_share,female_utt_share,cons_just,prop_cons
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019_19-67,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.532710,0.540000,0.106796,0.394737,0.555556
2013_12-1315,0.0,0.000000,0.000000,0.000000,0.000000,0.020612,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.502994,0.228916,0.364407,0.559524,0.555556
2005_04-1186,0.0,0.008259,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.514151,0.679612,0.137500,0.183486,0.777778
2006_05-6551,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.516393,0.516949,0.013761,0.222222,0.777778
2005_05-5224,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.506667,0.513514,0.060109,0.228070,0.777778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016_16-6219,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.515982,0.462264,0.147959,0.663717,0.555556
2007_06-984,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.516279,0.341346,0.138047,0.337838,0.777778
2013_12-79,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.502674,0.354839,0.213415,0.414894,0.555556
2012_11-982,0.0,0.000000,0.005297,0.010163,0.007013,0.014189,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.514599,0.390977,0.296578,0.624113,0.555556


In [64]:
#Vectorize test data
count_test = count_vect.transform(X_test['text'])
count_test_array = count_test.toarray()
count_test_df = pd.DataFrame(count_test_array, columns = count_vect.get_feature_names_out(),index = X_test['id'])

X_test_tfidf = tfidf_transformer.transform(count_test_df).toarray()
X_test_tfidf = pd.DataFrame(X_test_tfidf, columns = count_vect.get_feature_names_out(), index = X_test['id'])
X_test_all = X_test_tfidf.copy()

X_test_all = pd.merge(X_test_all,convo_case, how = 'left', left_index = True, right_index = True)
X_test_all = pd.merge(X_test_all,utt_shares, how = 'left', left_index = True, right_index = True)
X_test_all = pd.merge(X_test_all,cons_just, how = 'left', left_index = True, right_index = True)
X_test_all = pd.merge(X_test_all,prop_cons, how = 'left', left_index = True, right_index = True)
X_test_all 

Unnamed: 0_level_0,aaa,aba,abandon,abandoned,abandoning,abandonment,abandons,abate,abated,abbott,...,zip,zone,zones,zoning,convo_count,justice_utt_share,petitioner_advocate_utt_share,female_utt_share,cons_just,prop_cons
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016_15-649,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,1,0.512048,0.222222,0.328025,0.729412,0.500000
2005_04-1544,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,1,0.502110,0.483051,0.225166,0.378151,0.777778
2006_05-1074,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,1,0.513514,0.537037,0.137441,0.324561,0.777778
2013_12-158,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.004869,0.0,0.0,1,0.520958,0.475000,0.118056,0.528736,0.555556
2009_08-1553,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,1,0.503425,0.296552,0.191111,0.632653,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015_15-338,0.0,0.0,0.0,0.005636,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,1,0.519149,0.424779,0.337209,0.680328,0.500000
2019_18-1171,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,1,0.556548,0.228188,0.268722,0.443850,0.555556
2015_14-1175,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,1,0.526316,0.488889,0.268519,0.570000,0.500000
2007_06-1498,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,1,0.511737,0.403846,0.346939,0.366972,0.777778


In [65]:
X_train_all.to_csv('data/X_train_all.csv')
X_test_all.to_csv('data/X_test_all.csv')

y_train.to_csv('data/y_train.csv')
y_test.to_csv('data/y_test.csv')