In [1]:
import numpy as np
import pandas as pd
import re
from convokit import Corpus, download
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [2]:
all_utts = pd.read_csv('data/utts.csv')
roberts = pd.read_csv('data/subset.csv')

In [3]:
# All cases
cases = pd.read_json(path_or_buf='data/cases.jsonl', lines=True)

# Cases with clear winners
df = cases.loc[cases.loc[:, 'win_side'].isin([1, 0])]

# Roberts court cases with clear winners
roberts = df.loc[df.loc[:, 'court'] == 'Roberts Court', :]

# Roberts court case utterances
roberts_ids = roberts.loc[:, 'id'].unique()
utts = all_utts.loc[all_utts.loc[:, 'meta.case_id'].isin(roberts_ids)]

# Roberts court cases with clearn winners and utterance data
    # Unique case IDs from utts indicate the number of Roberts court cases
    # the corpus has utterance data for
subset_ids = utts.loc[:, 'meta.case_id'].unique()
    # Use these ids to subset the roberts data frame
    # (exclude cases without utterance data)
roberts = roberts.loc[roberts.loc[:, 'id'].isin(subset_ids)]
print(len(roberts), 'cases')
petitioner_wins = roberts.loc[:, 'win_side'].mean()
print(round(petitioner_wins * 100, 2),
      'percent of cases were decided favorably for the petitioner')

1024 cases
65.33 percent of cases were decided favorably for the petitioner


In [4]:
# Utterances per case
print('Min:', utts.groupby(['meta.case_id']).size().min()) 
print('Max:', utts.groupby(['meta.case_id']).size().max())
print('Mean', utts.groupby(['meta.case_id']).size().mean())

Min: 1
Max: 1235
Mean 237.3955078125


In [5]:
# Drop single-utterance cases
utt_counts = pd.DataFrame(utts.groupby(['meta.case_id']).size())
utt_counts = utt_counts.reset_index()
utt_counts = utt_counts.rename(columns={0: 'utt_counts'})
utts = pd.merge(utts, utt_counts, how = 'left')
utts = utts.loc[utts.loc[:, 'utt_counts'] != 1, :]
print('1 case dropped')
print('New min:', utts.groupby(['meta.case_id']).size().min()) 
print(len(utts.groupby(['meta.case_id'])), 'cases')

1 case dropped
New min: 66
1023 cases


In [6]:
utts['justice'] = utts.loc[:, 'meta.speaker_type'] == 'J'
df = utts.groupby('meta.case_id').agg({'justice': ['sum', 'count']})
df.columns = [ 'justice_utts', 'tot_utts']
df['justice_utt_share'] = df.loc[:, 'justice_utts'] / df.loc[:, 'tot_utts']
justices = df.loc[:, ['justice_utt_share']]
print(justices)
print('Min:', justices.loc[:, 'justice_utt_share'].min())
print('Max:', justices.loc[:, 'justice_utt_share'].max())
print('Mean:', justices.loc[:, 'justice_utt_share'].mean())


advocates = utts.loc[utts.loc[:, 'meta.speaker_type'] == 'A', :].copy()
advocates['petitioner_advocate'] = advocates.loc[:, 'meta.side'] == 1
advocates = advocates.groupby('meta.case_id').agg({'petitioner_advocate': ['sum', 'count']})
advocates.columns = [ 'petitioner_advocate_utts', 'total_advocate_utts']
advocates['petitioner_advocate_utt_share'] = advocates.loc[:, 'petitioner_advocate_utts'] / advocates.loc[:, 'total_advocate_utts']
advocates = advocates.loc[:, [ 'petitioner_advocate_utt_share']]
print(advocates)
print('Min', advocates.loc[:, 'petitioner_advocate_utt_share'].min())
print('Max', advocates.loc[:, 'petitioner_advocate_utt_share'].max())
print('Mean', advocates.loc[:, 'petitioner_advocate_utt_share'].mean())
utt_shares = pd.merge(justices, advocates, how='left',left_index = True, right_index = True)
utt_shares

               justice_utt_share
meta.case_id                    
2005_03-1238            0.515571
2005_04-1034            0.513353
2005_04-10566           0.525333
2005_04-1067            0.534954
2005_04-1084            0.526611
...                          ...
2019_19-631             0.572193
2019_19-635             0.583916
2019_19-67              0.532710
2019_19-7               0.538793
2019_19-715             0.593750

[1023 rows x 1 columns]
Min: 0.48
Max: 0.7347826086956522
Mean: 0.5158600540883758
               petitioner_advocate_utt_share
meta.case_id                                
2005_03-1238                        1.000000
2005_04-1034                        0.603659
2005_04-10566                       0.533708
2005_04-1067                        0.483660
2005_04-1084                        0.514793
...                                      ...
2019_19-631                         0.450000
2019_19-635                         0.361345
2019_19-67                          0

Unnamed: 0_level_0,justice_utt_share,petitioner_advocate_utt_share
meta.case_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2005_03-1238,0.515571,1.000000
2005_04-1034,0.513353,0.603659
2005_04-10566,0.525333,0.533708
2005_04-1067,0.534954,0.483660
2005_04-1084,0.526611,0.514793
...,...,...
2019_19-631,0.572193,0.450000
2019_19-635,0.583916,0.361345
2019_19-67,0.532710,0.540000
2019_19-7,0.538793,0.149533


In [114]:
utts['text'].replace('\d+', '', regex=True, inplace = True) #remove numbers, decide as group if this makes sense
convo_case = pd.DataFrame(utts.groupby('meta.case_id')['conversation_id'].nunique())
convo_case.rename(columns = {'conversation_id':'convo_count'}, inplace = True)
#Code to create conversations/case feature
#Proportion of conservative justices on the court at the time of the decision? 

roberts.loc[:,'decided_date'] = pd.to_datetime(roberts.loc[:,'decided_date'])


IndexingError: Too many indexers

In [108]:
just_dict = {'j__john_g_roberts_jr':0, 'j__david_h_souter':0,
       'j__anthony_m_kennedy':0, 'j__ruth_bader_ginsburg':1,
       'j__john_paul_stevens':0, 'j__samuel_a_alito_jr':0,
       'j__antonin_scalia':0, 'j__stephen_g_breyer':1,
       'j__sandra_day_oconnor':0, 'j__clarence_thomas': 0 ,
       'j__sonia_sotomayor':1, 'j__elena_kagan':1, 'j__neil_gorsuch': 0,
       'j__brett_m_kavanaugh':0} #based on being appointed by cons/liberal president
j_utts = utts.copy()
j_utts = j_utts.loc[utts.loc[:,'justice'] == True]
j_utts.loc[:,'speaker'].unique()
j_utts.loc[:,'cons_just']= j_utts.loc[:,'speaker'].map(just_dict)
cons_just = j_utts.groupby(['meta.case_id']).agg({'cons_just':'mean'}) 
#Does not consider length of utterance, which also may be of interest
roberts.loc[roberts.loc[:,'decided_date'].between('2005-10-29','2009-08-07'), 'prop_cons'] = 7/9
roberts.loc[roberts.loc[:,'decided_date'].between('2009-08-08','2010-08-06'), 'prop_cons'] = 6/9
roberts.loc[roberts.loc[:,'decided_date'].between('2010-08-07','2016-02-12'), 'prop_cons'] = 5/9
roberts.loc[roberts.loc[:,'decided_date'].between('2016-02-13','2017-04-09'), 'prop_cons'] = 4/8
roberts.loc[roberts.loc[:,'decided_date'].between('2017-04-10','2020-07-10'), 'prop_cons'] = 5/9

roberts.loc[roberts.loc[:,'prop_cons'].isna()]
#2019_17-1268 of our one NA. Looks like it has actually been decided and could be updated. 

Unnamed: 0,id,year,citation,title,petitioner,respondent,docket_no,court,decided_date,url,...,known_respondent_adv,advocates,win_side,win_side_detail,scdb_docket_id,votes,votes_detail,is_eq_divided,votes_side,prop_cons
7726,2019_17-1268,2019,590 US _,Opati v. Republic of Sudan,"Monicah Okoba Opati, et al.","Republic of Sudan, et al.",17-1268,Roberts Court,NaT,https://www.oyez.org/cases/2019/17-1268,...,True,{'Matthew D. McGill': {'id': 'matthew_d_mcgill...,1.0,5.0,2019-032-01,"{'j__john_g_roberts_jr': 2.0, 'j__clarence_tho...","{'j__john_g_roberts_jr': 1.0, 'j__clarence_tho...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__clarence_tho...",


In [115]:
group_utts = utts.groupby('meta.case_id')['text'].apply(' '.join)
#NOTE: No stemming or lemmatization done at this point
df = pd.merge(group_utts,roberts[['id','win_side']], how = 'left', 
              left_on = 'meta.case_id', right_on = 'id')
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:,df.columns != 'win_side'],df['win_side'], test_size = 0.2, random_state = 0)

In [116]:
#Initialize Vectorizer and vectorize train data
count_vect = CountVectorizer(ngram_range = (1,1),min_df = 4,
            stop_words = 'english', token_pattern = r'\b[a-zA-Z]{3,}\b') 
count_df = count_vect.fit_transform(X_train['text'])
count_array = count_df.toarray()
count_df = pd.DataFrame(count_array,columns = count_vect.get_feature_names(), index = X_train['id'])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(count_df).toarray()
X_train_tfidf = pd.DataFrame(X_train_tfidf, columns = count_vect.get_feature_names(), index = X_train['id'])

X_train_tfidf = pd.merge(X_train_tfidf,convo_case, how = 'left', left_index = True, right_index = True)
X_train_tfidf = pd.merge(X_train_tfidf,utt_shares, how = 'left', left_index = True, right_index = True)
X_train_tfidf = pd.merge(X_train_tfidf,cons_just, how = 'left', left_index = True, right_index = True)
#MERGING IN ADDITIONAL FEATURES HERE
X_train_tfidf

Unnamed: 0_level_0,aaa,aba,abandon,abandoned,abandoning,abandonment,abandons,abate,abated,abatement,...,zillion,zip,zipes,zone,zones,zoning,convo_count,justice_utt_share,petitioner_advocate_utt_share,cons_just
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015_15-290,0.000000,0.000000,0.007648,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,1,0.532934,0.423077,0.662921
2007_07-343,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,1,0.515982,0.584906,0.221239
2009_09-497,0.006649,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,1,0.512727,0.425373,0.432624
2005_04-1131,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011321,0.006974,0.0,0.0,1,0.509434,0.584615,0.377778
2009_09-338,0.000000,0.008387,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,1,0.519573,0.518519,0.294521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016_16-6219,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,1,0.515982,0.462264,0.663717
2007_06-984,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,1,0.516279,0.341346,0.337838
2013_12-79,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,1,0.502674,0.354839,0.414894
2012_11-982,0.000000,0.000000,0.005345,0.009987,0.006984,0.014411,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,1,0.514599,0.390977,0.624113


In [11]:
#Vectorize test data
count_test = count_vect.transform(X_test['text'])
count_test_array = count_test.toarray()
count_test_df = pd.DataFrame(count_test_array, columns = count_vect.get_feature_names(),index = X_test['id'])

X_test_tfidf = tfidf_transformer.transform(count_test_df).toarray()
X_test_tfidf = pd.DataFrame(X_test_tfidf, columns = count_vect.get_feature_names(), index = X_test['id'])
X_test_tfidf = pd.merge(X_test_tfidf,convo_case, how = 'left', left_index = True, right_index = True)
X_test_tfidf = pd.merge(X_test_tfidf,utt_shares, how = 'left', left_index = True, right_index = True)
X_test_tfidf = pd.merge(X_test_tfidf,cons_just, how = 'left', left_index = True, right_index = True)
X_test_tfidf

Unnamed: 0_level_0,aaa,aba,abandon,abandoned,abandoning,abandonment,abandons,abate,abated,abatement,...,zillion,zip,zipes,zone,zones,zoning,convo_count,justice_utt_share,petitioner_advocate_utt_share,cons_just
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016_15-649,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0.512048,0.222222,0.700000
2005_04-1544,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0.502110,0.483051,0.210084
2006_05-1074,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0.513514,0.537037,0.070175
2013_12-158,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.004829,0.0,0.0,1,0.520958,0.475000,0.494253
2009_08-1553,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0.503425,0.296552,0.612245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015_14-419,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0.524887,0.466667,0.422414
2009_08-1529,0.0,0.0,0.0,0.005098,0.021389,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0.505814,0.235294,0.298851
2009_09-475,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0.506024,0.463415,0.182540
2008_08-479,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0.512676,0.427746,0.126374


In [12]:
logit = LogisticRegression()
logit = logit.fit(X = X_train_tfidf, y = y_train)
logit.predict(X_test_tfidf)
mean_acc = logit.score(X_test_tfidf, y_test)
print(mean_acc)

0.6634146341463415


In [13]:
np.mean(y_test)

0.6292682926829268

In [51]:
X_train_feat = X_train_tfidf.loc[:,['convo_count',
       'justice_utt_share', 'petitioner_advocate_utt_share', 'cons_just']]
X_test_feat = X_test_tfidf.loc[:,['convo_count',
       'justice_utt_share', 'petitioner_advocate_utt_share', 'cons_just']]
logit2 = LogisticRegression()
logit2 = logit2.fit(X = X_train_feat, y = y_train)
logit2.predict(X_test_feat)
mean_acc2 = logit2.score(X_test_feat, y_test)
print(mean_acc2)

0.6292682926829268


In [74]:
mlp = MLPClassifier(hidden_layer_sizes=(100,25,10), activation='relu', 
                    solver='adam', max_iter=500, random_state = 0)
mlp.fit(X_train_feat,y_train)
predict_test = mlp.predict(X_test_feat)
accuracy_score(y_test,predict_test)


0.6585365853658537

In [75]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predict_test)) #lots of false negatives
print(classification_report(y_test,predict_test))

[[ 11  65]
 [  5 124]]
              precision    recall  f1-score   support

         0.0       0.69      0.14      0.24        76
         1.0       0.66      0.96      0.78       129

    accuracy                           0.66       205
   macro avg       0.67      0.55      0.51       205
weighted avg       0.67      0.66      0.58       205



In [None]:
""" EXAMPLE CODE FROM https://datascience.stackexchange.com/questions/36049/how-to-adjust-the-hyperparameters-of-mlp-classifier-to-get-more-perfect-performa
FOR HOW TO TEST MANY DIFFERENT PARAMETERS
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(DEAP_x_train, DEAP_y_train)

parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
# Best parameters set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))"""