In [52]:
import numpy as np
import pandas as pd
import regex as re
import gender_guesser.detector as gender
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [3]:
# https://stackoverflow.com/questions/31621414/share-data-between-ipython-notebooks/60863662
%store -r utts
utts.columns

Index(['id', 'timestamp', 'text', 'speaker', 'reply_to', 'conversation_id',
       'meta.case_id', 'meta.start_times', 'meta.stop_times',
       'meta.speaker_type', 'meta.side', 'meta.timestamp', 'vectors',
       'utt_counts'],
      dtype='object')

In [4]:
# https://pypi.org/project/gender-guesser/
d = gender.Detector()

def guess_gender(name):
    name = re.sub('j__', '', name)
    name = re.sub('_\S*', '', name)
    name = name.capitalize()
    guess = d.get_gender(name, 'usa')
    return guess

In [5]:
# Guess gender
gendr = utts.loc[:, ['meta.case_id', 'speaker' ]].copy()
gendr.loc[:, 'gender'] = gendr.loc[:, 'speaker'].map(guess_gender)
print('Before subsetting:')
print(gendr.loc[:, 'gender'].unique())
print(gendr.groupby('gender').agg({'gender': ['count']}).droplevel)

# Restrict to confident guesses
gendr = gendr.loc[gendr.loc[:, 'gender'].isin(['female', 'mostly_female', 
                                               'mostly_male', 'male'])]
print('\nAfter subsetting:')
print(gendr.loc[:, 'gender'].unique())
print(gendr.groupby('gender').agg({'gender': ['count']}).droplevel)

# Compute female_utt_share 
    # = (N female + N mostly_female) / 
    # (N female + N mostly_female + N male + N mostly_male)
gendr.loc[:, 'gender_num'] = gendr.loc[:, 'gender'].isin(['female', 
                                                          'mostly_female'])
gendr = gendr.groupby('meta.case_id').agg({'gender_num': ['sum', 'count']})
gendr.columns = ['female_utts', 'total_utts']
gendr.loc[:, 'female_utt_share'] = (gendr.loc[:, 'female_utts'] /
                                    gendr.loc[:, 'total_utts'])
gendr = gendr.loc[:, ['female_utt_share']]

Before subsetting:
['male' 'female' 'andy' 'mostly_male' 'unknown' 'mostly_female']
<bound method NDFrame.droplevel of                gender
                count
gender               
andy            29443
female          41616
male           158444
mostly_female    1926
mostly_male      3099
unknown          8564>

After subsetting:
['male' 'female' 'mostly_male' 'mostly_female']
<bound method NDFrame.droplevel of                gender
                count
gender               
female          41616
male           158444
mostly_female    1926
mostly_male      3099>


In [6]:
print('Summary Stats for female_utt_share')
print('Min:', gendr.loc[:, 'female_utt_share'].min())
print('Max:', gendr.loc[:, 'female_utt_share'].max())
print('Mean:', gendr.loc[:, 'female_utt_share'].mean())
print('Median:', gendr.loc[:, 'female_utt_share'].median())

Summary Stats for female_utt_share
Min: 0.008620689655172414
Max: 0.8285714285714286
Mean: 0.2155638073950437
Median: 0.17346938775510204


In [7]:
# All casdes
cases = pd.read_json(path_or_buf='data/cases.jsonl', lines=True)

In [8]:
# Case IDs with 0 or 100% petitioner_advocate_utt_share
%store -r advocates
single_sided = (advocates.loc[
    ((advocates.loc[:, 'petitioner_advocate_utt_share'] == 0.0) |
     (advocates.loc[:, 'petitioner_advocate_utt_share'] == 1.0)), :])
single_sided = single_sided.reset_index()
# advocates has one row per case ID --> case_ids are already unique
iffy_ids = single_sided.loc[:, 'meta.case_id']
print(iffy_ids)

0     2005_03-1238
1     2005_04-1332
2      2006_05-593
3      2008_08-267
4     2009_08-1119
5     2009_08-1224
6     2009_08-1498
7      2009_08-472
8     2012_11-1351
9     2013_12-1163
10    2013_12-1200
11     2013_12-872
12     2013_13-339
13     2013_13-369
14    2014_13-1402
15     2014_13-271
16     2014_13-550
17     2014_13-894
18    2014_13-9972
Name: meta.case_id, dtype: object


In [9]:
cases.head(3)

Unnamed: 0,id,year,citation,title,petitioner,respondent,docket_no,court,decided_date,url,...,adv_sides_inferred,known_respondent_adv,advocates,win_side,win_side_detail,scdb_docket_id,votes,votes_detail,is_eq_divided,votes_side
0,1955_71,1955,350 US 79,Affronti v. United States,Affronti,United States,71,Warren Court,"Dec 5, 1955",https://www.oyez.org/cases/1955/71,...,True,True,"{'Harry F. Murphy': {'id': 'harry_f_murphy', '...",0.0,2.0,1955-009-01,"{'j__john_m_harlan2': 2.0, 'j__hugo_l_black': ...","{'j__john_m_harlan2': 1.0, 'j__hugo_l_black': ...",0.0,"{'j__john_m_harlan2': 0.0, 'j__hugo_l_black': ..."
1,1955_410,1955,351 US 79,"American Airlines, Inc. v. North American Airl...","American Airlines, Inc.","North American Airlines, Inc.",410,Warren Court,"Apr 23, 1956",https://www.oyez.org/cases/1955/410,...,True,True,{'Howard C. Westwood': {'id': 'howard_c_westwo...,1.0,4.0,1955-071-01,"{'j__john_m_harlan2': 2.0, 'j__hugo_l_black': ...","{'j__john_m_harlan2': 1.0, 'j__hugo_l_black': ...",0.0,"{'j__john_m_harlan2': 1.0, 'j__hugo_l_black': ..."
2,1955_351,1955,350 US 532,Archawski v. Hanioti,Archawski,Hanioti,351,Warren Court,"Apr 9, 1956",https://www.oyez.org/cases/1955/351,...,True,False,"{'Harry D. Graham': {'id': 'harry_d_graham', '...",1.0,4.0,1955-053-01,"{'j__john_m_harlan2': 2.0, 'j__hugo_l_black': ...","{'j__john_m_harlan2': 1.0, 'j__hugo_l_black': ...",0.0,"{'j__john_m_harlan2': 1.0, 'j__hugo_l_black': ..."


In [10]:
iffy_cases = cases.loc[cases.loc[:, 'id'].isin(iffy_ids), :]
iffy_cases


Unnamed: 0,id,year,citation,title,petitioner,respondent,docket_no,court,decided_date,url,...,adv_sides_inferred,known_respondent_adv,advocates,win_side,win_side_detail,scdb_docket_id,votes,votes_detail,is_eq_divided,votes_side
6615,2005_03-1238,2005,546 US 21,"IBP, Inc. v. Alvarez","IBP, Inc.","Gabriel Alvarez, individually and on behalf of...",03-1238,Roberts Court,"Nov 8, 2005",https://www.oyez.org/cases/2005/03-1238,...,False,False,{'Carter G. Phillips': {'id': 'carter_g_philli...,0.0,2.0,2005-005-01,"{'j__john_paul_stevens': 2.0, 'j__sandra_day_o...","{'j__john_paul_stevens': 1.0, 'j__sandra_day_o...",0.0,"{'j__john_paul_stevens': 0.0, 'j__sandra_day_o..."
6657,2005_04-1332,2005,546 US 345,Will v. Hallock,Richard Will et al.,Susan Hallock et al.,04-1332,Roberts Court,"Jan 18, 2006",https://www.oyez.org/cases/2005/04-1332,...,False,True,"{'Allison M. Zieve': {'id': 'allison_m_zieve',...",0.0,5.0,2005-022-01,"{'j__john_paul_stevens': 2.0, 'j__sandra_day_o...","{'j__john_paul_stevens': 1.0, 'j__sandra_day_o...",0.0,"{'j__john_paul_stevens': 0.0, 'j__sandra_day_o..."
6708,2006_05-593,2006,549 US 225,Osborn v. Haley,Pat Osborn,Barry Haley et al.,05-593,Roberts Court,"Jan 22, 2007",https://www.oyez.org/cases/2006/05-593,...,False,True,{'Douglas Hallward-Driemeier': {'id': 'douglas...,0.0,2.0,2006-013-01,"{'j__john_paul_stevens': 2.0, 'j__antonin_scal...","{'j__john_paul_stevens': 1.0, 'j__antonin_scal...",0.0,"{'j__john_paul_stevens': 0.0, 'j__antonin_scal..."
6884,2008_08-267,2008,556 US _,United States v. Denedo,United States,Jacob Denedo,08-267,Roberts Court,"Jun 8, 2009",https://www.oyez.org/cases/2008/08-267,...,False,True,"{'Pratik A. Shah': {'id': 'pratik_shah', 'name...",0.0,2.0,2008-065-01,"{'j__john_g_roberts_jr': 1.0, 'j__john_paul_st...","{'j__john_g_roberts_jr': 2.0, 'j__john_paul_st...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__john_paul_st..."
6928,2009_08-1498,2009,561 US 1,Holder v. Humanitarian Law Project,"Eric H. Holder, Jr., Attorney General, et al.","Humanitarian Law Project, et al.",08-1498,Roberts Court,"Jun 21, 2010",https://www.oyez.org/cases/2009/08-1498,...,True,False,"{'David D. Cole': {'id': 'david_d_cole', 'name...",1.0,7.0,2009-077-01,"{'j__john_g_roberts_jr': 2.0, 'j__john_paul_st...","{'j__john_g_roberts_jr': 1.0, 'j__john_paul_st...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__john_paul_st..."
6948,2009_08-1119,2009,559 US 229,"Milavetz, Gallop & Milavetz, P.A. v. United St...","Milavetz, Gallop & Milavetz, P.A., et al.",United States,08-1119,Roberts Court,"Mar 8, 2010",https://www.oyez.org/cases/2009/08-1119,...,False,False,"{'G. Eric Brunstad, Jr.': {'id': 'g_eric_bruns...",1.0,7.0,2009-031-01,"{'j__john_g_roberts_jr': 2.0, 'j__john_paul_st...","{'j__john_g_roberts_jr': 1.0, 'j__john_paul_st...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__john_paul_st..."
6962,2009_08-472,2009,559 US 700,Salazar v. Buono,"Ken L. Salazar, Secretary of the Interior, et al.",Frank Buono,08-472,Roberts Court,"Apr 28, 2010",https://www.oyez.org/cases/2009/08-472,...,False,True,"{'Elena Kagan': {'id': 'elena_kagan', 'name': ...",1.0,4.0,2009-041-01,"{'j__john_g_roberts_jr': 2.0, 'j__john_paul_st...","{'j__john_g_roberts_jr': 3.0, 'j__john_paul_st...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__john_paul_st..."
6974,2009_08-1224,2009,560 US 126,United States v. Comstock,United States,"Graydon Earl Comstock, Jr., et al.",08-1224,Roberts Court,"May 17, 2010",https://www.oyez.org/cases/2009/08-1224,...,False,True,"{'Elena Kagan': {'id': 'elena_kagan', 'name': ...",1.0,4.0,2009-050-01,"{'j__john_g_roberts_jr': 2.0, 'j__john_paul_st...","{'j__john_g_roberts_jr': 1.0, 'j__john_paul_st...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__john_paul_st..."
7185,2012_11-1351,2012,568 US 503,Levin v. United States,Steven A. Levin,"United States, et al.",11-1351,Roberts Court,"Mar 4, 2013",https://www.oyez.org/cases/2012/11-1351,...,False,True,"{'James A. Feldman': {'id': 'james_a_feldman',...",1.0,4.0,2012-024-01,"{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal..."
7240,2013_13-339,2013,573 US 1,CTS Corp. v. Waldburger,CTS Corp.,"Peter Waldberger, et al.",13-339,Roberts Court,"Jun 9, 2014",https://www.oyez.org/cases/2013/13-339,...,True,False,{'Joseph R. Palmore': {'id': 'joseph_r_palmore...,1.0,3.0,2013-054-01,"{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 3.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal..."


In [47]:
X_train_tfidf = pd.read_csv('data/X_train_tfidf', index_col='id')
X_test_tfidf = pd.read_csv('data/X_test_tfidf', index_col='id')
X_train_all = pd.read_csv('data/X_train_all', index_col='id')
X_test_all = pd.read_csv('data/X_test_all', index_col='id')
X_train_feat = pd.read_csv('data/X_train_feat', index_col='id')
X_test_feat = pd.read_csv('data/X_test_feat', index_col='id')
y_train = pd.read_csv('data/y_train')
y_test = pd.read_csv('data/y_test')

In [48]:
print(y_train.head(3))

   Unnamed: 0  win_side
0         760       0.0
1         204       1.0
2         362       1.0


In [49]:
y_train = y_train.loc[:, 'win_side']
y_test = y_test.loc[:, 'win_side']

print(y_train.head(3))

0    0.0
1    1.0
2    1.0
Name: win_side, dtype: float64


In [51]:
logit_tfidf = LogisticRegression()
logit_tfidf = logit_tfidf.fit(X = X_train_tfidf, y = y_train)
logit_tfidf.predict(X_test_tfidf)
mean_acc = logit_tfidf.score(X_test_tfidf, y_test)
print(mean_acc)

logit_feat = LogisticRegression()
logit_feat = logit_feat.fit(X = X_train_feat, y = y_train)
logit_feat.predict(X_test_feat)
mean_acc = logit_feat.score(X_test_feat, y_test)
print(mean_acc)

logit_all = LogisticRegression()
logit_all = logit_all.fit(X = X_train_all, y = y_train)
logit_all.predict(X_test_all)
mean_acc = logit_all.score(X_test_all, y_test)
print(mean_acc)

0.624390243902439
0.6292682926829268
0.6731707317073171


In [62]:
# Solvers excluded:
# liblinear (limited to one-vs-rest schemas)
# sag and saga (convergence issues without additional preprocessing)

param_grid = {
    'solver': ['lbfgs', 'newton-cg', 'newton-cholesky'],
    'penalty': ['l2', 'none']
}

In [63]:
search = GridSearchCV(logit_all, param_grid, n_jobs=-1)
search.fit(X_train_all, y_train)

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further opti

In [67]:
param_grid = {
    'solver': ['lbfgs', 'newton-cg', 'newton-cholesky'],
    'penalty': ['l2', None]
}

In [77]:
search = GridSearchCV(logit_all, param_grid, n_jobs=-1, return_train_score=True)
search.fit(X_train_all, y_train)

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further opti

In [78]:
print('Best parameters found:\n', search.best_params_)

Best parameters found:
 {'penalty': 'l2', 'solver': 'lbfgs'}


In [79]:
results = pd.DataFrame(search.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,1.310372,0.042441,0.063959,0.005211,l2,lbfgs,"{'penalty': 'l2', 'solver': 'lbfgs'}",0.658537,0.664634,0.70122,...,0.66132,0.022122,1,0.770642,0.756881,0.753823,0.775573,0.763359,0.764055,0.008157
1,1.498543,0.089841,0.064433,0.011148,l2,newton-cg,"{'penalty': 'l2', 'solver': 'newton-cg'}",0.658537,0.664634,0.70122,...,0.66132,0.022122,1,0.770642,0.756881,0.753823,0.775573,0.763359,0.764055,0.008157
2,384.916435,13.91925,0.119664,0.024652,l2,newton-cholesky,"{'penalty': 'l2', 'solver': 'newton-cholesky'}",0.658537,0.664634,0.70122,...,0.66132,0.022122,1,0.770642,0.756881,0.753823,0.775573,0.763359,0.764055,0.008157
3,0.829251,0.076787,0.0686,0.006794,,lbfgs,"{'penalty': None, 'solver': 'lbfgs'}",0.603659,0.603659,0.579268,...,0.597808,0.012138,5,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,2.005399,0.127007,0.079471,0.01041,,newton-cg,"{'penalty': None, 'solver': 'newton-cg'}",0.579268,0.609756,0.597561,...,0.597808,0.018517,6,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,154.068194,13.893174,0.140252,0.029886,,newton-cholesky,"{'penalty': None, 'solver': 'newton-cholesky'}",0.615854,0.603659,0.579268,...,0.601474,0.015434,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [80]:
results = results.loc[:, ['param_solver', 'param_penalty', 'mean_train_score', 'mean_test_score', 'rank_test_score']]
results = results.sort_values('rank_test_score')
results

Unnamed: 0,param_solver,param_penalty,mean_train_score,mean_test_score,rank_test_score
0,lbfgs,l2,0.764055,0.66132,1
1,newton-cg,l2,0.764055,0.66132,1
2,newton-cholesky,l2,0.764055,0.66132,1
5,newton-cholesky,,1.0,0.601474,4
3,lbfgs,,1.0,0.597808,5
4,newton-cg,,1.0,0.597808,6
