In [1]:
import numpy as np
import pandas as pd

In [2]:
# https://stackoverflow.com/questions/31621414/share-data-between-ipython-notebooks/60863662
# Reload the variable `utts` from IPython's %store persistence. It must have
# been saved beforehand with `%store utts` (presumably by a sibling notebook —
# TODO confirm which one), otherwise the next line fails with a NameError.
%store -r utts
# Show the column labels as a quick check of what was restored.
utts.columns

Index(['timestamp', 'text', 'speaker', 'reply_to', 'conversation_id',
       'meta.case_id', 'meta.start_times', 'meta.stop_times',
       'meta.speaker_type', 'meta.side', 'meta.timestamp', 'vectors',
       'utt_counts'],
      dtype='object')

In [3]:
# Flag utterances whose speaker type is 'J'. The flag is stored on `utts`
# itself, so the column stays available after this cell.
utts['justice'] = utts['meta.speaker_type'].eq('J')

# Per case: how many utterances carry the flag, and how many there are in total.
df = (
    utts.groupby('meta.case_id', as_index=False)
    .agg(justice_utts=('justice', 'sum'), tot_utts=('justice', 'count'))
)
df['justice_utt_share'] = df['justice_utts'].div(df['tot_utts'])

# Keep only the case id and the share for downstream use.
justices = df[['meta.case_id', 'justice_utt_share']]
print(justices)

# Summary statistics of the per-case share.
justice_share = justices['justice_utt_share']
print('Min:', justice_share.min())
print('Max:', justice_share.max())
print('Mean:', justice_share.mean())

       meta.case_id  justice_utt_share
0      2005_03-1238           0.515571
1      2005_04-1034           0.513353
2     2005_04-10566           0.525333
3      2005_04-1067           0.534954
4      2005_04-1084           0.526611
...             ...                ...
1018    2019_19-631           0.572193
1019    2019_19-635           0.583916
1020     2019_19-67           0.532710
1021      2019_19-7           0.538793
1022    2019_19-715           0.593750

[1023 rows x 2 columns]
Min: 0.48
Max: 0.7347826086956522
Mean: 0.5158600540883759


In [4]:
# Share of advocate utterances (speaker type 'A') made by side 1, which this
# notebook labels the petitioner, computed per case in a single chain.
advocates = (
    utts[utts['meta.speaker_type'].eq('A')]
    # `assign` returns a new frame, so `utts` is not modified here.
    .assign(petitioner_advocate=lambda frame: frame['meta.side'].eq(1))
    .groupby('meta.case_id', as_index=False)
    .agg(
        petitioner_advocate_utts=('petitioner_advocate', 'sum'),
        total_advocate_utts=('petitioner_advocate', 'count'),
    )
    .assign(
        petitioner_advocate_utt_share=lambda counts: (
            counts['petitioner_advocate_utts'] / counts['total_advocate_utts']
        )
    )
    [['meta.case_id', 'petitioner_advocate_utt_share']]
)
print(advocates)

# Summary statistics of the per-case share.
petitioner_share = advocates['petitioner_advocate_utt_share']
print('Min', petitioner_share.min())
print('Max', petitioner_share.max())
print('Mean', petitioner_share.mean())



       meta.case_id  petitioner_advocate_utt_share
0      2005_03-1238                       1.000000
1      2005_04-1034                       0.603659
2     2005_04-10566                       0.533708
3      2005_04-1067                       0.483660
4      2005_04-1084                       0.514793
...             ...                            ...
1018    2019_19-631                       0.450000
1019    2019_19-635                       0.361345
1020     2019_19-67                       0.540000
1021      2019_19-7                       0.149533
1022    2019_19-715                       0.307692

[1023 rows x 2 columns]
Min 0.0
Max 1.0
Mean 0.45755947906632044


In [5]:
# Attach the petitioner-advocate share to every case in `justices`; the left
# join keeps a case even when `advocates` has no row for it.
utt_shares = justices.merge(advocates, on='meta.case_id', how='left')
utt_shares

Unnamed: 0,meta.case_id,justice_utt_share,petitioner_advocate_utt_share
0,2005_03-1238,0.515571,1.000000
1,2005_04-1034,0.513353,0.603659
2,2005_04-10566,0.525333,0.533708
3,2005_04-1067,0.534954,0.483660
4,2005_04-1084,0.526611,0.514793
...,...,...,...
1018,2019_19-631,0.572193,0.450000
1019,2019_19-635,0.583916,0.361345
1020,2019_19-67,0.532710,0.540000
1021,2019_19-7,0.538793,0.149533


In [6]:
# https://pypi.org/project/gender-guesser/
# Import under its own alias: the previous alias `gender` was overwritten by
# the DataFrame assigned below, which left the detector module unreachable.
import gender_guesser.detector as gender_detector

# Working copy of the case id and speaker columns of `utts`.
gender = utts.loc[:, ['meta.case_id', 'speaker']].copy()
gender

# TODO: process speaker names, predict gender, compute share_female
# NOTE: gender-guesser will return 'unknown', 'andy' (androgynous), 'male',
#       'female', 'mostly_male', or 'mostly_female'
# PROPOSED: share_female = (N female + N mostly_female) /
#                          (N female + N mostly_female + N male + N mostly_male)

Unnamed: 0,meta.case_id,speaker
0,2005_04-433,j__john_g_roberts_jr
1,2005_04-433,david_c_frederick
2,2005_04-433,j__david_h_souter
3,2005_04-433,david_c_frederick
4,2005_04-433,j__david_h_souter
...,...,...
243088,2019_19-67,j__sonia_sotomayor
243089,2019_19-67,eric_j_feigin
243090,2019_19-67,j__sonia_sotomayor
243091,2019_19-67,eric_j_feigin
