In [10]:
import numpy as np
import pandas as pd
import regex as re
import gender_guesser.detector as gender

In [3]:
# https://stackoverflow.com/questions/31621414/share-data-between-ipython-notebooks/60863662
%store -r utts
utts.columns

Index(['timestamp', 'text', 'speaker', 'reply_to', 'conversation_id',
       'meta.case_id', 'meta.start_times', 'meta.stop_times',
       'meta.speaker_type', 'meta.side', 'meta.timestamp', 'vectors',
       'utt_counts'],
      dtype='object')

In [79]:
# https://pypi.org/project/gender-guesser/
d = gender.Detector()

def guess_gender(name):
    name = re.sub('j__', '', name)
    name = re.sub('_\S*', '', name)
    name = name.capitalize()
    guess = d.get_gender(name, 'usa')
    return guess

In [101]:
# Guess gender
gendr = utts.loc[:, ['meta.case_id', 'speaker' ]].copy()
gendr.loc[:, 'gender'] = gendr.loc[:, 'speaker'].map(guess_gender)
print('Before subsetting:')
print(gendr.loc[:, 'gender'].unique())
print(gendr.groupby('gender').agg({'gender': ['count']}).droplevel)

# Restrict to confident guesses
gendr = gendr.loc[gendr.loc[:, 'gender'].isin(['female', 'mostly_female', 
                                               'mostly_male', 'male'])]
print('\nAfter subsetting:')
print(gendr.loc[:, 'gender'].unique())
print(gendr.groupby('gender').agg({'gender': ['count']}).droplevel)

# Compute female_utt_share 
    # = (N female + N mostly_female) / 
    # (N female + N mostly_female + N male + N mostly_male)
gendr.loc[:, 'gender_num'] = gendr.loc[:, 'gender'].isin(['female', 
                                                          'mostly_female'])
gendr = gendr.groupby('meta.case_id').agg({'gender_num': ['sum', 'count']})
gendr.columns = ['female_utts', 'total_utts']
gendr.loc[:, 'female_utt_share'] = (gendr.loc[:, 'female_utts'] /
                                    gendr.loc[:, 'total_utts'])
gendr = gendr.loc[:, ['female_utt_share']]

Before subsetting:
['male' 'female' 'andy' 'mostly_male' 'unknown' 'mostly_female']
<bound method NDFrame.droplevel of                gender
                count
gender               
andy            29443
female          41616
male           158444
mostly_female    1926
mostly_male      3099
unknown          8564>

After subsetting:
['male' 'female' 'mostly_male' 'mostly_female']
<bound method NDFrame.droplevel of                gender
                count
gender               
female          41616
male           158444
mostly_female    1926
mostly_male      3099>


In [105]:
print('Summary Stats for female_utt_share')
print('Min:', gendr.loc[:, 'female_utt_share'].min())
print('Max:', gendr.loc[:, 'female_utt_share'].max())
print('Mean:', gendr.loc[:, 'female_utt_share'].mean())
print('Median:', gendr.loc[:, 'female_utt_share'].median())

Summary Stats for female_utt_share
Min: 0.008620689655172414
Max: 0.8285714285714286
Mean: 0.2155638073950437
Median: 0.17346938775510204
