# Enrich Gender
Gender is enriched with Gender Guesser (https://pypi.org/project/gender-guesser/) and a Wikipedia person file (https://github.com/irgroup-classrooms/dis22-2022/tree/64-finaler-dcatensatz-mit-klickzahlen/finalDataset).


Input file: 'WP_article_coref_NER_large_final.pkl', produced by the notebook Washington_Post_w_coref.ipynb

Output file: used as input by the notebook Auswertung_WP_Gender_im_article_w_coref.ipynb

### Preliminaries

In [1]:
# Imports and session-wide configuration
import pandas as pd

import warnings
# NOTE: this silences every warning for the rest of the session, including
# pandas FutureWarning/DeprecationWarning messages about changed defaults.
warnings.filterwarnings('ignore') # (action='once')

from pandarallel import pandarallel
# Initialise pandarallel (provides `parallel_apply`) with a progress bar.
pandarallel.initialize(progress_bar=True)


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Load the input pickle into `df`
input_pickle_path = '/Users/landsiedelj/washington_post_bias/WP_article_coref_NER_large_final.pkl'
df = pd.read_pickle(input_pickle_path)

## Split first, middle and last name into separate columns

In [3]:
# Write first, middle and last name in different cols
def split_name(df, var):
    """Split the full names in ``df[var]`` into first, middle and last name.

    Each value is split on runs of whitespace. The first token becomes
    ``first_name``; when there are at least two tokens, the last one becomes
    ``last_name``; all tokens in between are joined with single spaces into
    ``middle_name``. Missing or non-string values yield three empty strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the name column.
    var : str
        Label of the column containing the full names.

    Returns
    -------
    pandas.DataFrame
        Columns ``first_name``, ``middle_name`` and ``last_name`` (always all
        three, always strings), indexed like ``df``.
    """
    columns = ['first_name', 'middle_name', 'last_name']
    result = []

    for value in df[var]:
        # Tokenise every value on its own. Splitting the whole column with
        # expand=True pads shorter names with None up to the longest name, so
        # positional access such as iloc[-1] would pick the padding instead of
        # the real last token.
        tokens = value.split() if isinstance(value, str) else []

        # Start from '' for all three parts so no cell ends up as NaN/None.
        info = dict.fromkeys(columns, '')
        if tokens:
            info['first_name'] = tokens[0]
        if len(tokens) >= 2:
            info['last_name'] = tokens[-1]
            # Empty string when there are exactly two tokens.
            info['middle_name'] = ' '.join(tokens[1:-1])
        result.append(info)

    return pd.DataFrame(result, index=df.index, columns=columns)

# Build the name-part columns from 'entity' and attach them to df by index.
df_names = split_name(df, var='entity')
df = df.join(other=df_names)

# Enrich gender with different options

## Wikipedia File

In [4]:
# Load the Wikipedia person spreadsheet into `df_person`
wiki_person_path = '/Users/landsiedelj/Downloads/WikiPersonen_lebend_mitGeschlecht_mitGeburtsland_mitBerufskategorien_mitKlickzahlen.xlsx'
df_person = pd.read_excel(wiki_person_path)


In [5]:
# Clean df_person: give the two source columns code-friendly names
wiki_dictionary = {"entity_per": "wikipedia_eintrag", "Name, Vorname": "name_vorname"}

# rename() without inplace returns a new frame, so df_person is rebound to a
# renamed copy and the originally loaded object is left untouched.
df_person = df_person.rename(columns=wiki_dictionary)

In [6]:
# Keep only the text after the last comma, then drop its leading whitespace
df_person['wikipedia_eintrag'] = (
    df_person['wikipedia_eintrag']
    .str.rsplit(',', n=1)  # one split from the right isolates the last segment
    .str[-1]
    .str.lstrip()
)

In [7]:
# Delete everything from the first '(' to the last ')'.
# regex=True must be explicit: since pandas 2.0 the default is regex=False, in
# which case the pattern would be searched literally and nothing removed.
df_person['wikipedia_eintrag'] = df_person['wikipedia_eintrag'].str.replace(r'\(.*\)', '', regex=True)
# Drop the trailing whitespace left behind by the removal.
df_person['wikipedia_eintrag'] = df_person['wikipedia_eintrag'].str.rstrip()

In [8]:
# Look up each entity's gender by exact name match against the Wikipedia names.
# For a name that occurs several times in df_person, the last row wins.
gender_by_wiki_name = dict(zip(df_person['wikipedia_eintrag'], df_person['Geschlecht']))
df['wiki_gender'] = df['entity'].map(gender_by_wiki_name)

In [9]:
# Distribution of matched genders, followed by the number of unmatched rows
print(df['wiki_gender'].value_counts())
print(df['wiki_gender'].isna().sum())

male          766805
female        138526
non-binary       331
Name: wiki_gender, dtype: int64
1867609


## Gender Guesser

Get gender from https://pypi.org/project/gender-guesser/

In [10]:
# !pip install gender_guesser    
import gender_guesser.detector as gender
gd = gender.Detector()

# Capitalise the first names before the lookup (first letter upper, rest lower).
first_names_capitalized = df['first_name'].str.capitalize()

# Query the detector once per distinct name instead of once per row, then
# broadcast the answers back to all rows. The result is the same as calling
# get_gender row by row, but the costly lookup runs far fewer times.
gender_by_first_name = {
    name: gd.get_gender(name)
    for name in first_names_capitalized.dropna().unique()
}
df['gender_guesser'] = first_names_capitalized.map(gender_by_first_name)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=346659), Label(value='0 / 346659')…

> __unknown__ (name not found),
 __andy__ (androgynous), 
 __male__, 
 __female__, 
 __mostly_male__, 
 or __mostly_female__. 
 
 The difference between andy and unknown is that the former is found to have the same probability of being male as of being female, while the latter means that the name wasn’t found in the database.

In [11]:
# Frequency of each label returned by gender-guesser
df['gender_guesser'].value_counts()

male             1498199
unknown           670073
female            394742
mostly_male       101044
mostly_female      74283
andy               34930
Name: gender_guesser, dtype: int64

In [12]:
# Combine both columns: keep the Wikipedia gender where available and fall
# back to the gender-guesser label otherwise.
# Assign the result back explicitly. An in-place fillna on `df.wiki_gender`
# is chained assignment: it acts on an intermediate Series and does not update
# `df` when pandas Copy-on-Write is active.
df['wiki_gender'] = df['wiki_gender'].fillna(df['gender_guesser'])

In [13]:
# Remove the columns that are no longer needed, one after another and in place
for unused_column in (
    'gender_guesser',
    'first_name',
    'middle_name',
    'last_name',
    'merged_total_text',
    'entity_type',
):
    del df[unused_column]

In [14]:
# Label distribution after combining both sources
print(df['wiki_gender'].value_counts())

male             1607892
unknown           612385
female            406255
mostly_male        69321
mostly_female      49677
andy               27410
non-binary           331
Name: wiki_gender, dtype: int64


In [15]:
# df.to_json('WP_article_100K_with_gender.jsonl', lines=True, orient='records')
# Persist the enriched frame as a pickle
output_pickle_path = '/Users/landsiedelj/Downloads/wp_final/WP_article_100K_with_coref_gender_final.pkl'
df.to_pickle(output_pickle_path)