In [12]:
import json
from urllib.parse import quote
from urllib.request import urlopen

import numpy as np
import pandas as pd

In [13]:
def clean_name(s):
    """Lower-case a name and percent-encode it for use in a URL query string.

    Spaces become ``%20`` as before. Every other character that is not
    URL-safe (non-ASCII letters such as accented characters, ``&``, ``#``,
    ``%``, ``/`` ...) is now percent-encoded as UTF-8 as well, instead of
    being passed through raw.

    Args:
        s: The name to encode; must be a ``str``.

    Returns:
        The lower-cased, percent-encoded name.
    """
    # safe='' so that '/' is escaped too; unreserved characters
    # (letters, digits, '.', '-', '_', '~') are left untouched by quote().
    return quote(s.lower(), safe='')

def gen_api_call(firstname, middlename, surname, country):
    """Build the query-string fragment for one name lookup.

    The fragment starts with ``&split=`` followed by the cleaned first name;
    the cleaned middle name and surname are appended (joined by ``%20``)
    when they are truthy, and ``&country=<country>`` is appended when
    ``country`` is truthy.

    NOTE(review): ``country`` is inserted as-is, without any encoding.
    """
    name_parts = [clean_name(firstname)]
    for optional_part in (middlename, surname):
        if optional_part:
            name_parts.append(clean_name(optional_part))

    query = '&split=' + '%20'.join(name_parts)
    if country:
        query += '&country={}'.format(country)
    return query

def call_api(call):
    """Send one request to the API and return the decoded JSON response.

    Args:
        call: Query-string fragment that is appended to the module-level
            ``URL`` (``URL`` is not defined in this part of the notebook and
            must exist before this function is called).

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        Whatever ``urlopen`` raises for network/HTTP failures, and
        ``json.JSONDecodeError`` if the body is not valid JSON.
    """
    request_url = URL + call
    # The context manager closes the HTTP response even if reading or
    # decoding fails; previously the connection was never closed.
    with urlopen(request_url) as response:
        decoded = response.read().decode('utf-8')
    return json.loads(decoded)

In [14]:
import yaml
def check_error(results):
    """Return the ``errno`` field of a serialized API response, if any.

    Args:
        results: Text that parses as YAML (in this notebook, the string form
            of a response dict as stored in the ``results`` column).

    Returns:
        The value stored under ``'errno'``, or ``None`` when the parsed value
        is not a mapping or has no such key.

    Raises:
        Parsing errors from ``yaml.safe_load`` are not caught here.
    """
    # safe_load only builds plain Python data (no arbitrary object
    # construction), and unlike yaml.load() it does not need an explicit
    # Loader argument, which PyYAML >= 6 requires.
    res = yaml.safe_load(results)
    if isinstance(res, dict):
        return res.get('errno')
    return None
        
def get_gender(results):
    """Return the ``gender`` field of a serialized API response, if any.

    Args:
        results: Text that parses as YAML (in this notebook, the string form
            of a response dict as stored in the ``results`` column).

    Returns:
        The value stored under ``'gender'``, or ``None`` when the parsed
        value is not a mapping or has no such key.

    Raises:
        Parsing errors from ``yaml.safe_load`` are not caught here.
    """
    # safe_load only builds plain Python data (no arbitrary object
    # construction), and unlike yaml.load() it does not need an explicit
    # Loader argument, which PyYAML >= 6 requires.
    res = yaml.safe_load(results)
    if isinstance(res, dict):
        return res.get('gender')
    return None
    
def get_accuracy(results):
    """Return the ``accuracy`` field of a serialized API response, if any.

    Args:
        results: Text that parses as YAML (in this notebook, the string form
            of a response dict as stored in the ``results`` column).

    Returns:
        The value stored under ``'accuracy'``, or ``None`` when the parsed
        value is not a mapping or has no such key.

    Raises:
        Parsing errors from ``yaml.safe_load`` are not caught here.
    """
    # safe_load only builds plain Python data (no arbitrary object
    # construction), and unlike yaml.load() it does not need an explicit
    # Loader argument, which PyYAML >= 6 requires.
    res = yaml.safe_load(results)
    if isinstance(res, dict):
        return res.get('accuracy')
    return None

In [52]:
# Load the stored gender-API responses; each row carries the serialized
# response in its `results` column.
# NOTE(review): in the saved preview, row 0 repeats the column names as data
# ('id', 'name', 'Ofirstnam', ...), so the CSV appears to contain a second
# header line -- confirm whether that row should be dropped.
df = pd.read_csv('data/arxiv_first_ro_gender2_result.csv')

In [53]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,name,Ofirstnam,Omidnam,Osurname,call,results
0,id,name,Ofirstnam,Omidnam,Osurname,&name=ofirstnam%20omidnam,"{'name': 'ofirstnam omidnam', 'name_sanitized'..."
1,arxiv_ro_2,Alexander Yu. Vlasov,ALEXANDER,YU.,VLASOV,&name=alexander%20yu.,"{'name': 'alexander yu.', 'name_sanitized': 'A..."
2,arxiv_ro_3,Andreas Siebert,ANDREAS,,SIEBERT,&name=andreas,"{'name': 'andreas', 'name_sanitized': 'Andreas..."
3,arxiv_ro_4,Stephen L. Adler,STEPHEN,,ADLER,&name=stephen,"{'name': 'stephen', 'name_sanitized': 'Stephen..."
4,arxiv_ro_5,Soumyadeep Paul,SOUMYADEEP,,PAUL,&name=soumyadeep,"{'name': 'soumyadeep', 'name_sanitized': 'Soum..."


In [54]:
# Keep only the first occurrence of rows that are identical in every column.
# NOTE(review): if `Unnamed: 0` from the preview is a saved row index stored
# as a regular column, it takes part in the comparison and will keep
# otherwise-identical rows apart -- confirm.
df = df.drop_duplicates()

In [55]:
# Look at one raw `results` string to see which fields a response contains.
df['results'].iloc[1]

"{'name': 'alexander yu.', 'name_sanitized': 'Alexander', 'country': '', 'gender': 'male', 'samples': 56237, 'accuracy': 99, 'duration': '24ms', 'credits_used': 1}"

In [56]:
# Extract the predicted gender and its accuracy score from every stored
# response and attach them as the `gender` and `accuracy` columns.
df = df.assign(
    gender=df['results'].map(get_gender),
    accuracy=df['results'].map(get_accuracy),
)

In [57]:
# Distribution of the extracted genders; dropna=False also counts rows for
# which no gender could be extracted.
df['gender'].value_counts(dropna=False)

male       39950
female      7826
unknown     1355
NaN            1
Name: gender, dtype: int64

In [58]:
# Spot-check the first ten rows with the new `gender` / `accuracy` columns.
df.head(10)

Unnamed: 0,id,name,Ofirstnam,Omidnam,Osurname,call,results,gender,accuracy
0,id,name,Ofirstnam,Omidnam,Osurname,&name=ofirstnam%20omidnam,"{'name': 'ofirstnam omidnam', 'name_sanitized'...",unknown,0.0
1,arxiv_ro_2,Alexander Yu. Vlasov,ALEXANDER,YU.,VLASOV,&name=alexander%20yu.,"{'name': 'alexander yu.', 'name_sanitized': 'A...",male,99.0
2,arxiv_ro_3,Andreas Siebert,ANDREAS,,SIEBERT,&name=andreas,"{'name': 'andreas', 'name_sanitized': 'Andreas...",male,99.0
3,arxiv_ro_4,Stephen L. Adler,STEPHEN,,ADLER,&name=stephen,"{'name': 'stephen', 'name_sanitized': 'Stephen...",male,99.0
4,arxiv_ro_5,Soumyadeep Paul,SOUMYADEEP,,PAUL,&name=soumyadeep,"{'name': 'soumyadeep', 'name_sanitized': 'Soum...",male,100.0
5,arxiv_ro_6,Sudipta N. Sinha,SUDIPTA,,SINHA,&name=sudipta,"{'name': 'sudipta', 'name_sanitized': 'Sudipta...",male,82.0
6,arxiv_ro_7,Amitabha Mukerjee,AMITABHA,,MUKERJEE,&name=amitabha,"{'name': 'amitabha', 'name_sanitized': 'Amitab...",male,97.0
7,arxiv_ro_8,Mireille Boutin,MIREILLE,,BOUTIN,&name=mireille,"{'name': 'mireille', 'name_sanitized': 'Mireil...",female,98.0
8,arxiv_ro_9,Kagan Tumer,KAGAN,,TUMER,&name=kagan,"{'name': 'kagan', 'name_sanitized': 'Kagan', '...",male,95.0
9,arxiv_ro_10,Joydeep Ghosh,JOYDEEP,,GHOSH,&name=joydeep,"{'name': 'joydeep', 'name_sanitized': 'Joydeep...",male,100.0


In [61]:
# Columns kept in the exported gender table (the raw `results` and `call`
# columns are left out).
cols = ['id', 'name', 'Ofirstnam', 'Osurname', 'gender', 'accuracy']
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, which shows up as `Unnamed: 0` when the file is read back; the
# target directory must already exist.
df[cols].to_csv('data/api_results/gender_results/arxiv_first_ro_gender2.csv')

In [61]:
# Preview the frame after the export.
# NOTE(review): the output saved under this cell lists columns such as
# `author_name` and `len_name` that the frame built above does not have, and
# the execution count 61 appears twice -- the saved output looks stale;
# re-run from a fresh kernel to confirm.
df.head()

Unnamed: 0,name,author_name,author_middle_name,author_surname,results,gender,accuracy,len_name,len_middle_name
0,name,author_name,author_middle_name,author_surname,"{'last_name': 'Name', 'first_name': '', 'stric...",unknown,0,11.0,18.0
1,K. M. Lochner,K.,M.,Lochner,"{'last_name': 'Lochner', 'first_name': 'K. M.'...",,61,1.0,1.0
2,D. M. Reeves,D.,M.,Reeves,"{'last_name': 'Reeves', 'first_name': 'D. M.',...",,73,1.0,1.0
3,Y. Vorobeychik,Y.,,Vorobeychik,"{'last_name': 'Y.', 'first_name': '', 'strict'...",,0,1.0,
4,M. P. Wellman,M.,P.,Wellman,"{'last_name': 'Wellman', 'first_name': 'M. P.'...",,73,1.0,1.0


## Ethnicity

In [4]:
# Load the stored ethnicity responses and drop rows that are exact
# duplicates across all columns.
df = pd.read_csv('data/arxiv_first_sy_ethnic_result.csv').drop_duplicates()

In [5]:
# Show which fields the first stored response contains.
# safe_load only builds plain Python data and, unlike yaml.load(), does not
# need an explicit Loader argument (required by PyYAML >= 6).
yaml.safe_load(df['results'].iloc[0]).keys()

dict_keys(['2PRACE', 'Hispanic', 'API', 'Black', 'AIAN', 'White'])

In [6]:
import yaml
# Field names expected in each parsed `results` mapping, in output order.
KEYS = ['2PRACE', 'Hispanic', 'API', 'Black', 'AIAN', 'White']
# DataFrame column names for the per-field scores: 'NamePrism_<field>'.
new_KEYS = ['NamePrism_{}'.format(x) for x in KEYS]


def get_ethnicity(results):
    """Extract the per-category scores from one serialized response.

    Args:
        results: Text that parses as YAML into a mapping containing every
            name in ``KEYS``.

    Returns:
        A tuple with one value per entry of ``KEYS`` (same order). If the
        text cannot be parsed or a field is missing, a tuple of ``None``
        values of the same length.
    """
    try:
        # safe_load only builds plain Python data and, unlike yaml.load(),
        # does not need an explicit Loader argument (required by PyYAML >= 6).
        res = yaml.safe_load(results)
        return tuple(res[key] for key in KEYS)
    except Exception:
        # Deliberately best-effort: an unusable response yields all-None
        # scores instead of aborting the whole column. `Exception` (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit through.
        return (None,) * len(KEYS)


def get_max_ethnicity(row):
    """Return the name from ``KEYS`` whose score is highest in ``row``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by every
            name in ``new_KEYS``.

    Returns:
        The entry of ``KEYS`` with the largest score, or ``None`` if any
        score is not a float or is NaN.
    """
    scores = []
    for column in new_KEYS:
        value = row[column]
        # NaN is a float, so it used to pass the type check; np.argmax then
        # returns the position of the first NaN, which labelled rows with
        # missing scores (all-NaN rows became KEYS[0]). Treat NaN as missing.
        if not isinstance(value, float) or np.isnan(value):
            return None
        scores.append(value)

    return KEYS[np.argmax(scores)]

In [7]:
# Create the six score columns up front, filled with None.
for k in new_KEYS:
    df[k] = None

# get_ethnicity returns one 6-tuple per row; zip(*...) transposes those into
# six per-column tuples, which are assigned to the columns in new_KEYS order.
# NOTE(review): unpacking into six targets fails if `df` has no rows.
df[new_KEYS[0]], df[new_KEYS[1]], df[new_KEYS[2]], df[new_KEYS[3]], df[new_KEYS[4]], df[new_KEYS[5]] = zip(*df['results'].map(get_ethnicity))


In [8]:
# Label every row with the category that has the highest score.
df['NamePrism_ethnicity'] = df.apply(get_max_ethnicity, axis=1)

In [9]:
# Preview the per-category score columns and the derived label.
df.head()

Unnamed: 0,id,name,Ofirstnam,Omidnam,Osurname,results,NamePrism_2PRACE,NamePrism_Hispanic,NamePrism_API,NamePrism_Black,NamePrism_AIAN,NamePrism_White,NamePrism_ethnicity
0,id,name,Ofirstnam,Omidnam,Osurname,"{'2PRACE': 1.5983701819861723e-06, 'Hispanic':...",1.59837e-06,0.0372441,0.0560092,0.0373316,6.83087e-06,0.869407,White
1,arxiv_sy_0,Sandro A. Coelho,SANDRO,,COELHO,"{'2PRACE': 1.0575078031311818e-06, 'Hispanic':...",1.05751e-06,0.0623419,0.00415719,0.00011892,1.64203e-07,0.933381,White
2,arxiv_sy_1,Diego Moussallem,DIEGO,,MOUSSALLEM,"{'2PRACE': 0.0002782917126488064, 'Hispanic': ...",0.000278292,0.507371,0.0103683,0.00820891,0.00090819,0.472865,Hispanic
3,arxiv_sy_2,Gustavo C. Publio,GUSTAVO,,PUBLIO,"{'2PRACE': 0.00022398947128404972, 'Hispanic':...",0.000223989,0.854625,0.00644003,0.00260238,0.000682087,0.135426,Hispanic
4,arxiv_sy_3,Diego Esteves,DIEGO,,ESTEVES,"{'2PRACE': 0.0005175848450145907, 'Hispanic': ...",0.000517585,0.838679,0.0194822,0.000954493,0.000505777,0.139861,Hispanic


In [10]:
# Columns to export: identifiers, the derived label, then every score column.
cols = ['id', 'name', 'NamePrism_ethnicity', *new_KEYS]

In [11]:
# Write the selected columns to the ethnicity results file.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; the target directory must already exist.
df[cols].to_csv('data/api_results/ethnicity_results/arxiv_first_sy_ethnic_result.csv')