In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import unicodedata
import pycountry

In [2]:
# Load the reference sample (a Stata file) and preview it. Its name columns
# (Ofirstnam, Omidnam, Osurname) are the names the AAAI export further down
# is renamed to.
sample = pd.read_stata('../data/sample.dta')
sample.head()

Unnamed: 0,id,Ofirstnam,Omidnam,Osurname,msa
0,1.0,A,DALE,MAYO,9999.0
1,2.0,MARY,ANN,SKREZEC,7362.0
2,3.0,RAY,HOWARD,SUHLER,6450.0
3,4.0,VUONG,TUYEN,KHON,9999.0
4,5.0,SON,SUK,LEE,3920.0


## AAAI Conference

In [3]:
# Load the full AAAI author table and report its row count.
aaai_df = pd.read_csv('../data/aaai_authors.csv')
len(aaai_df)

# NOTE(review): leftover scratch notes, presumably about year-specific parsing
# of the author/affiliation strings -- confirm their meaning or remove them.
# 2004
# 97
# change from 2002
# if ; in jj then:
# split by , and remove last as university

29349

In [4]:
# Preview the raw rows. NOTE(review): in this preview `university` still holds
# HTML span markup for one row, and some rows carry an affiliation in `country`
# while `university` is empty -- the columns look shifted for those rows.
aaai_df.head()

Unnamed: 0,year,paper_id,title,url,author_name,university,country
0,18,16583,Algorithms for Trip-Vehicle Assignment in Ride...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Xiaohui Bei,Nanyang Technological University,Singapore
1,18,16583,Algorithms for Trip-Vehicle Assignment in Ride...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Shengyu Zhang,<span>The Chinese University of Hong Kong</span>,Hong Kong
2,18,16893,EAD: Elastic-Net Attacks to Deep Neural Networ...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Pin-Yu Chen,IBM Research AI,United States
3,18,16893,EAD: Elastic-Net Attacks to Deep Neural Networ...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Yash Sharma,,"The Cooper Union, New York"
4,18,16893,EAD: Elastic-Net Attacks to Deep Neural Networ...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Huan Zhang,,"University of California, Davis"


## Keep only papers from 2005-2018

In [5]:
# Load the 2005-2018 subset (per the file name) and swap missing cells for
# None: NaN is truthy, so the `if not name` guard in clean_and_name would not
# recognise it as missing, whereas None is caught.
aaai_df2 = (
    pd.read_csv('../data/aaai_18_05.csv')
    .pipe(lambda frame: frame.where(frame.notna(), None))
)
# Number of rows (one per author entry) kept for the summary printed later.
total_aaai2 = aaai_df2.shape[0]
aaai_df2.head()

Unnamed: 0,year,paper_id,title,url,author_name,university,country
0,18,16583,Algorithms for Trip-Vehicle Assignment in Ride...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Xiaohui Bei,Nanyang Technological University,Singapore
1,18,16583,Algorithms for Trip-Vehicle Assignment in Ride...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Shengyu Zhang,<span>The Chinese University of Hong Kong</span>,Hong Kong
2,18,16893,EAD: Elastic-Net Attacks to Deep Neural Networ...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Pin-Yu Chen,IBM Research AI,United States
3,18,16893,EAD: Elastic-Net Attacks to Deep Neural Networ...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Yash Sharma,,"The Cooper Union, New York"
4,18,16893,EAD: Elastic-Net Attacks to Deep Neural Networ...,https://aaai.org/ocs/index.php/AAAI/AAAI18/pap...,Huan Zhang,,"University of California, Davis"


In [6]:
def clean_chars(s):
    """Return ``s`` upper-cased with combining accent marks removed.

    The string is decomposed with NFD so each accent becomes its own
    combining code point (Unicode category ``Mn``); those code points are
    dropped and the remaining characters are upper-cased.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept).upper()

def clean_and_name(name):
    """Normalise an author name and strip a stray 'AND' token from either end.

    The name is upper-cased and de-accented via ``clean_chars``, split on any
    run of whitespace, and re-joined with single spaces. A leading 'AND' and a
    trailing 'AND' (left over from splitting "A, B and C" author lists) are
    each removed when present.

    Parameters
    ----------
    name : str or None
        Raw author name. Non-string values (None, float NaN, ...) and the
        empty string are treated as missing.

    Returns
    -------
    str or None
        The cleaned name, or None when the input is missing. A name that
        consists only of 'AND' yields the empty string.
    """
    # unicodedata.normalize only accepts str; NaN from read_csv is a truthy
    # float and would otherwise raise TypeError inside clean_chars.
    if not isinstance(name, str) or not name:
        return None
    # split() with no separator collapses repeated/leading/trailing whitespace,
    # so an 'AND' next to stray spaces is still recognised.
    tokens = clean_chars(name).split()
    if tokens and tokens[0] == 'AND':
        tokens = tokens[1:]
    # Checked independently of the leading case so both ends are cleaned.
    if tokens and tokens[-1] == 'AND':
        tokens = tokens[:-1]
    return ' '.join(tokens)

def split_name(name):
    """Split an author name into first, middle and surname parts.

    The name is first normalised with ``clean_and_name``. It is then split on
    whitespace and mapped by token count:

    * 1 token  -> first name only
    * 2 tokens -> first name, surname
    * 3 tokens -> first name, middle name, surname
    * anything else -> all parts missing

    Any part that is a single character once periods are removed (an initial
    such as ``A`` or ``A.``) is replaced by None.

    Parameters
    ----------
    name : str or None
        Raw author name.

    Returns
    -------
    pandas.Series
        Three positional entries ``[first_name, middle_name, surname]``; each
        is a string or None.
    """
    name = clean_and_name(name)
    if not name:
        return pd.Series([None, None, None])

    # split() with no separator never yields empty tokens, so repeated, leading
    # or trailing spaces cannot shift a real name part into the wrong slot or
    # push the token count past three.
    tokens = str(name).split()
    if len(tokens) == 1:
        parts = [tokens[0], None, None]
    elif len(tokens) == 2:
        parts = [tokens[0], None, tokens[1]]
    elif len(tokens) == 3:
        parts = list(tokens)
    else:
        return pd.Series([None, None, None])

    # Drop bare initials: a part that is one character after removing periods.
    parts = [
        None if part and len(part.replace('.', '')) == 1 else part
        for part in parts
    ]
    return pd.Series(parts)

In [7]:
# Create the three name-part columns up front, then fill them from the Series
# that split_name returns for each author_name.
name_cols = ['first_name', 'middle_name', 'surname']
for col in name_cols:
    aaai_df2[col] = None
aaai_df2[name_cols] = aaai_df2['author_name'].apply(split_name)
aaai_df2[['author_name'] + name_cols + ['country']].head()

Unnamed: 0,author_name,first_name,middle_name,surname,country
0,Xiaohui Bei,XIAOHUI,,BEI,Singapore
1,Shengyu Zhang,SHENGYU,,ZHANG,Hong Kong
2,Pin-Yu Chen,PIN-YU,,CHEN,United States
3,Yash Sharma,YASH,,SHARMA,"The Cooper Union, New York"
4,Huan Zhang,HUAN,,ZHANG,"University of California, Davis"


In [8]:
# Show a bounded preview of the parsed names rather than dumping the whole frame.
aaai_df2[['author_name', 'first_name', 'middle_name', 'surname', 'country']].head(10)

Unnamed: 0,author_name,first_name,middle_name,surname,country
0,Xiaohui Bei,XIAOHUI,,BEI,Singapore
1,Shengyu Zhang,SHENGYU,,ZHANG,Hong Kong
2,Pin-Yu Chen,PIN-YU,,CHEN,United States
3,Yash Sharma,YASH,,SHARMA,"The Cooper Union, New York"
4,Huan Zhang,HUAN,,ZHANG,"University of California, Davis"
5,Jinfeng Yi,JINFENG,,YI,Tencent AI Lab
6,Cho-Jui Hsieh,CHO-JUI,,HSIEH,"University of California, Davis"
7,Jonathan Chung,JONATHAN,,CHUNG,University of Toronto
8,Moshe Eizenman,MOSHE,,EIZENMAN,University of Toronto
9,Uros Rakita,UROS,,RAKITA,University of Toronto


In [9]:
# Distinct (first, middle, surname, country) combinations across all author rows.
unique_authors = aaai_df2[['first_name', 'middle_name', 'surname', 'country']].drop_duplicates()
unique_aaai = len(unique_authors)

# Keep only combinations with a usable first name: not missing and not the
# literal string 'None'.
has_first_name = unique_authors['first_name'].notna() & (unique_authors['first_name'] != 'None')

# reset_index() exposes the original aaai_df2 row label as an 'index' column,
# which becomes the basis of the exported id. Columns are renamed to the
# Ofirstnam / Omidnam / Osurname convention.
first_aaai = (
    unique_authors[has_first_name]
    .reset_index()
    .rename(columns={'index': 'id',
                     'first_name': 'Ofirstnam',
                     'middle_name': 'Omidnam',
                     'surname': 'Osurname'})
)
first_aaai['id'] = first_aaai['id'].map('aaai_1_{}'.format)
unique_clean_names = len(first_aaai)

# NOTE: the 'papers' figure is the row count of aaai_df2, which holds one row
# per author entry (the same paper_id repeats across rows), not distinct papers.
print('papers {}'.format(total_aaai2))
print('unique {}'.format(unique_aaai))
print('unique clean {}'.format(unique_clean_names))
first_aaai.head()

papers 20233
unique 14379
unique clean 14147


Unnamed: 0,id,Ofirstnam,Omidnam,Osurname,country
0,aaai_1_0,XIAOHUI,,BEI,Singapore
1,aaai_1_1,SHENGYU,,ZHANG,Hong Kong
2,aaai_1_2,PIN-YU,,CHEN,United States
3,aaai_1_3,YASH,,SHARMA,"The Cooper Union, New York"
4,aaai_1_4,HUAN,,ZHANG,"University of California, Davis"


In [11]:
# Author rows for which no first name was extracted (left as a view of aaai_df2).
second_aaai = aaai_df2.loc[aaai_df2['first_name'].isna()]

In [42]:
 # Scratch probe, run out of sequence (execution count 42): look at the
 # alpha_2 attribute of the first pycountry entry.
 list(pycountry.countries)[0].alpha_2

'AW'

In [12]:
# Lookup table built from pycountry: `country` is the join key, `country_name`
# is a copy of the same name, and `country_code` is the alpha-2 code.
countries = pd.DataFrame(
    [(entry.name, entry.name, entry.alpha_2) for entry in pycountry.countries],
    columns=['country', 'country_name', 'country_code'],
)
countries.head(10)

Unnamed: 0,country,country_name,country_code
0,Aruba,Aruba,AW
1,Afghanistan,Afghanistan,AF
2,Angola,Angola,AO
3,Anguilla,Anguilla,AI
4,Åland Islands,Åland Islands,AX
5,Albania,Albania,AL
6,Andorra,Andorra,AD
7,United Arab Emirates,United Arab Emirates,AE
8,Argentina,Argentina,AR
9,Armenia,Armenia,AM


In [15]:
# Columns written to the cleaned-name export. Named `export_columns` rather
# than `vars` so the builtin vars() is not shadowed in the kernel.
export_columns = ['id', 'Ofirstnam', 'Omidnam', 'Osurname', 'country_code']

# Left join keeps every author row; rows whose `country` text does not exactly
# match a pycountry name get an empty country_code. validate='many_to_one'
# raises if the lookup ever has duplicate country names, which would otherwise
# silently duplicate author rows in the export.
aaai_with_codes = first_aaai.merge(countries, how='left', on='country', validate='many_to_one')
aaai_with_codes[export_columns].to_csv('../data/clean_name/aaai_first_18_05.csv', index=False)

In [16]:
# Row count of first_aaai. The merged frame in the export cell is not assigned
# back to first_aaai, so this does not measure the result of the merge.
len(first_aaai)

14147