In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt

In [2]:
# Load the Stata sample and preview it. Its columns (id, Ofirstnam, Omidnam,
# Osurname, ...) are the names the arXiv tables are renamed to further down.
sample = pd.read_stata('../data/sample.dta')
sample.head()

Unnamed: 0,id,Ofirstnam,Omidnam,Osurname,msa
0,1.0,A,DALE,MAYO,9999.0
1,2.0,MARY,ANN,SKREZEC,7362.0
2,3.0,RAY,HOWARD,SUHLER,6450.0
3,4.0,VUONG,TUYEN,KHON,9999.0
4,5.0,SON,SUK,LEE,3920.0


In [3]:
## Cleaning functions
# Columns kept from every arXiv author table loaded below: the full name
# plus its first / middle / surname parts.
columns_interest = ['name','author_name', 'author_middle_name', 'author_surname']

def clean_and_name(name):
    """Strip a stray 'and' token from one end of a name.

    The value is converted with ``str`` and split on single spaces. If the
    first token is exactly 'and' it is dropped; otherwise, if the last token
    is exactly 'and', that one is dropped. At most one token is removed per
    call, and the comparison is case-sensitive.

    Returns the resulting string (always a ``str``).
    """
    text = str(name)
    words = text.split(' ')

    if words[0] == 'and':
        kept = words[1:]
    elif words[-1] == 'and':
        kept = words[:-1]
    else:
        # Nothing to strip: hand back the stringified input untouched.
        return text

    return ' '.join(kept)

def clean_first_name(name):
    """Return a usable first name, or None when there is none.

    Parameters
    ----------
    name : scalar
        Raw first-name value, typically one cell of an ``author_name`` column.

    Returns
    -------
    str or None
        The name with a stray leading/trailing 'and' removed (see
        ``clean_and_name``), or None when the value is missing, is empty, or
        is a bare initial such as 'R' or 'R.'.
    """
    # Missing values have to be caught before clean_and_name() stringifies
    # them: str(float('nan')) is 'nan', a three-character string that would
    # otherwise be accepted as a real first name.
    if pd.isnull(name):
        return None

    name = clean_and_name(name)

    # With periods removed, one remaining character is a bare initial and
    # zero characters means nothing is left (e.g. the input was just 'and').
    if len(re.sub('[.]', '', name)) <= 1:
        return None
    return name

## arXiv: Learning Keywords (Machine Learning)

In [4]:
# Load the "learning" author table and count its distinct rows
# (all columns considered).
arviv_ml_df = pd.read_csv('../data/arxiv_learning_authors.csv')
total_ml = arviv_ml_df.drop_duplicates().shape[0]

In [5]:
# Smallest and largest values of the 'published' column.
# NOTE(review): the printed values look like ISO-8601 strings, so this is
# presumably a string comparison that happens to be chronological - confirm.
published_ml = arviv_ml_df['published']
print(published_ml.min(), published_ml.max())

1993-04-16T07:43:08Z 2019-03-05T18:49:40Z


In [6]:
# Keep only the name columns; a "unique author" is a distinct combination of
# those columns.
arviv_ml_df = arviv_ml_df[columns_interest]
unique_ml = arviv_ml_df.drop_duplicates().shape[0]

print('total: {}'.format(total_ml))
print('unique authors: {}'.format(unique_ml))
arviv_ml_df.head()

total: 367687
unique authors: 105644


Unnamed: 0,name,author_name,author_middle_name,author_surname
0,Anjishnu Banerjee,Anjishnu,,Banerjee
1,David Dunson,David,,Dunson
2,Surya Tokdar,Surya,,Tokdar
3,Cosma Rohilla Shalizi,Cosma,Rohilla,Shalizi
4,Abigail Z. Jacobs,Abigail,Z.,Jacobs


In [7]:
# Normalise first names; clean_first_name yields None for bare initials.
arviv_ml_df['author_name'] = arviv_ml_df['author_name'].map(clean_first_name)

# A row has no usable first name when the value is null or the literal
# string 'None'. Build the mask once and split the table on it.
no_first_name_ml = (pd.isnull(arviv_ml_df['author_name']) |
                    (arviv_ml_df['author_name'] == 'None'))
first_arviv_ml_df = arviv_ml_df[~no_first_name_ml].copy()
second_arviv_ml_df = arviv_ml_df[no_first_name_ml].copy()

missing_ml = len(second_arviv_ml_df.drop_duplicates())
print('missing {}'.format(missing_ml))
print('total {}'.format(len(first_arviv_ml_df)))  # before de-duplication

first_arviv_ml_df = first_arviv_ml_df.drop_duplicates()
unique_first_ml = len(first_arviv_ml_df)
print('unique {}'.format(unique_first_ml))
first_arviv_ml_df.head()

missing 15324
total 325466
unique 90320


Unnamed: 0,name,author_name,author_middle_name,author_surname
0,Anjishnu Banerjee,Anjishnu,,Banerjee
1,David Dunson,David,,Dunson
2,Surya Tokdar,Surya,,Tokdar
3,Cosma Rohilla Shalizi,Cosma,Rohilla,Shalizi
4,Abigail Z. Jacobs,Abigail,Z.,Jacobs


In [8]:
# Turn the surviving row index into an 'id' column and rename the name parts
# to Ofirstnam / Omidnam / Osurname.
first_arviv_ml_df = first_arviv_ml_df.reset_index().rename(
    columns={'index': 'id',
             'author_name': 'Ofirstnam',
             'author_middle_name': 'Omidnam',
             'author_surname': 'Osurname'})

# Prefix ids with the dataset tag.
first_arviv_ml_df['id'] = first_arviv_ml_df['id'].apply('arxiv_ml_{}'.format)

first_arviv_ml_df.to_csv('../data/clean_name/arxiv_first_ml.csv', index=False)
first_arviv_ml_df.head()

Unnamed: 0,id,name,Ofirstnam,Omidnam,Osurname
0,arxiv_ml_0,Anjishnu Banerjee,Anjishnu,,Banerjee
1,arxiv_ml_1,David Dunson,David,,Dunson
2,arxiv_ml_2,Surya Tokdar,Surya,,Tokdar
3,arxiv_ml_3,Cosma Rohilla Shalizi,Cosma,Rohilla,Shalizi
4,arxiv_ml_4,Abigail Z. Jacobs,Abigail,Z.,Jacobs


## arXiv: Symbols (Symbolic Systems)

In [9]:
# Load the "symbols" author table and count its distinct rows
# (all columns considered).
arviv_sy_df = pd.read_csv('../data/arxiv_symbols_authors.csv')
total_sy = arviv_sy_df.drop_duplicates().shape[0]

In [10]:
# Smallest and largest values of the 'published' column.
# NOTE(review): the printed values look like ISO-8601 strings, so this is
# presumably a string comparison that happens to be chronological - confirm.
published_sy = arviv_sy_df['published']
print(published_sy.min(), published_sy.max())

1993-08-03T01:10:06Z 2019-03-05T18:49:40Z


In [11]:
# Keep only the name columns; a "unique author" is a distinct combination of
# those columns.
arviv_sy_df = arviv_sy_df[columns_interest]
unique_sy = arviv_sy_df.drop_duplicates().shape[0]

print('total: {}'.format(total_sy))
print('unique authors: {}'.format(unique_sy))
arviv_sy_df.head()

total: 100387
unique authors: 45981


Unnamed: 0,name,author_name,author_middle_name,author_surname
0,Sandro A. Coelho,Sandro,A.,Coelho
1,Diego Moussallem,Diego,,Moussallem
2,Gustavo C. Publio,Gustavo,C.,Publio
3,Diego Esteves,Diego,,Esteves
4,Heike Adel,Heike,,Adel


In [12]:
# Normalise first names; clean_first_name yields None for bare initials.
arviv_sy_df['author_name'] = arviv_sy_df['author_name'].map(clean_first_name)

# A row has no usable first name when the value is null or the literal
# string 'None'. Build the mask once and split the table on it.
no_first_name_sy = (pd.isnull(arviv_sy_df['author_name']) |
                    (arviv_sy_df['author_name'] == 'None'))
first_arviv_sy_df = arviv_sy_df[~no_first_name_sy].copy()
second_arviv_sy_df = arviv_sy_df[no_first_name_sy].copy()

missing_sy = len(second_arviv_sy_df.drop_duplicates())
print('missing {}'.format(missing_sy))
print('total {}'.format(len(first_arviv_sy_df)))  # before de-duplication

first_arviv_sy_df = first_arviv_sy_df.drop_duplicates()
unique_first_sy = len(first_arviv_sy_df)
print('unique {}'.format(unique_first_sy))
first_arviv_sy_df.head()

missing 2874
total 97548
unique 43107


Unnamed: 0,name,author_name,author_middle_name,author_surname
0,Sandro A. Coelho,Sandro,A.,Coelho
1,Diego Moussallem,Diego,,Moussallem
2,Gustavo C. Publio,Gustavo,C.,Publio
3,Diego Esteves,Diego,,Esteves
4,Heike Adel,Heike,,Adel


In [13]:
# Turn the surviving row index into an 'id' column and rename the name parts
# to Ofirstnam / Omidnam / Osurname.
first_arviv_sy_df = first_arviv_sy_df.reset_index().rename(
    columns={'index': 'id',
             'author_name': 'Ofirstnam',
             'author_middle_name': 'Omidnam',
             'author_surname': 'Osurname'})

# Prefix ids with the dataset tag.
first_arviv_sy_df['id'] = first_arviv_sy_df['id'].apply('arxiv_sy_{}'.format)

first_arviv_sy_df.to_csv('../data/clean_name/arxiv_first_sy.csv', index=False)
first_arviv_sy_df.head()

Unnamed: 0,id,name,Ofirstnam,Omidnam,Osurname
0,arxiv_sy_0,Sandro A. Coelho,Sandro,A.,Coelho
1,arxiv_sy_1,Diego Moussallem,Diego,,Moussallem
2,arxiv_sy_2,Gustavo C. Publio,Gustavo,C.,Publio
3,arxiv_sy_3,Diego Esteves,Diego,,Esteves
4,arxiv_sy_4,Heike Adel,Heike,,Adel


## arXiv: Robotics

In [27]:
# Load the "robotics" author table.
arviv_ro_df = pd.read_csv('../data/arxiv_robotics_authors.csv')
# Count distinct rows, the same way total_ml and total_sy are computed, so
# the three totals in the summary table share one definition.
total_ro = len(arviv_ro_df.drop_duplicates())
print(total_ro)

171126


In [22]:
# Smallest and largest values of the 'published' column.
# NOTE(review): the printed values look like ISO-8601 strings, so this is
# presumably a string comparison that happens to be chronological - confirm.
published_ro = arviv_ro_df['published']
print(published_ro.min(), published_ro.max())

1994-01-01T00:00:00Z 2019-03-05T18:49:40Z


In [23]:
# Report the total captured at load time, then keep only the name columns.
# A "unique author" is a distinct combination of those columns.
print('total: {}'.format(total_ro))
arviv_ro_df = arviv_ro_df[columns_interest]
unique_ro = len(arviv_ro_df.drop_duplicates())
print('unique authors: {}'.format(unique_ro))
arviv_ro_df.head()

total: 171126
unique authors: 55173


Unnamed: 0,name,author_name,author_middle_name,author_surname
0,R. A Helzerman,R.,A,Helzerman
1,M. P. Harper,M.,P.,Harper
2,Alexander Yu. Vlasov,Alexander,Yu.,Vlasov
3,Andreas Siebert,Andreas,,Siebert
4,Stephen L. Adler,Stephen,L.,Adler


In [24]:
arviv_ro_df['author_name'] = arviv_ro_df['author_name'].map(clean_first_name)
first_arviv_ro_df = arviv_ro_df[~(pd.isnull(arviv_ro_df['author_name']) | 
                                 (arviv_ro_df['author_name'] == 'None'))].copy()
second_arviv_ro_df = arviv_ro_df[(pd.isnull(arviv_ro_df['author_name']) | 
                                 (arviv_ro_df['author_name'] == 'None'))].copy()
missing_ro = len(second_arviv_ro_df.drop_duplicates())
print('missing {}'.format(missing_ro))
print('total {}'.format(len(first_arviv_ro_df)))
unique_first_ro = len(first_arviv_ro_df.drop_duplicates())
print('unique {}'.format(unique_first_ro))
first_arviv_ro_df = first_arviv_ro_df.drop_duplicates()
first_arviv_ro_df.head()

missing 4745
total 153472
unique 50428


Unnamed: 0,name,author_name,author_middle_name,author_surname
2,Alexander Yu. Vlasov,Alexander,Yu.,Vlasov
3,Andreas Siebert,Andreas,,Siebert
4,Stephen L. Adler,Stephen,L.,Adler
5,Soumyadeep Paul,Soumyadeep,,Paul
6,Sudipta N. Sinha,Sudipta,N.,Sinha


In [25]:
# Turn the surviving row index into an 'id' column and rename the name parts
# to Ofirstnam / Omidnam / Osurname.
first_arviv_ro_df = first_arviv_ro_df.reset_index().rename(
    columns={'index': 'id',
             'author_name': 'Ofirstnam',
             'author_middle_name': 'Omidnam',
             'author_surname': 'Osurname'})

# Prefix ids with the dataset tag.
first_arviv_ro_df['id'] = first_arviv_ro_df['id'].apply('arxiv_ro_{}'.format)

first_arviv_ro_df.to_csv('../data/clean_name/arxiv_first_ro.csv', index=False)
first_arviv_ro_df.head()

Unnamed: 0,id,name,Ofirstnam,Omidnam,Osurname
0,arxiv_ro_2,Alexander Yu. Vlasov,Alexander,Yu.,Vlasov
1,arxiv_ro_3,Andreas Siebert,Andreas,,Siebert
2,arxiv_ro_4,Stephen L. Adler,Stephen,L.,Adler
3,arxiv_ro_5,Soumyadeep Paul,Soumyadeep,,Paul
4,arxiv_ro_6,Sudipta N. Sinha,Sudipta,N.,Sinha


In [26]:
# Summary of the counts gathered above: one row per dataset, one column per
# metric.
pd.DataFrame(
    [[total_ml, unique_ml, unique_first_ml, missing_ml],
     [total_sy, unique_sy, unique_first_sy, missing_sy],
     [total_ro, unique_ro, unique_first_ro, missing_ro]],
    index=['Machine Learning', 'Symbolic Systems', 'Robotics'],
    columns=['total_papers', 'unique_authors', 'unique_clean_authors', 'missing'],
)

Unnamed: 0,total_papers,unique_authors,unique_clean_authors,missing
Machine Learning,367687,105644,90320,15324
Symbolic Systems,100387,45981,43107,2874
Robotics,171126,55173,50428,4745
