## Authors EDA
Exploratory data analysis focused on authors.

In [2]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np

%matplotlib inline

In [3]:
%%time

# load metadata extracted data in notebook 00_load_metadata
arxiv_metadata = pd.read_csv('data/arxiv_metadata.zip')



CPU times: user 10 s, sys: 1.36 s, total: 11.4 s
Wall time: 11.4 s


In [16]:
def count_authors(df):
    """Given a dataframe, count the number of unique authors"""
    count = [author for author in df['authors_parsed']]
    return list(set(count))

# ALL AUTHORS
all_authors = count_authors(arxiv_metadata)

# PUBLISHED AUTHORS
idx = arxiv_metadata['journal-ref'].isna()
arxiv_published = arxiv_metadata[~idx]
published_authors = count_authors(arxiv_published)

pd.DataFrame.from_dict(
    {'All authors': len(all_authors), 'Published authors': len(published_authors)}, orient='index', columns=["Count"])

Unnamed: 0,Count
All authors,1853495
Published authors,644649


### Authors by subject


In [15]:
keys = ["Computer Science", "Economics", "Electrical Engineering and Systems Science", "Mathematics", "Physics", "Quantitative Biology", "Quantitative Finance", "Statistics", "Junk"]

In [58]:
# total papers submitted, by subject
counts_total = {key:arxiv_metadata[key].sum() for key in keys if key in arxiv_metadata.keys()}
counts_total_df = pd.DataFrame.from_dict(counts_total, orient='index', columns=["Count"])
counts_total_df['Subject'] = counts_total_df.index

# papers published, by subject
counts_published = {key:arxiv_published[key].sum() for key in keys if key in arxiv_published.keys()}
counts_published_df = pd.DataFrame.from_dict(counts_published, orient='index', columns=["Count"])
counts_published_df['Subject'] = counts_published_df.index

# merge both into 1 dataframe
counts_df = counts_total_df.join(counts_published_df, on="Subject", how='left', lsuffix='_left', rsuffix='_right')
counts_df = counts_df.drop(['Subject', 'Subject_left', 'Subject_right'], axis=1)
counts_df.rename(columns={'Count_left':'Submitted', 'Count_right':'Published'}, inplace=True)
counts_df

Unnamed: 0,Submitted,Published
Computer Science,860031,173075
Economics,9815,1116
Electrical Engineering and Systems Science,84491,9399
Mathematics,639954,147285
Physics,1242872,625188
Quantitative Biology,45543,13224
Quantitative Finance,19105,3861
Statistics,118711,17667
Junk,3983,683
