## Metadata EDA

### Submitted vs. published papers

In [None]:
# A paper counts as "published" when its 'journal-ref' field is populated;
# `idx` flags the rows where that field is missing.
idx = arxiv_metadata['journal-ref'].isna()
arxiv_published = arxiv_metadata.loc[~idx]

# Two-row summary: every paper in the metadata vs. those with a journal reference.
pd.DataFrame(
    {"Total papers": [len(arxiv_metadata), len(arxiv_published)]},
    index=['Submitted', 'Published'],
)

### Papers by category

In [None]:
# Subject columns to tally. Each is summed per frame below; subjects that are
# not present as a column are skipped.
keys = ["Computer Science", "Economics", "Electrical Engineering and Systems Science", "Mathematics", "Physics", "Quantitative Biology", "Quantitative Finance", "Statistics", "Junk"]

# Total papers submitted, by subject.
counts_total = {key: arxiv_metadata[key].sum() for key in keys if key in arxiv_metadata.columns}
counts_total_df = pd.DataFrame.from_dict(counts_total, orient='index', columns=["Count"])
counts_total_df['Subject'] = counts_total_df.index

# Papers published, by subject.
counts_published = {key: arxiv_published[key].sum() for key in keys if key in arxiv_published.columns}
counts_published_df = pd.DataFrame.from_dict(counts_published, orient='index', columns=["Count"])
counts_published_df['Subject'] = counts_published_df.index

# Combine both tallies into one frame indexed by subject. Both frames are
# already indexed by subject, so the columns are aligned on that index directly
# instead of joining on the duplicated 'Subject' column and then dropping the
# suffixed leftovers (which fails with KeyError whenever one of the dropped
# labels is not produced by the join). Reindexing on the submitted subjects
# keeps left-join semantics: every submitted subject appears, in the same
# order, with NaN where no published count exists.
counts_df = pd.DataFrame({
    'Submitted': counts_total_df['Count'],
    'Published': counts_published_df['Count'].reindex(counts_total_df.index),
})
counts_df

_Note that the column totals may differ from the total number of papers, as a paper may belong to several categories._

### Published papers over time, by submission month and subject.

In [None]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np

%matplotlib inline

In [None]:
# Count published papers per (Year, Month) pair. The result is a Series with a
# two-level index whose levels are named 'Year' and 'Month'.
row_counts = (
    arxiv_published
    .groupby(['Year', 'Month'])
    .size()
    .rename_axis(['Year', 'Month'])
)

In [None]:
# Pivot the 'Month' index level into columns: one row per year, one column per month.
row_counts.unstack(level='Month')

In [None]:
# Plot the monthly and cumulative number of papers in `row_counts` against
# creation date. The x-axis is the position of each (Year, Month) entry in the
# index of `row_counts`.

# Year range of the full metadata set. Not used for the plot any more; kept
# defined in case other cells of the notebook reference these names.
min_year = arxiv_metadata.Year.min()
max_year = arxiv_metadata.Year.max()

monthly_counts = row_counts.to_numpy()
years = np.asarray(row_counts.index.get_level_values(0))

# One tick per year, placed at the first entry of that year in `row_counts`.
# Deriving both positions and labels from the plotted data keeps them in sync:
# taking the year range from `arxiv_metadata` and stepping every 12 entries
# mislabels the axis (or fails on a tick/label count mismatch) when the
# published papers span different years than the full set, when the first year
# does not start in January, or when a month has no papers at all.
tick_labels, tick_positions = np.unique(years, return_index=True)

fig, ax = plt.subplots()
ax.set_title('Number of papers by creation date')
ax.set_xlabel('Creation date')
ax.set_ylabel('Papers count')
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels)

# Cumulative total first, then the per-month counts, so the legend order
# matches the plotting order.
ax.plot(np.cumsum(monthly_counts), color='tab:blue', label='Cumulative')
ax.plot(monthly_counts, color='tab:red', label='Monthly')

ax.legend()
plt.show()