# Research papers
* Load metadata, list articles related to machine learning
* Load abstracts, keep only machine learning articles
* Identify research (non-review) papers by looking into the abstracts
* Save them as CSV

In [1]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np
import zipfile as zf

%matplotlib inline

## Load metadata, list articles related to machine learning

In [2]:
%%time

# Read the metadata table that notebook 00_load_metadata extracted.
# The first CSV column is used as the DataFrame index; pandas infers the
# zip compression from the file extension.
arxiv_metadata = pd.read_csv(
    'data/arxiv_metadata.csv.zip',
    index_col=0,
)



CPU times: user 9.13 s, sys: 1.48 s, total: 10.6 s
Wall time: 11.4 s


In [3]:
# Select the machine learning papers: Computer Science rows whose
# `categories` value mentions at least one of the arXiv categories below.
# NOTE(review): stat.ML is not in this list — confirm that is intentional.
ml_categories = ['cs.AI', 'cs.LG']

# Rows flagged with 1 in the 'Computer Science' column.
arxiv_cs_idx = arxiv_metadata['Computer Science'].eq(1)
arxiv_cs = arxiv_metadata.loc[arxiv_cs_idx]

# One boolean per Computer Science paper. The check is a plain `in` test
# (a substring match when `categories` holds strings), and `any` stops at
# the first category that matches.
arxiv_ml_idx = [
    any(category in categories for category in ml_categories)
    for categories in arxiv_cs['categories']
]

arxiv_ml = arxiv_cs[arxiv_ml_idx]

## Load abstracts, keep only machine learning articles

In [4]:
%%time

# Read the abstracts table that notebook 00_load_abstracts extracted.
# The first CSV column is used as the DataFrame index; pandas infers the
# zip compression from the file extension.
arxiv_abstracts = pd.read_csv(
    'data/arxiv_abstracts.csv.zip',
    index_col=0,
)

CPU times: user 9.52 s, sys: 1 s, total: 10.5 s
Wall time: 11.5 s




Keep only the abstracts of the machine learning articles, then merge the metadata and abstracts into one dataframe, based on the arXiv id.

In [5]:
# Keep only the abstracts whose id belongs to a machine learning paper ...
arxiv_abstracts_ml = arxiv_abstracts.loc[arxiv_abstracts['id'].isin(arxiv_ml['id'])]
# ... then join metadata and abstracts on the shared 'id' column
# (merge defaults to an inner join, so only ids present on both sides remain).
arxiv_ml_merged = arxiv_ml.merge(arxiv_abstracts_ml, on='id')

## Identify research (non-review) papers by looking into the abstracts

In [6]:
# A paper counts as research (non-review) when its lower-cased abstract does
# not contain the phrase 'systematic literature review'.
# NOTE(review): the phrase is matched literally, so an abstract in which it is
# split by a line break would not be detected — confirm that abstracts are
# whitespace-normalized upstream.
research_paper_idx = [
    'systematic literature review' not in text.lower()
    for text in arxiv_ml_merged['abstract']
]
print(f"Found {sum(research_paper_idx)} research (non-review) machine learning papers.")

Found 207226 research (non-review) machine learning papers.


In [7]:
# Boolean-mask selection: keep the rows of the merged frame whose entry in
# research_paper_idx is True, i.e. the research (non-review) papers.
arxiv_ml_research = arxiv_ml_merged[research_paper_idx]

## Save as compressed CSV

In [8]:
# Write the research papers as a single CSV member inside a zip archive.
# ZipFile stores members uncompressed (ZIP_STORED) unless told otherwise, so
# DEFLATE is requested explicitly to actually compress the CSV.
with zf.ZipFile('data/arxiv_ml_research.csv.zip', 'w', compression=zf.ZIP_DEFLATED) as ziparchive:
    # to_csv() without a path returns the CSV text, which writestr stores
    # under the given member name.
    ziparchive.writestr('arxiv_ml_research.csv', arxiv_ml_research.to_csv())