# Benchmark articles
* Load metadata, list articles related to machine learning
* Load abstracts, keep only machine learning articles
* Identify review papers by looking into the abstracts

In [1]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np

%matplotlib inline

## Load metadata, list articles related to machine learning

In [3]:
%%time

# load metadata extracted data in notebook 00_load_metadata
arxiv_metadata = pd.read_csv('data/arxiv_metadata.csv.zip')



CPU times: user 11.3 s, sys: 1.73 s, total: 13 s
Wall time: 13.9 s


In [9]:
# list articles related to machine learning
ml_categories = ['cs.AI', 'cs.LG']
arxiv_cs_idx = arxiv_metadata['Computer Science'] == 1
arxiv_cs = arxiv_metadata[arxiv_cs_idx]
arxiv_ml_idx = []
for paper_categories in arxiv_cs['categories']:
    found = False
    for ml_category in ml_categories:
        if ml_category in paper_categories:
            found = True
            break
    arxiv_ml_idx.append(found)

arxiv_ml = arxiv_cs[arxiv_ml_idx]

## Load abstracts, keep only machine learning articles

In [13]:
%%time

# load abstracts extracted data in notebook 00_load_abstracts
arxiv_abstracts = pd.read_csv('data/arxiv_abstracts.csv.zip')

CPU times: user 10.9 s, sys: 1.14 s, total: 12.1 s
Wall time: 13.4 s




In [46]:
arxiv_abstracts_ml = arxiv_abstracts[arxiv_abstracts.id.isin(arxiv_ml.id)]
arxiv_ml['arxiv_id'] = arxiv_ml['id']
arxiv_abstracts_ml['arxiv_id'] = arxiv_abstracts_ml['id']
arxiv_ml_merged = pd.merge(arxiv_ml, arxiv_abstracts_ml, on='arxiv_id')

## Identify review papers by looking into the abstracts

In [110]:
review_paper_idx = ['systematic literature review' in abstract.lower() for abstract in arxiv_ml_merged.abstract]
print(f"Found {sum(review_paper_idx)} potential review papers.")

Found 110 potential review papers.


In [111]:
arxiv_ml_reviews = arxiv_ml_merged[review_paper_idx]

## Save as compressed CSV

In [112]:
import zipfile as zf

with zf.ZipFile('data/arxiv_ml_reviews.csv.zip', 'w') as ziparchive:
    ziparchive.writestr('arxiv_ml_reviews.csv', arxiv_ml_reviews.to_csv())

## Short EDA

In [113]:
arxiv_ml_reviews.columns

Index(['Unnamed: 0_x', 'id_x', 'submitter', 'authors', 'title', 'journal-ref',
       'doi', 'report-no', 'categories', 'license', 'versions', 'update_date',
       'authors_parsed', 'created', 'year', 'month', 'Computer Science',
       'Economics', 'Electrical Engineering and Systems Science', 'Junk',
       'Mathematics', 'Physics', 'Quantitative Biology',
       'Quantitative Finance', 'Statistics', 'arxiv_id', 'Unnamed: 0_y',
       'abstract', 'comments', 'id_y'],
      dtype='object')

In [114]:
pd.set_option('max_colwidth', None)
arxiv_ml_reviews[['title', 'year', 'month', 'categories']].sort_values(['year', 'month'], ascending=False)

Unnamed: 0,title,year,month,categories
199978,Agent Design Pattern Catalogue: A Collection of Architectural Patterns\n for Foundation Model based Agents,2024,5,cs.AI cs.SE
200693,From the evolution of public data ecosystems to the evolving horizons of\n the forward-looking intelligent public data ecosystem empowered by emerging\n technologies,2024,5,cs.CY cs.AI cs.ET cs.HC cs.IR
200845,ABI Approach: Automatic Bias Identification in Decision-Making Under\n Risk based in an Ontology of Behavioral Economics,2024,5,cs.HC cs.AI
198955,"Visually Grounded Language Learning: a review of language games,\n datasets, tasks, and models",2023,12,cs.CL cs.AI
199048,Concept Drift Adaptation in Text Stream Mining Settings: A Comprehensive\n Review,2023,12,cs.LG cs.CL cs.IR
...,...,...,...,...
39489,Hybrid Recommender Systems: A Systematic Literature Review,2019,1,cs.IR cs.CY cs.LG
29667,Survey and cross-benchmark comparison of remaining time prediction\n methods in business process monitoring,2018,5,cs.AI cs.LG
29112,State-Space Abstractions for Probabilistic Inference: A Systematic\n Review,2018,4,cs.AI
17607,Towards Evidence-Based Ontology for Supporting Systematic Literature\n Review,2016,9,cs.DL cs.AI cs.SE
