# Benchmark articles
* Load metadata, list articles related to machine learning
* Load abstracts, keep only machine learning articles
* Identify review papers by looking into the abstracts

In [1]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np

%matplotlib inline

## Load metadata, list articles related to machine learning

In [2]:
%%time

# load metadata extracted data in notebook 00_load_metadata
arxiv_metadata = pd.read_csv('data/arxiv_metadata.csv.zip')



CPU times: user 9.16 s, sys: 1.25 s, total: 10.4 s
Wall time: 11.2 s


In [6]:
# list articles related to machine learning
ml_categories = ['cs.AI', 'cs.LG']
arxiv_cs_idx = arxiv_metadata['Computer Science'] == 1
arxiv_cs = arxiv_metadata[arxiv_cs_idx]
arxiv_ml_idx = []
for paper_categories in arxiv_cs['categories']:
    found = False
    for ml_category in ml_categories:
        if ml_category in paper_categories:
            found = True
            break
    arxiv_ml_idx.append(found)

arxiv_ml = arxiv_cs[arxiv_ml_idx]

## Load abstracts, keep only machine learning articles

In [7]:
%%time

# load abstracts extracted data in notebook 00_load_abstracts
arxiv_abstracts = pd.read_csv('data/arxiv_abstracts.csv.zip')

CPU times: user 9.12 s, sys: 639 ms, total: 9.76 s
Wall time: 9.76 s




In [8]:
arxiv_abstracts_ml = arxiv_abstracts[arxiv_abstracts.id.isin(arxiv_ml.id)]
arxiv_ml['arxiv_id'] = arxiv_ml['id']
arxiv_abstracts_ml['arxiv_id'] = arxiv_abstracts_ml['id']
arxiv_ml_merged = pd.merge(arxiv_ml, arxiv_abstracts_ml, on='arxiv_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arxiv_ml['arxiv_id'] = arxiv_ml['id']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arxiv_abstracts_ml['arxiv_id'] = arxiv_abstracts_ml['id']


## Identify review papers by looking into the abstracts

In [9]:
review_paper_idx = ['systematic literature review' in abstract.lower() for abstract in arxiv_ml_merged.abstract]
print(f"Found {sum(review_paper_idx)} potential review papers.")

Found 114 potential review papers.


In [10]:
arxiv_ml_reviews = arxiv_ml_merged[review_paper_idx]

## Save as compressed CSV

In [11]:
import zipfile as zf

with zf.ZipFile('data/arxiv_ml_reviews.csv.zip', 'w') as ziparchive:
    ziparchive.writestr('arxiv_ml_reviews.csv', arxiv_ml_reviews.to_csv())

## Short EDA

In [12]:
arxiv_ml_reviews.columns

Index(['Unnamed: 0_x', 'id_x', 'submitter', 'authors', 'title', 'journal-ref',
       'doi', 'report-no', 'categories', 'license', 'versions', 'update_date',
       'authors_parsed', 'created', 'year', 'month', 'Computer Science',
       'Economics', 'Electrical Engineering and Systems Science', 'Junk',
       'Mathematics', 'Physics', 'Quantitative Biology',
       'Quantitative Finance', 'Statistics', 'arxiv_id', 'Unnamed: 0_y',
       'id_y', 'abstract', 'comments'],
      dtype='object')

In [13]:
pd.set_option('max_colwidth', None)
arxiv_ml_reviews[['title', 'year', 'month', 'categories']].sort_values(['year', 'month'], ascending=False)

Unnamed: 0,title,year,month,categories
203757,Combining Machine Learning and Ontology: A Systematic Literature Review,2024,1,cs.AI cs.LG
204030,Exploring the Role of Convolutional Neural Networks (CNN) in Dental\n Radiography Segmentation: A Comprehensive Systematic Literature Review,2024,1,cs.CV cs.LG
204411,Artificial intelligence to automate the systematic review of scientific\n literature,2024,1,cs.IR cs.AI
205149,A Systematic Literature Review on Explainability for Machine/Deep\n Learning-based Software Engineering Research,2024,1,cs.SE cs.AI
198922,"Visually Grounded Language Learning: a review of language games,\n datasets, tasks, and models",2023,12,cs.CL cs.AI
...,...,...,...,...
39488,Hybrid Recommender Systems: A Systematic Literature Review,2019,1,cs.IR cs.CY cs.LG
29666,Survey and cross-benchmark comparison of remaining time prediction\n methods in business process monitoring,2018,5,cs.AI cs.LG
29111,State-Space Abstractions for Probabilistic Inference: A Systematic\n Review,2018,4,cs.AI
17606,Towards Evidence-Based Ontology for Supporting Systematic Literature\n Review,2016,9,cs.DL cs.AI cs.SE
