# Query the Arxiv Preprint Server

This **Notebook-as-Tool** allows you to:

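1.   query the preprint server arxiv.org and retrieve the results as a tab-separated `.csv` file. The query routine searches in successive passes: (a) title field, (b) abstract field, (c) all fields and finally (d) without a search scope. An article found in several passes is kept only for the first one.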

For running or adapting this Colab Notebook you need to create a copy in your Google Drive: **File → Save a copy in Drive**. It will be stored in a folder ```Colab Notebooks```. Open this file with Google Colab and run the cells consecutively by pressing the **Play** button or pressing **shift+enter**.

**Important notes:**
- Code is hidden in the background of Colab forms. To view and edit the code, **double-click** the cell or select **View → Show/hide code**.
- Data will be stored in Google Drive in the folder ```Colab_Data/Data/Arxiv```. A connection to your drive will be authenticated when running the setup code cells. This connection is temporary and only your current notebook will be connected to your drive. The connection will be revoked when the notebook is terminated or by selecting **Runtime → Factory reset runtime**.


**Credits:** This notebook was written by Marcus Burkhardt and makes use of the arxiv API wrapper.

In [None]:
#@title Setup 1: Mount Google Drive for Loading and Storing Data
# Make the user's Google Drive available inside the Colab runtime under
# the mount point /content/gdrive. Later cells build their data path
# relative to that mount (presumably relying on /content being the working
# directory -- confirm if the notebook is run elsewhere).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
#@title Setup 2: Install and Load Required Libraries and Run Setup Procedures

# Install Libraries
try:
  import arxiv
  import feedparser
  pass
except:
  !pip install arxiv
  import arxiv
  !pip install feedparser
  import feedparser
  pass

# Import Libaries
import os
import time
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm
import requests

# Folder (relative to the current working directory) in which result files
# are written; it is created on first use, including missing parent folders.
data_path = os.path.join("gdrive", "MyDrive", "Colab_Data", "Data", "Arxiv")
os.makedirs(data_path, exist_ok=True)

In [None]:
#@title Setup 3: Definition of Core and Support Functions Used by the Tool(s)

def query_arxiv(query, max_results, search_scope=None, sort_by='relevance', sort_order='ascending',
                delay=3.0, timeout=60):
    """Page through the arXiv export API and return the hits as a DataFrame.

    Args:
        query: Search term. It is inserted into the request URL as given, so
            arXiv query syntax can be passed through unchanged.
        max_results: Upper bound on the number of entries to return. Values
            of zero or below yield an empty DataFrame without any request.
        search_scope: Optional arXiv field prefix (e.g. ``'ti'``, ``'abs'``,
            ``'all'``); ``None`` sends the term without a prefix.
        sort_by: Value for the API's ``sortBy`` parameter.
        sort_order: Value for the API's ``sortOrder`` parameter.
        delay: Seconds to wait between two successive page requests. The
            arXiv API guidelines ask clients to pause between calls.
        timeout: Seconds after which a single HTTP request is abandoned
            (``requests`` raises in that case).

    Returns:
        A ``pandas.DataFrame`` built with ``pd.json_normalize`` from the feed
        entries; at most ``max_results`` rows. If a request comes back with
        an HTTP error status, a message is printed and the entries collected
        so far are returned.
    """
    print(f'Start querying arxiv for term: {query} (scope: {search_scope})')
    if max_results <= 0:
        # A negative limit would otherwise be used as a slice end
        # (items[:negative]) and return almost a full page of entries.
        return pd.json_normalize([])

    per_page = 1000  # largest page requested in a single call
    search_query = query if search_scope is None else f"{search_scope}:{query}"
    results = []
    start = 0
    while len(results) < max_results:
        if start > 0 and delay > 0:
            # Be polite to the API between consecutive page requests.
            time.sleep(delay)
        print('.', end='')
        # Never ask for more entries than are still needed.
        page_size = min(per_page, max_results - len(results))
        url = (
            f"https://export.arxiv.org/api/query?search_query={search_query}"
            f"&max_results={page_size}&start={start}"
            f"&sortBy={sort_by}&sortOrder={sort_order}"
        )
        resp = requests.get(url, timeout=timeout)
        if not resp.ok:
            # Report the failure instead of silently treating the error page
            # as "no more results"; keep whatever was collected so far.
            print(f'\nRequest failed with HTTP status {resp.status_code}; '
                  f'keeping the {len(results)} results retrieved so far.', end='')
            break
        items = feedparser.parse(resp.text)['entries']
        if not items:
            break
        results += items[:max_results - len(results)]
        # Continue right after the last entry actually received, so a short
        # page does not cause entries to be skipped.
        start += len(items)
    print('')
    return pd.json_normalize(results)

def query(q, max_results=1000, sort_by='relevance', sort_order='ascending'):
    """Search arXiv for ``q`` scope by scope and merge the de-duplicated hits.

    The term is looked up with ``query_arxiv`` in this order: title
    (``ti``), abstract (``abs``), all fields (``all``) and finally without a
    scope. Entries whose ``id`` was already collected in an earlier pass are
    dropped, and each kept row is tagged in the ``[Result Type]`` column
    with the pass that found it. As soon as ``max_results`` rows have been
    gathered the remaining passes are skipped.

    Args:
        q: Search term handed to ``query_arxiv``.
        max_results: Maximum number of rows to return (also the per-pass
            limit given to ``query_arxiv``).
        sort_by: Forwarded to ``query_arxiv``.
        sort_order: Forwarded to ``query_arxiv``.

    Returns:
        DataFrame with at most ``max_results`` rows.
    """
    # (API scope prefix, label written to the '[Result Type]' column)
    passes = (
        ('ti', 'title'),
        ('abs', 'abstract'),
        ('all', 'all'),
        (None, 'No Scope'),
    )

    collected = None
    for scope, label in passes:
        hits = query_arxiv(q, search_scope=scope, max_results=max_results,
                           sort_by=sort_by, sort_order=sort_order)

        # Filtering by id is only possible when both sides contain rows
        # (an empty frame has no 'id' column).
        can_filter = collected is not None and len(collected) > 0 and len(hits) > 0
        if can_filter:
            known_ids = collected['id'].tolist()
            fresh = hits[~hits['id'].isin(known_ids)].copy()
        else:
            fresh = hits.copy()
        fresh['[Result Type]'] = label

        if collected is None:
            collected = fresh
        else:
            collected = pd.concat([collected, fresh])

        if len(collected) >= max_results:
            return collected[:max_results]

    return collected

In [None]:
#@title Query arxiv.org
#@markdown
query_term = '' #@param {type:"string"}
max_results = 25000 #@param {type:"slider", min:10, max:50000, step:10}
sort_by = 'relevance' # @param ["relevance", "lastUpdatedDate", "submittedDate"] {allow-input: false}
sort_order = 'descending' # @param ["ascending", "descending"] {allow-input: false}

#@markdown **Please note:** The entered search term is queried iteratively using different search scopes: title, abstract, and all. These scopes are provided by the API. In the last iteration the search term is queried without a scope. If an article is retrieved for multiple scopes only the first is retained.

# Build the output file name from a timestamp and the form settings.
# A '/' inside the search term would be read as a directory separator and
# point to a folder that does not exist, so it is replaced in the file name
# only (the query itself is sent unchanged).
safe_term = query_term.replace('/', '_')
outfile_name = f"{datetime.now()}_{safe_term}_max_results_{max_results}_sortBy_{sort_by}_sortOrder_{sort_order}.csv" # could be added as a form field.
outfile = os.path.join(data_path, outfile_name)
# Show the location without its first two path components.
print('Results will be stored at {}'.format('/'.join(outfile.split('/')[2:])))
print()
results = query(query_term, max_results, sort_by, sort_order)
print()
print(f"In total {len(results)} have been retrieved.")
results["[Query Term]"] = query_term
if len(results) > 0:
    results["[Comment]"] = ""
    # Columns written to the file, in output order.
    cols = ["id", "[Result Type]", "[Query Term]", "[Comment]",
            "author", "authors", "published", "updated", "title",
            "summary", "arxiv_comment", "links", "tags",
            "title_detail.type", "title_detail.language",
            "title_detail.base", "title_detail.value",
            "summary_detail.type", "summary_detail.language",
            "summary_detail.base", "summary_detail.value",
            "author_detail.name", "arxiv_primary_category.term",
            "arxiv_primary_category.scheme", "arxiv_doi",
            "arxiv_journal_ref", "arxiv_affiliation",
            "guidislink", "link", "published_parsed",
            "updated_parsed"]
    rcols = list(results.columns)
    # Columns not listed above are reported and then left out of the file.
    unknown_cols = [i for i in rcols if i not in cols]
    if unknown_cols:
        print(f"Retrieved results contain unknown columns: {unknown_cols}.")
        print("Script needs to be extended.")
    results = results[[i for i in cols if i in rcols]]
    # NOTE: the file carries a .csv extension but is tab-separated.
    results.to_csv(outfile, sep='\t', index=None)
    print(f'Results saved to {outfile}')
else:
    # Nothing was written, so do not claim that a file was saved.
    print('No results retrieved; no file was written.')

print('Done.')