# Query the Arxiv Preprint Server

This **Notebook-as-Tool** allows you to:

1. query the preprint server arxiv.org and retrieve the results as a tab-separated `.csv` file. The query routine retrieves data on four levels: (a) title field, (b) abstract field, (c) all fields and (d) without search scope.
2. specify the number of results to retrieve and the sort order of the results.

For running or adapting this Colab Notebook you need to create a copy in your Google Drive: **File → Save a copy in Drive**. It will be stored in a folder ```Colab Notebooks```. Open this file with Google Colab and run the cells consecutively by pressing the **Play** button or pressing **Shift+Enter**.

**Important notes:**
- Code is hidden in the background of Colab forms. For viewing and editing the code, **double-click** the cell or select **View → Show/hide code**.
- Data will be stored in Google Drive in the folder ```Colab_Data/Data/Arxiv```. A connection to your drive will be authenticated when running setup code cells. This is temporary and only your current notebook will be connected to your drive. The connection will be revoked when the notebook is terminated or by selecting **Runtime → Factory reset runtime**.


**Credits:** This notebook was written by Marcus Burkhardt and makes use of the arxiv API wrapper.

In [None]:
#@title Setup 1: Mount Google Drive for Loading and Storing Data
# Mount the user's Google Drive at /content/gdrive so that later cells can
# read from and write to it like a regular directory.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
#@title Setup 2: Install and Load Required Libraries and Run Setup Procedures

# Install Libraries
try:
  import feedparser
  pass
except:
  !pip install feedparser
  import feedparser
  pass

# Import Libraries
import os
import time
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm
import requests

# Folder (relative to the working directory) in which result files are stored.
data_path = os.path.join("gdrive", "MyDrive", "Colab_Data", "Data", "Arxiv")
# Create the folder, including missing parents; an existing folder is left as is.
os.makedirs(data_path, exist_ok=True)

In [19]:
#@title Setup 3: Definition of Core and Support Functions Used by the Tool(s)

def query_arxiv(query, max_results, search_scope=None, sort_by='relevance', sort_order='ascending'):
    """Fetch up to ``max_results`` entries from the arXiv export API.

    Results are requested page by page until ``max_results`` entries have
    been collected or the API returns a page without entries.

    Args:
        query: Search term, inserted verbatim into the ``search_query``
            URL parameter.
        max_results: Upper bound on the number of entries to return. A value
            of zero or less yields an empty frame without any request.
        search_scope: Optional field prefix (e.g. ``'ti'``, ``'abs'``,
            ``'all'``). When given, the term is sent as ``<scope>:<query>``;
            when ``None`` the bare term is sent.
        sort_by: Value of the ``sortBy`` URL parameter.
        sort_order: Value of the ``sortOrder`` URL parameter.

    Returns:
        A pandas DataFrame built with ``pd.json_normalize`` from the parsed
        feed entries (empty if nothing was found).

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.RequestException: On connection problems or timeouts.
    """
    print(f'Start querying arxiv for term: {query} (scope: {search_scope})')
    if max_results <= 0:
        # Nothing was asked for, so skip the network round trip entirely.
        return pd.json_normalize([])

    per_page = min(10000, max_results)
    search_query = query if search_scope is None else f"{search_scope}:{query}"

    start = 0
    results = []
    # Stop as soon as the limit is reached; this avoids fetching one more
    # page only to throw all of its entries away.
    while len(results) < max_results:
        url = f"https://export.arxiv.org/api/query?search_query={search_query}&max_results={per_page}&start={start}&sortBy={sort_by}&sortOrder={sort_order}"
        if start == 0:
            print(f'Initial query for first {per_page} results: {url}')
        else:
            # The arXiv API manual asks for a 3 second pause between
            # consecutive calls.
            time.sleep(3)
        print('.', end='')
        response = requests.get(url, timeout=300)
        # Fail loudly on HTTP errors instead of parsing an error page as if
        # it were a (possibly empty) result page.
        response.raise_for_status()
        items = feedparser.parse(response.text)['entries']
        if not items:
            # An empty page is treated as the end of the result list.
            break
        results += items[:max_results - len(results)]
        start += per_page
    print('')
    return pd.json_normalize(results)

def query(q, max_results=1000, max_results_by_scope=False, sort_by='relevance', sort_order='ascending'):
    """Query arxiv for ``q`` scope by scope and merge the de-duplicated hits.

    The scopes are processed in the order title, abstract, all, no scope.
    An entry whose ``id`` was already kept for an earlier scope is dropped
    from later scopes. Each kept entry is tagged in the ``[Result Type]``
    column with the scope that produced it.

    Args:
        q: Search term passed on to ``query_arxiv``.
        max_results: Limit passed to every ``query_arxiv`` call and applied
            to each scope's de-duplicated hits.
        max_results_by_scope: If false, ``max_results`` is also the limit
            for the combined result and querying stops once it is reached.
            If true, all four scopes are always queried.
        sort_by: Passed on to ``query_arxiv``.
        sort_order: Passed on to ``query_arxiv``.

    Returns:
        A pandas DataFrame with the combined results.
    """
    # (API search scope, label stored in '[Result Type]'), in priority order.
    scope_plan = (('ti', 'title'), ('abs', 'abstract'), ('all', 'all'), (None, 'No Scope'))

    collected = []  # de-duplicated frames of the scopes handled so far
    results = None
    for scope, label in scope_plan:
        hits = query_arxiv(q, search_scope=scope, max_results=max_results, sort_by=sort_by, sort_order=sort_order)

        # Drop entries that an earlier scope already delivered.
        earlier = pd.concat(collected) if collected else None
        if earlier is not None and len(earlier) > 0 and len(hits) > 0:
            fresh = hits[~hits['id'].isin(earlier['id'].tolist())].copy()
        else:
            fresh = hits.copy()
        fresh['[Result Type]'] = label

        if results is None:
            results = fresh.copy()[:max_results]
        else:
            fresh = fresh[:max_results]
            results = pd.concat([results, fresh])
        collected.append(fresh)

        # With a global limit, stop as soon as enough entries are available.
        if not max_results_by_scope and len(results) >= max_results:
            return results[:max_results]

    return results

In [23]:
#@title Query arxiv.org
#@markdown
#@markdown ____________
#@markdown ### Specify the search query
query_term = '' #@param {type:"string"}
#@markdown >**Please note:** The entered search term is queried iteratively using different search scopes: title, abstract, and all. These scopes are provided by the API. In the last iteration the search term is queried without a scope. If the same article is retrieved for multiple scopes only the first is retained in the results set. This is reflected in the column [Result Type] which is added by the script and not arxiv metadata.
#@markdown ____________
#@markdown ### Specify the number of results to retrieve
max_results = 1000 # @param {type:"integer"}
max_results_by_scope = False # @param {type:"boolean"}
#@markdown >**Please note:** If max_results_by_scope is checked, max_results applies to each search scope (title, abstract, all, and no scope) separately, i.e. the total amount of results might be higher than the set maximum. If unchecked, the chosen number of max_results is the total maximum.
#@markdown ____________
#@markdown ### Specify the order of results
sort_by = 'relevance' # @param ["relevance", "lastUpdatedDate", "submittedDate"] {allow-input: false}
sort_order = 'descending' # @param ["ascending", "descending"] {allow-input: false}
#@markdown >**Please note:** Users can query arxiv by relevance, the date an article was submitted and the date it was last updated. Results can be sorted in ascending or descending order.
#@markdown ____________


print('Run query...')
results = query(query_term, max_results, max_results_by_scope, sort_by, sort_order)

# Build a file name that is valid on common file systems: the raw query term
# may contain path separators or other special characters, and the default
# string form of a datetime contains spaces and colons.
safe_term = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in query_term)
query_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
outfile_name = f"{safe_term}_{len(results)}_results_SEARCH_PARAMS_max_results_{max_results}_max_results_by_scope_{max_results_by_scope}_sortBy_{sort_by}_sortOrder_{sort_order}_QUERY_TIME_{query_time}.csv" # could be added as a form field.
outfile = os.path.join(data_path, outfile_name)
print()
print(f"In total {len(results)} have been retrieved.")
results["[Query Term]"] = query_term
if len(results) > 0:
    results["[Comment]"] = ""
    # Columns written to the output file, in this order.
    cols = ["id", "[Result Type]", "[Query Term]", "[Comment]",
            "author", "authors", "published", "updated", "title",
            "summary", "arxiv_comment", "links", "tags",
            "title_detail.type", "title_detail.language",
            "title_detail.base", "title_detail.value",
            "summary_detail.type", "summary_detail.language",
            "summary_detail.base", "summary_detail.value",
            "author_detail.name", "arxiv_primary_category.term",
            "arxiv_primary_category.scheme", "arxiv_doi",
            "arxiv_journal_ref", "arxiv_affiliation",
            "guidislink", "link", "published_parsed",
            "updated_parsed"]
    rcols = list(results.columns)
    # Columns not listed above are reported and then left out of the file.
    unknown_cols = [i for i in rcols if i not in cols]
    if unknown_cols:
        print(f"Retrieved results contain unknown columns: {unknown_cols}.")
        print("Script needs to be extended.")
    results = results[[i for i in cols if i in rcols]]
    # The file is tab-separated despite its .csv extension.
    results.to_csv(outfile, sep='\t', index=None)
    print(f'Results saved to {outfile}')
else:
    # Nothing is written for an empty result set, so do not claim a file exists.
    print('No results retrieved; no file was written.')

print('Done.')

Run query...
Start querying arxiv for term: prompt (scope: ti)
Initial query for first 10000 results: https://export.arxiv.org/api/query?search_query=ti:prompt&max_results=10000&start=0&sortBy=relevance&sortOrder=descending
..
Start querying arxiv for term: prompt (scope: abs)
Initial query for first 10000 results: https://export.arxiv.org/api/query?search_query=abs:prompt&max_results=10000&start=0&sortBy=relevance&sortOrder=descending
...
Start querying arxiv for term: prompt (scope: all)
Initial query for first 10000 results: https://export.arxiv.org/api/query?search_query=all:prompt&max_results=10000&start=0&sortBy=relevance&sortOrder=descending
...
Start querying arxiv for term: prompt (scope: None)
Initial query for first 10000 results: https://export.arxiv.org/api/query?search_query=prompt&max_results=10000&start=0&sortBy=relevance&sortOrder=descending
...

In total 17732 have been retrieved.
Results saved to gdrive/MyDrive/Colab_Data/Data/Arxiv/prompt_17732_results_SEARCH_PARAMS