# Query the Arxiv Preprint Server

This **Notebook-as-Tool** allows you to:

1.   query the preprint server arxiv.org and retrieve results as csv. The query routine retrieves data on three levels: (a) title field, (b) abstract field and (c) all fields. 

To run or adapt this Colab Notebook you need to create a copy in your Google Drive: **File → Save a copy in Drive**. It will be stored in a folder ```Colab Notebooks```. Open this file with Google Colab and run the cells consecutively by pressing the **Play** button or pressing **shift+enter**.

**Important notes:**
- Code is hidden in the background of Colab forms. To view and edit the code, **double-click** the cell or select **View → Show/hide code**.
- Data will be stored in Google Drive in the folder ```Colab_Data/Data/Arxiv```. A connection to your drive will be authenticated when running the setup code cells. This is temporary and only your current notebook will be connected to your drive. The connection will be revoked when the notebook is terminated or by selecting **Runtime → Factory reset runtime**.


**Credits:** This notebook was written by Marcus Burkhardt and makes use of the arxiv API wrapper.

In [None]:
#@title Setup 1: Mount Google Drive for Loading and Storing Data
# Colab-provided helper for attaching Google Drive to this notebook session.
from google.colab import drive
# Mount the Drive at /content/gdrive so later cells can read and write files
# there; running this triggers the Drive authentication step described in the
# notes above.
drive.mount('/content/gdrive')

In [None]:
#@title Setup 2: Install and Load Required Libraries and Run Setup Procedures

# Install Libraries
try: 
  import arxiv
  import feedparser
  pass
except: 
  !pip install arxiv
  import arxiv
  !pip install feedparser
  import feedparser
  pass

# Import Libraries
import os
import time
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm
import requests

# Folder that receives the exported result files. The path is relative, so it
# resolves inside the Drive mounted at /content/gdrive only when the working
# directory is /content (assumed to be the Colab default - TODO confirm).
data_path = os.path.join("gdrive", "MyDrive", "Colab_Data", "Data", "Arxiv")
# Create the folder (and any missing parents); an existing folder is left as is.
os.makedirs(data_path, exist_ok=True)

In [None]:
#@title Setup 3: Definition of Core and Support Functions Used by the Tool(s)

# Settings for talking to the arXiv export API.
_ARXIV_API_URL = "https://export.arxiv.org/api/query"
_ARXIV_PAGE_SIZE = 1000      # entries requested per API call
_ARXIV_PAGE_DELAY = 3        # seconds to pause between successive API calls
_ARXIV_TIMEOUT = 60          # seconds before a single request is abandoned


def query_arxiv(query, max_results, search_scope='all'):
    """Query the arXiv API page by page and return the entries as a DataFrame.

    Args:
        query: Search term; sent to the API as ``<search_scope>:<query>``.
        max_results: Upper bound on the number of entries to retrieve. ``None``
            retrieves every available entry; zero or a negative number returns
            an empty DataFrame without contacting the API.
        search_scope: arXiv field prefix to search in (e.g. ``'ti'``, ``'abs'``,
            ``'all'``).

    Returns:
        pandas.DataFrame built with ``pd.json_normalize`` from the feed entries
        (empty when nothing was found).

    A failed request does not raise: a message is printed and the entries
    collected so far are returned.
    """
    print(f'Start querying arxiv for term: {query} (scope: {search_scope})')

    unlimited = max_results is None
    if not unlimited:
        max_results = int(max_results)
        if max_results <= 0:
            return pd.json_normalize([])

    page_size = _ARXIV_PAGE_SIZE if unlimited else min(_ARXIV_PAGE_SIZE, max_results)
    entries = []
    start = 0
    while unlimited or len(entries) < max_results:
        if start:
            # Be polite to the API: pause between consecutive page requests.
            time.sleep(_ARXIV_PAGE_DELAY)
        print('.', end='')
        # Let requests build the query string so the search term is URL-encoded.
        params = {
            'search_query': f'{search_scope}:{query}',
            'start': start,
            'max_results': page_size,
        }
        try:
            resp = requests.get(_ARXIV_API_URL, params=params, timeout=_ARXIV_TIMEOUT)
            resp.raise_for_status()
        except requests.RequestException as err:
            print(f'\nRequest failed at offset {start}: {err}. Returning partial results.')
            break
        items = feedparser.parse(resp.text)['entries']
        if not items:
            # An empty page means there is nothing more to fetch.
            break
        entries.extend(items)
        start += page_size
    print('')

    if not unlimited:
        # The final page may overshoot the requested limit.
        entries = entries[:max_results]
    return pd.json_normalize(entries)

def query(q, max_results=1000):
    """Search arXiv for ``q`` in titles, abstracts and all fields, de-duplicated.

    The term is queried three times via ``query_arxiv`` (scopes ``'ti'``,
    ``'abs'`` and ``'all'``, in that order). Each entry is kept only for the
    first scope it appears in, identified by its ``id`` column, and labelled in
    a ``ResultType`` column as ``'title'``, ``'abstract'`` or ``'all'``.

    Args:
        q: Search term.
        max_results: Passed through to ``query_arxiv`` for every scope.

    Returns:
        pandas.DataFrame with the concatenated, labelled results of all scopes.
    """
    scopes = (('ti', 'title'), ('abs', 'abstract'), ('all', 'all'))
    seen_ids = set()
    labelled = []
    for scope, label in scopes:
        found = query_arxiv(q, search_scope=scope, max_results=max_results)
        if 'id' in found.columns:
            # Drop entries already reported by a narrower scope.
            unique = found[~found['id'].isin(seen_ids)].copy()
            seen_ids.update(found['id'])
        else:
            # An empty result has no 'id' column; there is nothing to filter.
            unique = found.copy()
        unique['ResultType'] = label
        labelled.append(unique)
    return pd.concat(labelled)

In [None]:
#@title Query arxiv.org
query_term = "consent" #@param {type:"string"}
max_results = 10000 #@param {type:"slider", min:10, max:10000, step:10}

# Output file name: <timestamp>_<query term>_<max results>.csv inside data_path.
outfile_name = f"{datetime.now()}_{query_term}_{max_results}.csv" # could be added as a form field.
outfile = os.path.join(data_path, outfile_name)
# Show the location without its first two path components.
relative_outfile = '/'.join(outfile.split('/')[2:])
print('Results will be stored at {}'.format(relative_outfile))

# Run the three-scope search and tag every row with the term that produced it.
results = query(query_term, max_results)
print(f"In total {len(results)} have been retrieved.")
results["query term"] = query_term

# Only write a file when something was found (tab-separated, no index column).
if not results.empty:
    results.to_csv(outfile, sep='\t', index=False)

print('Done.')