In [71]:
"""
python_arXiv_parsing_example.py

This sample script illustrates a basic arXiv API call
followed by parsing of the results using the
feedparser python module. It also downloads the PDF of
each returned e-print into a local directory.

Please see the documentation at 
http://export.arxiv.org/api_help/docs/user-manual.html
for more information, or email the arXiv api 
mailing list at arxiv-api@googlegroups.com.

urllib is included in the standard python library.
feedparser can be downloaded from http://feedparser.org/ .

Author: Julius B. Lucks

This is free software.  Feel free to do what you want
with it, but please play nice with the arXiv API!
"""

import os
import re
import time
import urllib.request

import feedparser

# Base api query url
base_url = 'http://export.arxiv.org/api/query?'

# Search parameters
# Match entries containing both "nuclear" and "reactor" in any field
search_query = 'all:nuclear+AND+all:reactor'
start = 0            # zero-based index of the first result to return
max_results = 1000   # upper bound on entries returned by this one request

query = f'search_query={search_query}&start={start}&max_results={max_results}'

# Perform a GET request using the base_url and query.  The context manager
# closes the connection as soon as the body has been read, and the timeout
# keeps a stalled server from hanging the script indefinitely.
with urllib.request.urlopen(base_url + query, timeout=60) as api_response:
    response = api_response.read()

# parse the response using feedparser
feed = feedparser.parse(response)

# Directory to save PDF files
pdf_dir = 'arxiv_pdfs'
os.makedirs(pdf_dir, exist_ok=True)

# Function to sanitize file names
def sanitize_filename(filename):
    """Return *filename* with file-name-unsafe characters replaced by "_".

    Every backslash, slash, asterisk, question mark, colon, double quote,
    angle bracket and pipe is swapped for an underscore; all other
    characters are kept unchanged.
    """
    unsafe_to_underscore = str.maketrans({ch: "_" for ch in '\\/*?:"<>|'})
    return filename.translate(unsafe_to_underscore)

# print out feed information
print(f'Feed title: {feed.feed.title}')
print(f'Feed last updated: {feed.feed.updated}')

# print opensearch metadata
print(f'totalResults for this query: {feed.feed.opensearch_totalresults}')
print(f'itemsPerPage for this query: {feed.feed.opensearch_itemsperpage}')
print(f'startIndex for this query: {feed.feed.opensearch_startindex}')

# Seconds to pause after each PDF request, so that a run over many entries
# does not fire requests at arXiv back to back.
download_delay = 3

# Run through each entry, print out its metadata and download its PDF
for entry in feed.entries:
    print('e-print metadata')
    # entry.id is the abs-page URL; everything after "/abs/" is the arXiv id.
    # Old-style ids contain a slash (e.g. "hep-th/9901001v1"), hence the
    # sanitizing step before the id is used as a file name.
    arxiv_id = entry.id.split("/abs/")[-1]
    sanitized_arxiv_id = sanitize_filename(arxiv_id)
    print(f'arxiv-id: {sanitized_arxiv_id}')
    print(f'Published: {entry.published}')
    print(f'Title:  {entry.title}')

    # feedparser v4.1 only grabs the first author
    author_string = entry.author

    # grab the affiliation in <arxiv:affiliation> if present
    # - this will only grab the first affiliation encountered
    #   (the first affiliation for the first author)
    # Please email the list with a way to get all of this information!
    affiliation = getattr(entry, 'arxiv_affiliation', None)
    if affiliation is not None:
        author_string += f' ({affiliation})'

    print(f'Last Author:  {author_string}')

    # feedparser v5.0.1 correctly handles multiple authors, print them all
    try:
        print(f'Authors:  {", ".join(author.name for author in entry.authors)}')
    except AttributeError:
        # Older feedparser versions expose no usable author list; skip the line
        pass

    # get the links to the abs page and construct the pdf link
    abs_link = None
    pdf_link = None
    for link in entry.links:
        if link.rel == 'alternate':
            abs_link = link.href
            pdf_link = abs_link.replace('/abs/', '/pdf/')
            print(f'abs page link: {abs_link}')
            print(f'pdf link: {pdf_link}')

    # Download the PDF file if the link is found
    if pdf_link:
        pdf_filename = os.path.join(pdf_dir, f"{sanitized_arxiv_id}.pdf")
        # Download under a temporary name and rename on success, so that an
        # interrupted transfer never leaves a truncated file that looks like
        # a complete PDF.
        partial_filename = pdf_filename + '.part'
        try:
            print(f'Downloading PDF to {pdf_filename}...')
            urllib.request.urlretrieve(pdf_link, partial_filename)
            os.replace(partial_filename, pdf_filename)
        except OSError as e:
            # HTTPError, URLError and ContentTooShortError are all OSError
            # subclasses, as are timeouts and connection resets; one failed
            # download should not abort the remaining entries.
            print(f'Failed to download {pdf_filename}: {e}')
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
        # Be polite to the arXiv servers between consecutive requests
        time.sleep(download_delay)

    # The journal reference, comments and primary_category sections live under
    # the arxiv namespace
    journal_ref = getattr(entry, 'arxiv_journal_ref', 'No journal ref found')
    print(f'Journal reference: {journal_ref}')

    comment = getattr(entry, 'arxiv_comment', 'No comment found')
    print(f'Comments: {comment}')

    # Since the <arxiv:primary_category> element has no data, only
    # attributes, feedparser does not store anything inside
    # entry.arxiv_primary_category
    # This is a dirty hack to get the primary_category, just take the
    # first element in entry.tags.  If anyone knows a better way to do
    # this, please email the list!
    tags = getattr(entry, 'tags', [])
    # Guard against an entry without any category tags instead of crashing
    primary_category = tags[0]['term'] if tags else 'No category found'
    print(f'Primary Category: {primary_category}')

    # Lets get all the categories
    all_categories = [t['term'] for t in tags]
    print(f'All Categories: {", ".join(all_categories)}')

    # The abstract is in the <summary> element
    print(f'Abstract: {entry.summary}')


Feed title: ArXiv Query: search_query=all:nuclear AND all:reactor&amp;id_list=&amp;start=0&amp;max_results=1000
Feed last updated: 2024-06-21T00:00:00-04:00
totalResults for this query: 1470
itemsPerPage for this query: 1000
startIndex for this query: 0
e-print metadata
arxiv-id: 1611.01575v1
Published: 2016-11-05T00:19:59Z
Title:  Reactor Antineutrino Signals at Morton and Boulby
Last Author:  Steve Dye
Authors:  Steve Dye
abs page link: http://arxiv.org/abs/1611.01575v1
pdf link: http://arxiv.org/pdf/1611.01575v1
Downloading PDF to arxiv_pdfs\1611.01575v1.pdf...
Journal reference: No journal ref found
Comments: 8 pages, 9 figures, 5 tables
Primary Category: nucl-ex
All Categories: nucl-ex, physics.ins-det
Abstract: Increasing the distance from which an antineutrino detector is capable of
monitoring the operation of a registered reactor, or discovering a clandestine
reactor, strengthens the Non-Proliferation of Nuclear Weapons Treaty. This
report presents calculations of reactor antin