Manual: https://arxiv.org/help/api/user-manual#Quickstart

In [60]:
import urllib
import time
from bs4 import BeautifulSoup
from enum import Enum
import pandas as pd

In [None]:
# Specify categories to search
categories = [
    'astro-ph',     # general astrophysics
    'astro-ph.ga',  # astrophysics of galaxies
    'astro-ph.co',  # cosmology and nongalactic astrophysics
    'astro-ph.ep',  # earth and planetary astrophysics
    'astro-ph.he',  # high energy astrophysical phenomena
    'astro-ph.im',  # instrumentation and methods for astrophysics 
    'astro-ph.sr']  # solar and stellar astrophysics

# Base API query URL
base_url = 'http://export.arxiv.org/api/query?'
results_per_page = 10

rows = []

for category in categories:
    # Search parameters
    search_query = 'cat:' + category
    startIndex = 0
    
    # Loop until total results reached
    while True:
        if startIndex == 0:
            total_results = int(soup.find('opensearch:totalResults').string)
        
        url = 'http://export.arxiv.org/api/query?search_query=' + search_query + '&start=' + str(startIndex)
        results = urllib.request.urlopen(url).read()

        soup = BeautifulSoup(results, 'xml')
        # Get all entry tags
        entries = soup.find_all('entry')

        for entry in entries: 
            # Collect authors
            authors = []
            for author in entry.find_all('name'):
                authors.append(author.string)
            author_str = ', '.join(authors)

            # Get DOI if it exists
            doi = entry.find('arxiv:doi')
            if doi:
                doi = doi.string

            # Extract links if they exist
            doi_link = None
            pdf_link = None
            links = entry.find_all('link')
            for link in links:
                link_title = link.get('title')
                if link_title and link_title == 'doi':
                    doi_link = link['href']
                elif link_title and link_title == 'pdf':
                    pdf_link = link['href']

            # Get journal if it exists
            journal = entry.find('arxiv:journal_ref')
            if journal:
                journal = journal.string

            # Get comment if it exists
            comment = entry.find('arxiv:comment')
            if comment:
                comment = comment.string

            row = {
                'id': entry.id.string,
                'updated': entry.updated.string,
                'published': entry.published.string,
                'title': entry.title.string,
                'summary': entry.summary.string,
                'authors': author_str,
                'doi': doi,
                'doi_link': doi_link,
                'journal': journal,
                'pdf_link': pdf_link,
                'category': entry.find('arxiv:primary_category')['term'],
                'comment': comment
            }
            rows.append(row)
            
            # Quit looping if we have gotten all results
            print(str(len(rows)) + ' of ' + str(total_results) + '...')
            if len(rows) == total_results:
                break
            else: 
                startIndex += results_per_page
                time.sleep(3) # recommended to sleep

1 of 105380...
2 of 105380...
3 of 105380...
4 of 105380...
5 of 105380...
6 of 105380...
7 of 105380...
8 of 105380...
9 of 105380...
10 of 105380...
11 of 105380...
12 of 105380...
13 of 105380...
14 of 105380...
15 of 105380...
16 of 105380...
17 of 105380...
18 of 105380...
19 of 105380...
20 of 105380...
21 of 105380...
22 of 105380...
23 of 105380...
24 of 105380...


In [131]:
arxiv_metadata_df = pd.DataFrame(rows)
arxiv_metadata_df

Unnamed: 0,authors,category,comment,doi,doi_link,id,journal,pdf_link,published,summary,title,updated
0,"Ramesh Narayan, Bohdan Paczyński, Tsvi Piran",astro-ph,14 pages,10.1086/186493,http://dx.doi.org/10.1086/186493,http://arxiv.org/abs/astro-ph/9204001v1,Astrophys.J. 395 (1992) L83-L86,http://arxiv.org/pdf/astro-ph/9204001v1,1992-04-13T18:20:01Z,It is proposed that gamma-ray bursts are cre...,Gamma-Ray Bursts as the Death Throes of Massiv...,1992-04-13T18:20:01Z
1,"Lawrence Krauss, Martin White",astro-ph,13 pages plus figures (not included),10.1086/171792,http://dx.doi.org/10.1086/171792,http://arxiv.org/abs/astro-ph/9204002v1,"Astrophys.J.397:357,1992",http://arxiv.org/pdf/astro-ph/9204002v1,1992-04-26T17:54:00Z,The four observables associated with gravita...,Gravitational Lensing and the Variability of G,1992-04-26T17:54:00Z
2,J. I. Katz,astro-ph,10 pages (Replaced to provide omitted line.),10.1007/BF00645080,http://dx.doi.org/10.1007/BF00645080,http://arxiv.org/abs/astro-ph/9204003v2,,http://arxiv.org/pdf/astro-ph/9204003v2,1992-04-29T16:36:30Z,The BATSE experiment on GRO has demonstrated...,The Ptolemaic Gamma-Ray Burst Universe,1992-04-30T20:39:38Z
3,"B P Schmidt, R P Kirshner, R G Eastman",astro-ph,21 pages,10.1086/171659,http://dx.doi.org/10.1086/171659,http://arxiv.org/abs/astro-ph/9204004v1,Astrophys.J. 395 (1992) 366-386,http://arxiv.org/pdf/astro-ph/9204004v1,1992-04-30T19:20:04Z,We use the Expanding Photosphere Method to d...,Expanding Photospheres of Type II Supernovae a...,1992-04-30T19:20:04Z
4,"B. J. Carrigan, J. I. Katz",astro-ph,24 pages,10.1086/171906,http://dx.doi.org/10.1086/171906,http://arxiv.org/abs/astro-ph/9204005v1,Astrophys.J. 399 (1992) 100-107,http://arxiv.org/pdf/astro-ph/9204005v1,1992-04-30T19:18:05Z,We have calculated gamma-ray radiative trans...,Radiation Transfer in Gamma-Ray Bursts,1992-04-30T19:18:05Z
5,"D. J. Johnson, M. W. Friedlander, J. I. Katz",astro-ph,29 pages,10.1086/172552,http://dx.doi.org/10.1086/172552,http://arxiv.org/abs/astro-ph/9204006v1,,http://arxiv.org/pdf/astro-ph/9204006v1,1992-04-30T19:30:14Z,Dust is observed to form in nova ejecta. The...,Nova Dust Nucleation: Kinetics and Photodissoc...,1992-04-30T19:30:14Z
6,Valerio Faraoni,astro-ph,12 pages,10.1086/171866,http://dx.doi.org/10.1086/171866,http://arxiv.org/abs/astro-ph/9205001v1,"Astrophys.J.398:425,1992",http://arxiv.org/pdf/astro-ph/9205001v1,1992-05-01T16:41:45Z,We apply Perlick's (1990a) rigorous formulat...,Nonstationary Gravitational Lenses and the Fer...,1992-05-01T16:41:45Z
7,"T. Hanawa, R. Matsumoto, K. Shibata",astro-ph,12 pages,10.1086/186454,http://dx.doi.org/10.1086/186454,http://arxiv.org/abs/astro-ph/9205002v1,Astrophys.J. 393 (1992) L71-L74,http://arxiv.org/pdf/astro-ph/9205002v1,1992-05-02T02:12:56Z,The effect of the magnetic skew on the Parke...,Giant Molecular Cloud Formation through the Pa...,1992-05-02T02:12:56Z
8,J. I. Katz,astro-ph,3 pages,10.1063/1.42698,http://dx.doi.org/10.1063/1.42698,http://arxiv.org/abs/astro-ph/9205003v1,,http://arxiv.org/pdf/astro-ph/9205003v1,1992-05-04T18:57:07Z,I present a model for acceleration of proton...,Particle Acceleration in (by) Accretion Discs,1992-05-04T18:57:07Z
9,"A. Cappi, S. Maurogordato",astro-ph,20 pages,,,http://arxiv.org/abs/astro-ph/9205004v1,"Astron.Astrophys.259:423-434,1992",http://arxiv.org/pdf/astro-ph/9205004v1,1992-05-08T12:22:00Z,We compare the spatial distributions of gala...,The Spatial Distribution of Nearby Galaxy Clus...,1992-05-08T12:22:00Z
