In [1]:
import re
import requests as rq
from urllib.parse import urljoin
import bs4
from bs4 import BeautifulSoup
from pathlib import Path
import os

In [2]:
def get_paper_urls(navigation: bs4.element.ResultSet, n: int, base_url: str = None) -> list:
    '''
    Collects the "View HTML" links of the first n listing entries.

    Parameters:
        navigation: listing entries; each one is searched for an
            <a title='View HTML'> tag whose id starts with "html-".
        n: number of leading entries of navigation to inspect.
        base_url: URL against which relative hrefs are resolved. When omitted,
            the notebook-level arxiv_url is used.

    Returns:
        A list with the absolute HTML URL of every inspected entry that has
        one. Entries without an HTML link are skipped, so the list may hold
        fewer than n items.
    '''
    # Fall back to the notebook-level listing URL when no explicit base is given.
    root_url = arxiv_url if base_url is None else base_url

    paper_urls = []

    # Slice with the n argument (not a notebook global) so the function honours its own parameter.
    for item in navigation[:n]:
        html_tag = item.select_one("a[title='View HTML'][id^='html-']")
        if html_tag is None:
            # This entry has no HTML version.
            continue

        href = html_tag.get("href")
        if not href:
            # A link tag without a target cannot be resolved to a URL.
            continue

        paper_urls.append(urljoin(root_url, href))

    return paper_urls


def get_title(soup: BeautifulSoup) -> str:
    '''
    Gets the text content of the title tag.

    Returns an empty string when the document has no title tag, so that a
    page without a title does not abort the caller with an AttributeError.
    '''
    title_tag = soup.find("title")
    if title_tag is None:
        return ""

    return title_tag.get_text()


def get_abstract(soup: BeautifulSoup) -> str:
    '''
    Returns the text of the first "ltx_p" paragraph found after the abstract heading.

    The heading is the h6 tag with class "ltx_title ltx_title_abstract".
    An AttributeError is raised when the heading or the paragraph is missing;
    the download loop catches it to skip papers without an abstract.
    '''
    heading = soup.find("h6", class_="ltx_title ltx_title_abstract")
    first_paragraph = heading.find_next("p", class_="ltx_p")

    return first_paragraph.get_text()


def check_file_exists(path: str) -> bool:
    '''
    Tells whether the given path exists on disk.
    '''
    found = os.path.exists(path)

    return found

In [3]:
# Root of the data folder; the download cell writes under "raw/htmls" and "raw/abstracts" inside it.
data_path = "../data"

In [4]:
# Listing page to scrape; get_paper_urls also resolves relative paper links against it.
# Alternative category, kept for quick switching:
# arxiv_url = "https://arxiv.org/list/cs.AI/recent"
arxiv_url = "https://arxiv.org/list/econ.TH/recent"

In [5]:
# Query the recently uploaded papers list.
# The timeout keeps the cell from hanging forever if the server never answers, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page below.
response = rq.get(arxiv_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
response.raise_for_status()
response_html = response.text

In [6]:
# Obtain the list of items from the papers list.
# Every <dt> that is a direct child of a <dl> is kept; get_paper_urls later
# searches each of them for a "View HTML" link.
main_soup = BeautifulSoup(response_html, "html.parser")
navigation = main_soup.select("dl > dt")

In [7]:
# Number of leading listing entries to inspect for an HTML link.
# Alternative value, kept for quick switching:
# n_papers = 50
n_papers = 28
paper_urls = get_paper_urls(navigation=navigation, n=n_papers)

In [10]:
# Download the HTML page of every paper and store the raw page and its abstract on disk.
paper_contents = []

# Pages with at most this many characters are treated as not being a real paper page.
min_html_chars = 3000
request_headers = {"User-Agent": "Mozilla/5.0"}

for url in paper_urls:
    html_id = url.split("/")[-1]
    full_content_file = f"{data_path}/raw/htmls/{html_id}.txt"
    abstract_content_file = f"{data_path}/raw/abstracts/{html_id}.txt"

    # Check for existing files BEFORE touching the network: skipping is only a
    # saving if the download itself is avoided.
    full_file_already_exists = check_file_exists(path=full_content_file)
    abs_file_already_exists = check_file_exists(path=abstract_content_file)

    if full_file_already_exists and abs_file_already_exists:
        print(f"Both files for {html_id} already downloaded, skipping.")
        continue

    # A timeout prevents one stalled connection from blocking the whole run, and a
    # failed request only skips this paper instead of aborting the loop.
    try:
        paper_response = rq.get(url, headers=request_headers, timeout=30)
        paper_response.raise_for_status()
    except rq.RequestException as err:
        print(f"\033[93m Skipping {url}, request failed: {err} \033[0m")
        continue

    paper_html_content = paper_response.text

    if len(paper_html_content) <= min_html_chars:
        print(f"\033[93m Skipping {url}, not enough characters in HTML content. \033[0m")
        continue

    # Get content of paper url
    paper_soup = BeautifulSoup(paper_html_content, "html.parser")
    title = get_title(soup=paper_soup)

    # Obtain the abstract from the paper content.
    # get_abstract raises AttributeError when the abstract markup is missing.
    try:
        abstract = get_abstract(paper_soup)
    except AttributeError:
        print(f"\033[93m Skipping {html_id} as it does not have an abstract. \033[0m")
        continue

    # Write files
    print(f"Downloading {url} \n\t Title: {title}")

    # Create the output folders on first use so a fresh checkout does not fail on open().
    for target_file in (full_content_file, abstract_content_file):
        Path(target_file).parent.mkdir(parents=True, exist_ok=True)

    # write() stores the string in one call; writelines() on a str iterates it character by character.
    with open(full_content_file, "w", encoding="utf8") as f:
        f.write(paper_html_content)
    print(f"Wrote file {full_content_file}")

    with open(abstract_content_file, "w", encoding="utf8") as f:
        f.write(abstract)
    print(f"Wrote file {abstract_content_file}")


Downloading https://arxiv.org/html/2510.24388v1 
	 Title: A Characterization of Egalitarian and Proportional Sharing Principles: An Efficient Extension Operator Approach1footnote 11footnote 1We thank Phillipe Solal, Sylvain Ferrières Sylvain Béal, Juan D. Moreno-Ternero, Toru Hokari, Stéphane Gonzalez, David Lowing, Kevin Techker, Susumu Cato, Takashi Ui, Nobuo Koida, Shintaro Miura, Florian Navarro, Hendrik Rommeswinkel, and participants in EAGT 2024, Summer workshop 2024, RISS workshop 2025 in Kansai University, Prof. Koichi Tadenuma retirement conference, Université Marie et Louis Pasteur, Hitotsubashi University, Kwansei-gakuin Univeristy, SING 2025, University of Saint-Etienne, and Networks and Games seminars at CES for helpful comments. Nakada acknowledges the financial support from Japan Society for the Promotion of Science KAKENHI: No.19K13651, 20KK0036, and 25K16606. Koriyama acknowledges the financial support from Investissements d’Avenir, ANR-11-IDEX-0003/Labex Ecodec/ANR-11