In [61]:
import re
import requests as rq
from urllib.parse import urljoin
import bs4
from bs4 import BeautifulSoup
from pathlib import Path
import os
import json

In [62]:
def get_paper_urls(navigation: bs4.element.ResultSet, n: int) -> list:
    '''
    Obtains the url link to each paper which appears in the navigation list.
    '''
    paper_urls = []
    
    for i, item in enumerate(navigation):
        id_ = item.select_one("a[title='Abstract'][id]")
        html_tag = item.select_one("a[title='View HTML'][id^='html-']")
        html_url = urljoin(arxiv_url, html_tag["href"]) if html_tag else None
    
        if not html_url:
            continue
        else:
            paper_urls.append(html_url)

        if len(paper_urls) >= n_papers:
            break

    return paper_urls


def get_title(soup: BeautifulSoup) -> str:
    '''
    Gets the text content of the title tag.
    '''
    title = soup.find("title").get_text()
    
    return title


def get_abstract(soup: BeautifulSoup) -> str:
    '''
    Gets the text content of the abstract tag.
    '''
    abstract_h6 = soup.find("h6", class_="ltx_title ltx_title_abstract")
    p = abstract_h6.find_next("p", class_="ltx_p")
    abstract_text = p.get_text()
    
    return abstract_text


def check_file_exists(path: str) -> bool:
    '''
    Check file exists.
    '''
    return os.path.exists(path)

In [63]:
# Create the data path if necessary
data_path = "../data"

individual_paths = [(Path(data_path)/"raw"/"htmls"), (Path(data_path)/"raw"/"abstracts"), (Path(data_path)/"raw"/"parsed_sections")]
for path in individual_paths:
    (Path(data_path)/"raw"/"htmls").mkdir(parents=True, exist_ok=False)
    (Path(data_path)/"raw"/"abstracts").mkdir(parents=True, exist_ok=False)
    (Path(data_path)/"raw"/"parsed_sections").mkdir(parents=True, exist_ok=False)

In [64]:
# Set the parameters for scraping the website
ARXIV_BASE_URL = "https://arxiv.org/list"
arxiv_topic = "/cs.AI/recent?skip=0&show=1000"
arxiv_url = f"{ARXIV_BASE_URL}{arxiv_topic}"
print(f"Querying {arxiv_url}")

Querying https://arxiv.org/list/cs.AI/recent?skip=0&show=1000


In [65]:
# Query the recently uploaded papers list
headers = {"User-Agent": "Mozilla/5.0"}
response = rq.get(arxiv_url, headers = headers)
response_html = response.text

In [66]:
# Obtain the list of items from the papers list
main_soup = BeautifulSoup(response_html, "html.parser")
navigation = main_soup.select("dl > dt")

In [67]:
# Obtain a list of n links to the recent papers for the chosen topic
# TO DO: Implement a handler for when the amount of documents is less than n due to issues with the HTML content of the linked page.
n_papers = 60
paper_urls = get_paper_urls(navigation=navigation, n=n_papers)

In [68]:
# Recursively extract sections from soup representation of html
def get_sections(tag: BeautifulSoup):
    section_tags = tag.find_all("section", recursive=False)
    sections = []


    for section_tag in section_tags:

        paragraphs = [
        p.get_text(" ", strip=True)
        for p in section_tag.find_all("p")
        ]
        section_dict = {
            "title": section_tag.find(["h1",
                                       "h2",
                                       "h3",
                                       "h4",
                                       "h5",
                                       "h6",
                                       "h7"]).get_text().strip(),
            "paragraphs": "\n".join(paragraphs),
            "subsections": get_sections(section_tag)
        }
        sections.append(section_dict)
    return sections

    return get_sections(get_sections(
        tag.find("html", recursive=False).find("body",recursive=False).find("div",class_="ltx_page_main", recursive=False).find("div",class_="ltx_page_content", recursive=False).find("article", recursive=False)))


In [69]:
paper_contents = []

for url in paper_urls:
    paper_html_content = rq.get(url, headers = headers).text

    if len(paper_html_content) > 3000:
        html_id = url.split("/")[-1]
        print(url)
        
        # Check if we already downloaded both files, otherwise skip it to reduce processing time.
        full_content_file = f"{data_path}/raw/htmls/{html_id}.txt"
        abstract_content_file = f"{data_path}/raw/abstracts/{html_id}.txt"
        parsed_sections_file = f"{data_path}/raw/parsed_sections/{html_id}.json"
        full_file_already_exists = check_file_exists(path=full_content_file)
        abs_file_already_exists = check_file_exists(path=abstract_content_file)

        if full_file_already_exists and abs_file_already_exists:
            print(f"Both files for {html_id} already downloaded, skipping.")
            continue

        # Get content of paper url
        paper_soup = BeautifulSoup(paper_html_content, "html.parser")
        title = get_title(soup=paper_soup)

        # Obtain the abstract from the paper content
        try:
            abstract = get_abstract(paper_soup)
        except AttributeError as e:
            print(f"\033[93m Skipping {html_id} as it does not have an abstract. \033[0m")
            continue
        try:
            sections = get_sections(paper_soup.find("html",recursive=False).find("body", recursive=False).find("div",class_="ltx_page_main", recursive=False).find("div",class_="ltx_page_content", recursive=False).find("article", recursive=False))
        except Exception as e:
            print(f"\033[93m Error parsing html sections \033[0m")
            continue
        # Write files
        print(f"Downloading {url} \n\t Title: {title}")
        with open(full_content_file, "w", encoding="utf8") as f:
            f.writelines(paper_html_content)
            print(f"Wrote file {full_content_file}")
        with open(abstract_content_file, "w", encoding="utf8") as f:
            f.writelines(abstract)
            print(f"Wrote file {abstract_content_file}")
        with open(parsed_sections_file, "w", encoding="utf-8") as f:
            json.dump(sections, f, ensure_ascii=False, indent=2)
        
        
    else:
        print(f"\033[93m Skipping {url}, not enough characters in HTML content. \033[0m")


https://arxiv.org/html/2510.26784v1
Both files for 2510.26784v1 already downloaded, skipping.
https://arxiv.org/html/2510.26752v1
Downloading https://arxiv.org/html/2510.26752v1 
	 Title: The Oversight Game: Learning to Cooperatively Balance an AI Agent’s Safety and Autonomy
Wrote file ../data/raw/htmls/2510.26752v1.txt
Wrote file ../data/raw/abstracts/2510.26752v1.txt
https://arxiv.org/html/2510.26732v1
Downloading https://arxiv.org/html/2510.26732v1 
	 Title: Cross-Platform Evaluation of Reasoning Capabilities in Foundation Models
Wrote file ../data/raw/htmls/2510.26732v1.txt
Wrote file ../data/raw/abstracts/2510.26732v1.txt
https://arxiv.org/html/2510.26721v1
Downloading https://arxiv.org/html/2510.26721v1 
	 Title: Unveiling Intrinsic Text Bias in Multimodal Large Language Models through Attention Key-Space Analysis
Wrote file ../data/raw/htmls/2510.26721v1.txt
Wrote file ../data/raw/abstracts/2510.26721v1.txt
https://arxiv.org/html/2510.26702v1
Downloading https://arxiv.org/html/2