In [7]:
import re
import requests as rq
from urllib.parse import urljoin
import bs4
from bs4 import BeautifulSoup
from pathlib import Path
import os

In [8]:
def get_paper_urls(navigation: bs4.element.ResultSet, n: int, base_url: "str | None" = None) -> list:
    '''
    Collects the "View HTML" links of the first ``n`` listing entries.

    Parameters:
        navigation: sequence of listing entries; each entry must support
            ``select_one`` with a CSS selector.
        n: maximum number of entries to inspect, counted from the start of
            ``navigation``. Entries without an HTML link are skipped, so the
            result may contain fewer than ``n`` URLs.
        base_url: URL that relative ``href`` values are resolved against.
            When omitted, the module-level ``arxiv_url`` is used.

    Returns:
        List of absolute URLs, in listing order.
    '''
    if base_url is None:
        # Fall back to the notebook-level listing URL to stay compatible with
        # callers that only pass ``navigation`` and ``n``.
        base_url = arxiv_url

    paper_urls = []

    # Slice with the ``n`` argument rather than a notebook global so the
    # function honours what the caller actually asked for.
    for item in navigation[:n]:
        html_tag = item.select_one("a[title='View HTML'][id^='html-']")
        if html_tag is None:
            # This entry has no HTML rendering link; nothing to collect.
            continue

        html_url = urljoin(base_url, html_tag["href"])
        if html_url:
            paper_urls.append(html_url)

    return paper_urls


def get_title(soup: BeautifulSoup) -> str:
    '''
    Returns the text held by the first "title" element found in ``soup``.

    If the lookup yields ``None``, the ``get_text`` call raises
    ``AttributeError``; that error is not handled here.
    '''
    title_tag = soup.find("title")
    return title_tag.get_text()


def get_abstract(soup: BeautifulSoup) -> str:
    '''
    Returns the text of the paragraph that follows the abstract heading.

    The heading is looked up as an "h6" element with class
    "ltx_title ltx_title_abstract"; the abstract is the next "p" element with
    class "ltx_p" after it. If either lookup yields ``None``, the following
    attribute access raises ``AttributeError``, which is left to the caller.
    '''
    heading = soup.find("h6", class_="ltx_title ltx_title_abstract")
    paragraph = heading.find_next("p", class_="ltx_p")
    return paragraph.get_text()


def check_file_exists(path: str) -> bool:
    '''
    Tells whether ``path`` exists on the filesystem.

    Thin wrapper around ``os.path.exists``; returns its result unchanged.
    '''
    path_exists = os.path.exists(path)
    return path_exists

In [26]:
# Root folder for everything this notebook writes; kept as a plain string
# because later cells build file names from it with f-strings.
data_path = "../data"

# Create every leaf directory on its own. ``parents=True`` builds any missing
# ancestors and ``exist_ok=True`` makes the cell safe to re-run, so a
# partially created tree (e.g. ``../data`` present but ``raw/abstracts``
# missing) is repaired instead of being skipped.
for raw_subdir in ("htmls", "abstracts"):
    (Path(data_path) / "raw" / raw_subdir).mkdir(parents=True, exist_ok=True)


In [27]:
# arXiv listing page of recent cs.AI papers. It is the page requested below and
# the base URL that get_paper_urls resolves paper links against.
arxiv_url = "https://arxiv.org/list/cs.AI/recent"

In [28]:
# Query the recently uploaded papers list.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the run on an HTTP error instead of letting an
# error page be parsed as if it were the listing.
response = rq.get(arxiv_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
response.raise_for_status()
response_html = response.text

In [29]:
# Parse the listing HTML, then pick every <dt> that is a direct child of a <dl>;
# these are the entries handed to get_paper_urls.
main_soup = BeautifulSoup(markup=response_html, features="html.parser")
navigation = main_soup.select("dl > dt")

In [30]:
# Upper bound on how many listing entries are inspected. Entries without an
# HTML link are dropped by get_paper_urls, so fewer URLs may come back.
n_papers = 50
paper_urls = get_paper_urls(navigation=navigation, n=n_papers)

In [31]:
# Not populated by this cell; kept so the name stays defined for any other
# cell that refers to it.
paper_contents = []

# Pages shorter than this many characters are treated as placeholder/error
# pages rather than real paper renderings.
MIN_HTML_CHARS = 3000

for url in paper_urls:
    # The last URL segment (e.g. "2510.24690v1") names both output files.
    html_id = url.split("/")[-1]
    full_content_file = f"{data_path}/raw/htmls/{html_id}.txt"
    abstract_content_file = f"{data_path}/raw/abstracts/{html_id}.txt"

    # Skip papers whose two files are already on disk. This runs BEFORE the
    # request so that re-running the cell does not download them again.
    if check_file_exists(path=full_content_file) and check_file_exists(path=abstract_content_file):
        print(f"Both files for {html_id} already downloaded, skipping.")
        continue

    # Fetch the paper page; a failed or timed-out request only skips this
    # paper instead of aborting the whole run.
    try:
        paper_html_content = rq.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30).text
    except rq.RequestException as error:
        print(f"\033[93m Skipping {url}, request failed: {error} \033[0m")
        continue

    if len(paper_html_content) <= MIN_HTML_CHARS:
        print(f"\033[93m Skipping {url}, not enough characters in HTML content. \033[0m")
        continue

    # Get content of paper url
    paper_soup = BeautifulSoup(paper_html_content, "html.parser")
    title = get_title(soup=paper_soup)

    # Obtain the abstract from the paper content. get_abstract raises
    # AttributeError when the expected abstract elements are not found.
    try:
        abstract = get_abstract(paper_soup)
    except AttributeError:
        print(f"\033[93m Skipping {html_id} as it does not have an abstract. \033[0m")
        continue

    # Write files (write(), not writelines(): both values are single strings)
    print(f"Downloading {url} \n\t Title: {title}")
    with open(full_content_file, "w", encoding="utf8") as f:
        f.write(paper_html_content)
        print(f"Wrote file {full_content_file}")
    with open(abstract_content_file, "w", encoding="utf8") as f:
        f.write(abstract)
        print(f"Wrote file {abstract_content_file}")


Downloading https://arxiv.org/html/2510.24690v1 
	 Title: Bridging Tool Dependencies and Domain Knowledge: A Graph-Based Framework for In-Context Planning
Wrote file ../data/raw/htmls/2510.24690v1.txt
Wrote file ../data/raw/abstracts/2510.24690v1.txt
Downloading https://arxiv.org/html/2510.24663v1 
	 Title: OrchDAG: Complex Tool Orchestration in Multi-Turn Interactions with Plan DAGs
Wrote file ../data/raw/htmls/2510.24663v1.txt
Wrote file ../data/raw/abstracts/2510.24663v1.txt
Downloading https://arxiv.org/html/2510.24650v1 
	 Title: Advancing site-specific disease and pest management in precision agriculture: From reasoning-based foundation models to adaptive, feedback-driven learning
Wrote file ../data/raw/htmls/2510.24650v1.txt
Wrote file ../data/raw/abstracts/2510.24650v1.txt
Downloading https://arxiv.org/html/2510.24645v1 
	 Title: FunReason-MT Technical Report: Overcoming the Complexity Barrier in Multi-Turn Function Calling
Wrote file ../data/raw/htmls/2510.24645v1.txt
Wrote fi