In [43]:
import re
import requests as rq
from urllib.parse import urljoin
import bs4
from bs4 import BeautifulSoup
from pathlib import Path
import os

In [44]:
def get_paper_urls(navigation: "bs4.element.ResultSet", n: int, base_url: "str | None" = None) -> list:
    '''
    Obtains the url link to the HTML version of each paper which appears in the navigation list.

    Entries that have no "View HTML" link (or whose link has no href) are skipped.
    Collection stops as soon as `n` links have been gathered.

    Parameters:
        navigation: iterable of listing entries; each one must support `select_one`.
        n: maximum number of urls to return. Values <= 0 yield an empty list.
        base_url: url against which relative hrefs are resolved. When omitted,
            the notebook-level `arxiv_url` is used.

    Returns:
        A list with at most `n` absolute urls, in listing order.
    '''
    paper_urls = []

    if n <= 0:
        return paper_urls

    if base_url is None:
        # Fall back to the listing url defined at notebook level.
        base_url = arxiv_url

    for item in navigation:
        html_tag = item.select_one("a[title='View HTML'][id^='html-']")
        href = html_tag.get("href") if html_tag is not None else None

        if not href:
            # This entry has no HTML rendering to download.
            continue

        paper_urls.append(urljoin(base_url, href))

        # Honour the `n` argument rather than a notebook-level global.
        if len(paper_urls) >= n:
            break

    return paper_urls


def get_title(soup: "BeautifulSoup") -> str:
    '''
    Gets the text content of the title tag.

    Returns an empty string when the document has no <title> tag, so that a
    page without a title does not raise AttributeError in the caller.
    '''
    title_tag = soup.find("title")

    if title_tag is None:
        return ""

    return title_tag.get_text()


def get_abstract(soup: BeautifulSoup) -> str:
    '''
    Extracts the abstract text from a parsed paper page.

    Looks up the <h6> heading with class "ltx_title ltx_title_abstract" and
    returns the text of the first <p class="ltx_p"> found after it. Only that
    first paragraph is returned.

    Raises AttributeError when the heading or the paragraph is missing; the
    download loop relies on this to skip papers without an abstract.
    '''
    heading = soup.find("h6", class_="ltx_title ltx_title_abstract")
    first_paragraph = heading.find_next("p", class_="ltx_p")

    return first_paragraph.get_text()


def check_file_exists(path: str) -> bool:
    '''
    Reports whether `path` exists on disk.

    Delegates to os.path.exists, so an existing directory also returns True.
    '''
    exists = os.path.exists(path)

    return exists

In [45]:
# Create the data directories if necessary
data_path = "../data"

# Create each leaf directory independently: parents=True builds the missing
# ancestors and exist_ok=True makes the cell safe to re-run. This also repairs
# a partially created tree (e.g. "../data" present but "raw/abstracts" missing).
for sub_dir in ("htmls", "abstracts"):
    (Path(data_path) / "raw" / sub_dir).mkdir(parents=True, exist_ok=True)

In [46]:
# Scraping parameters: the arXiv listing endpoint plus the category/query to fetch
ARXIV_BASE_URL = "https://arxiv.org/list"
arxiv_topic = "/cs.AI/recent?skip=0&show=1000"

# Full listing url = endpoint + category path and paging query
arxiv_url = ARXIV_BASE_URL + arxiv_topic

print(f"Querying {arxiv_url}")

Querying https://arxiv.org/list/cs.AI/recent?skip=0&show=1000


In [47]:
# Query the recently uploaded papers list
headers = {"User-Agent": "Mozilla/5.0"}

# requests waits forever by default; the timeout keeps the cell from hanging
# if the server stops responding.
response = rq.get(arxiv_url, headers=headers, timeout=30)

# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
response_html = response.text

In [48]:
# Parse the listing page and collect its entries (every <dt> directly under a <dl>)
main_soup = BeautifulSoup(response_html, features="html.parser")
navigation = main_soup.select("dl > dt")

In [49]:
# Collect up to n_papers links to the HTML version of the recent papers for the chosen topic.
# TODO: handle the case where fewer than n_papers documents end up usable because
# of issues with the HTML content of the linked pages.
n_papers = 60
paper_urls = get_paper_urls(navigation, n_papers)

In [50]:
# NOTE(review): this list is never populated in this cell; it is kept in case
# a later cell references it.
paper_contents = []

# Responses at or below this many characters are skipped as not containing a
# usable paper (presumably placeholder/error pages — confirm the threshold).
MIN_HTML_LENGTH = 3000

for url in paper_urls:
    html_id = url.split("/")[-1]

    # Check if we already downloaded both files BEFORE hitting the network,
    # so that skipping actually saves the request as well as the parsing.
    full_content_file = f"{data_path}/raw/htmls/{html_id}.txt"
    abstract_content_file = f"{data_path}/raw/abstracts/{html_id}.txt"

    if check_file_exists(path=full_content_file) and check_file_exists(path=abstract_content_file):
        print(f"Both files for {html_id} already downloaded, skipping.")
        continue

    # Get content of paper url; one failing request must not abort the whole run.
    try:
        paper_html_content = rq.get(url, headers=headers, timeout=30).text
    except rq.RequestException as exc:
        print(f"\033[93m Skipping {url}, request failed: {exc} \033[0m")
        continue

    if len(paper_html_content) <= MIN_HTML_LENGTH:
        print(f"\033[93m Skipping {url}, not enough characters in HTML content. \033[0m")
        continue

    paper_soup = BeautifulSoup(paper_html_content, "html.parser")

    # The title is only used for logging, so a missing <title> is not fatal.
    try:
        title = get_title(soup=paper_soup)
    except AttributeError:
        title = ""

    # Obtain the abstract from the paper content
    try:
        abstract = get_abstract(paper_soup)
    except AttributeError:
        print(f"\033[93m Skipping {html_id} as it does not have an abstract. \033[0m")
        continue

    # Write files
    print(f"Downloading {url} \n\t Title: {title}")
    with open(full_content_file, "w", encoding="utf8") as f:
        f.write(paper_html_content)
        print(f"Wrote file {full_content_file}")
    with open(abstract_content_file, "w", encoding="utf8") as f:
        f.write(abstract)
        print(f"Wrote file {abstract_content_file}")


Both files for 2510.25758v1 already downloaded, skipping.
Both files for 2510.25724v1 already downloaded, skipping.
Both files for 2510.25679v1 already downloaded, skipping.
Both files for 2510.25668v1 already downloaded, skipping.
Both files for 2510.25612v1 already downloaded, skipping.
Both files for 2510.25588v1 already downloaded, skipping.
Both files for 2510.25518v1 already downloaded, skipping.
Both files for 2510.25517v1 already downloaded, skipping.
Both files for 2510.25510v1 already downloaded, skipping.
Both files for 2510.25504v1 already downloaded, skipping.
Both files for 2510.25445v1 already downloaded, skipping.
Both files for 2510.25388v1 already downloaded, skipping.
Both files for 2510.25320v1 already downloaded, skipping.
Both files for 2510.25232v1 already downloaded, skipping.
Both files for 2510.25223v1 already downloaded, skipping.
Both files for 2510.25205v1 already downloaded, skipping.
Both files for 2510.25179v1 already downloaded, skipping.
Both files for