In [1]:
import bs4 as BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
import time
from typing import Optional, List, Dict, Any, Tuple

In [24]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from typing import List, Optional
import time

class ArxivScraper:
    def __init__(self):
        options = Options()
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--ignore-certificate-errors")
        # options.add_argument("--headless")  # Uncomment to run in headless mode

        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        self.paper_id_list : List[str] = []
        self.author_list : List[Dict] = []

    def pagination(self) -> Optional[bool]:
        print("     [INFO] Pagination enabled!")
        next_page_locator = (By.XPATH, '//*[@id="main-container"]/div[2]/nav[2]/a[2]')
        try:
            next_page = WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located(next_page_locator))
            actions = ActionChains(self.driver)
            actions.click(next_page).perform()
            return True
        except Exception as e:
            return None

    def connect_to_arxiv(self):
        print("\n📍 Step 1: Connecting to Arxiv!")
        url = "https://arxiv.org/"
        keyword = "photonic circuits"
        try:
            self.driver.get(url)
            time.sleep(2)
            print("     [INFO] Arxiv connection successful!")
        except Exception as e:
            print(f"An error occurred: {e}")

        try:
            search_box = self.driver.find_element(By.XPATH, '//*[@id="header"]/div[2]/form/div/div[1]/input')
            search_box.send_keys(keyword)
            search_box.send_keys(Keys.RETURN)
            time.sleep(2)
            print("     [INFO] Searching keywords!")
        except Exception as e:
            print(f"An error occurred: {e}")

    def get_paper_ids(self, pagination: bool = True) -> List[str]:
        print("\n📍 Step 2: Scraping paper links!")
        try:
            paper_entries = self.driver.find_elements(By.XPATH, '//*[@id="main-container"]/div[2]/ol/li')
            for paper in paper_entries:
                paper_link = paper.find_element(By.XPATH, './/p[@class="list-title is-inline-block"]/a').get_attribute("href")
                paper_id = paper_link.split("/")[-1]
                self.paper_id_list.append(paper_id)

            if pagination is True:
                self.get_paper_ids()
                print("     [INFO] Page found rotating the next page!")
            else:
                print("     [INFO] No more pages to scrape!")

            print("     [INFO] Paper links scraped successfully!")
            return self.paper_id_list
        except Exception as e:
            print(f"An error occurred: {e}")
            return []

    def get_author_details(self) -> Optional[List[Dict]]:
        print("\n📍 Step 3: Scraping author details!")

        if not self.paper_id_list:
            print("     [INFO] No paper links found!")
            return None

        paper_id = next(iter(self.paper_id_list))
        api = f"https://api.semanticscholar.org/v1/paper/arXiv:{paper_id}?include_unknown_references=true"
        self.author_list = []
        try:
            response = requests.get(api)
            data = response.json()
            authors = data["authors"]
            for author in authors:
                self.author_list.append({"Author": author["name"], "ID": author["authorId"]})
            print("     [INFO] Author details scraped successfully!")
            return self.author_list
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    # TODO: Handling with pagination
    def get_releated_papers(self, pagination: bool = True) -> Optional[List[Dict]]:
        print("\n📍 Step 4: Connecting to Scholar and scraping related papers!")
        if not self.author_list:
            print("     [INFO] No author details found!")
            return None

        author = next(iter(self.author_list))
        url = "https://www.semanticscholar.org/author/Iason-Gabriel/2343751686?sort=pub-date"
        try:
            self.driver.get(url)
            time.sleep(2)
            print("     [INFO] Scholar connection successful!")
        except Exception as e:
            print(f"An error occurred: {e}")

        try:
            paper_page = self.driver.find_element(By.XPATH,
            '//*[@id="app"]/div[1]/div[2]/div/main/div[2]/div/div/div/div[2]/div/div[2]/div/div/div[1]/a')
            paper_date = self.driver.find_element(By.CSS_SELECTOR, '.cl-paper-pubdates span span')

            arxiv_link = paper_page.get_attribute("href")
            print(arxiv_link)
            print(paper_date.text)
            for details in self.author_list:
                details["LastPaperLink"] = arxiv_link
                details["LastPaperDate"] = paper_date.text
                details["AmountOfMentions"] = 1

            # if pagination:
            #     self.get_releated_papers()
            #     print("     [INFO] Page found rotating the next page!")
            # else:
            #     print("     [INFO] No more pages to scrape!")

            print("     [INFO] Related papers scraped successfully!")
            return self.author_list
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def _save_excel(self):
        pass

if __name__ == "__main__":
    scraper = ArxivScraper()
    scraper.connect_to_arxiv()
    scraper.get_paper_ids(pagination=False)
    scraper.get_author_details()
    scraper.get_releated_papers(pagination=False)
    scraper.driver.quit()


📍 Step 1: Connecting to Arxiv!
     [INFO] Arxiv connection successful!
     [INFO] Searching keywords!

📍 Step 2: Scraping paper links!
     [INFO] No more pages to scrape!
     [INFO] Paper links scraped successfully!

📍 Step 3: Scraping author details!
     [INFO] Author details scraped successfully!

📍 Step 4: Connecting to Scholar and scraping related papers!
     [INFO] Scholar connection successful!
https://arxiv.org/pdf/2502.02528.pdf
4 February 2025
     [INFO] Related papers scraped successfully!
