In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# Configure Selenium WebDriver
def setup_driver(silent=False):
    """Create and return a Chrome WebDriver instance.

    Args:
        silent: When truthy, Chrome is started with the ``--headless``
            argument; otherwise no extra arguments are passed.

    Returns:
        The ``webdriver.Chrome`` object built from a default ``Service``
        and the configured options.
    """
    chrome_options = Options()
    if silent:
        chrome_options.add_argument("--headless")
    return webdriver.Chrome(service=Service(), options=chrome_options)

In [4]:
async def scrape_page(page_number: int = 1):
    """Collect the statement URLs listed on one page of the problem-set table.

    Each call starts its own Chrome session through ``setup_driver`` and runs
    the blocking Selenium work in a worker thread via ``asyncio.to_thread``.

    Args:
        page_number: 1-based page to scrape. After loading the problem-set
            page, the "›" pagination button is clicked ``page_number - 1``
            times.

    Returns:
        A list of strings: the ``href`` of every link inside the
        ``datatable-container`` element, each with ``"/statement"`` appended.
    """
    next_button_xpath = (
        "//button[contains(@class, 'datatable-pagination-list-item-link') "
        "and normalize-space(text())='›']"
    )
    link_xpath = "//div[@class='datatable-container']//a[@href]"

    def blocking_scrape():
        driver = setup_driver(False)
        try:
            driver.get("https://firefly.gchan.moe/problemset")
            for _ in range(page_number - 1):
                # Look the button up again before every click so a reference
                # obtained before the previous click is never reused (it may
                # be stale if the pagination bar is re-rendered).
                driver.find_element(By.XPATH, next_button_xpath).click()
                time.sleep(0.2)  # short pause between clicks; presumably lets the table update
            time.sleep(0.3)  # final pause before reading the links
            pdfURLs = [
                link.get_attribute("href") + "/statement"
                for link in driver.find_elements(By.XPATH, link_xpath)
            ]
        finally:
            # Always shut the browser down, even when scraping fails, so no
            # Chrome process is left running.
            driver.quit()
        print(pdfURLs)
        return pdfURLs

    return await asyncio.to_thread(blocking_scrape)

In [5]:
async def get_url(first_page: int = 1, last_page: int = 10, batch_size: int = 5):
    """Scrape a range of pages in concurrent batches and flatten the results.

    Pages are processed ``batch_size`` at a time: all pages of a batch are
    scraped concurrently with ``asyncio.gather``, and the next batch starts
    only after the previous one has finished. With the default arguments this
    scrapes pages 1-5 and then pages 6-10.

    Args:
        first_page: First page number to scrape (inclusive).
        last_page: Last page number to scrape (inclusive).
        batch_size: Maximum number of pages scraped concurrently.

    Returns:
        A flat list of every link returned by ``scrape_page``, in page order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    all_links = []
    stop_page = last_page + 1  # exclusive upper bound for range()
    for batch_start in range(first_page, stop_page, batch_size):
        batch_stop = min(batch_start + batch_size, stop_page)
        batch_results = await asyncio.gather(
            *(scrape_page(page_number) for page_number in range(batch_start, batch_stop))
        )
        # gather() returns results in the order the coroutines were passed,
        # so extending here keeps the links sorted by page.
        for page_links in batch_results:
            all_links.extend(page_links)
    return all_links

In [6]:
urls = await get_url()
df = pd.DataFrame(urls, columns=["ProblemURL"])
df.to_csv("graderchan.csv", index=False)

['https://firefly.gchan.moe/problemset/c1_bkk67_2/statement', 'https://firefly.gchan.moe/problemset/c1_bkk67_3/statement', 'https://firefly.gchan.moe/problemset/c1_bkk67_4/statement', 'https://firefly.gchan.moe/problemset/c1_bkk67_5/statement', 'https://firefly.gchan.moe/problemset/c1_su63_anagram1/statement', 'https://firefly.gchan.moe/problemset/c1_su63_anagram2/statement', 'https://firefly.gchan.moe/problemset/c1_su63_arrayexp1/statement', 'https://firefly.gchan.moe/problemset/c1_su63_arrayexp2/statement', 'https://firefly.gchan.moe/problemset/c1_su63_auction/statement', 'https://firefly.gchan.moe/problemset/c1_su63_luckypair/statement', 'https://firefly.gchan.moe/problemset/c1_su63_nametitle/statement', 'https://firefly.gchan.moe/problemset/c1_su66_isbn/statement', 'https://firefly.gchan.moe/problemset/c1_su66_matrixsym/statement', 'https://firefly.gchan.moe/problemset/c1_su66_permutation/statement', 'https://firefly.gchan.moe/problemset/c1_su66_rangeandmean/statement', 'https://fi