In [None]:
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import json
import os

# Filesystem location of the ChromeDriver executable handed to Selenium's Service
CHROMEDRIVER_PATH = "/Users/chinmayanand/Documents/chromedriver-mac-arm64/chromedriver"

# Ensure both output directories are present (no error if they already exist)
for _output_dir in ("problems", "editorials"):
    os.makedirs(_output_dir, exist_ok=True)

def configure_driver():
    """Create and return a Chrome WebDriver configured for headless scraping.

    The driver is started from the executable at ``CHROMEDRIVER_PATH`` with
    the headless, no-sandbox and disable-dev-shm-usage flags plus a fixed
    desktop Chrome user-agent string.
    """
    chrome_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.6778.140 Safari/537.36",
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chrome_service = Service(CHROMEDRIVER_PATH)
    return Chrome(service=chrome_service, options=chrome_options)

def save_page_source(page_source, filename):
    """Write ``page_source`` to ``filename`` as UTF-8 text and report it.

    Any existing file at ``filename`` is overwritten. A confirmation line
    naming the file is printed once the write has completed.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        out_file.write(page_source)

    confirmation = "Saved page source to '{}'.".format(filename)
    print(confirmation)

def _extract_div_text(soup, css_class, default):
    """Return the stripped text of the first <div> with ``css_class``, or ``default``."""
    node = soup.find("div", class_=css_class)
    return default if node is None else node.text.strip()


def _resolve_output_paths(url, is_problem):
    """Return ``(folder, target_dir, json_path, text_path)`` for ``url``.

    Query strings, fragments and trailing slashes are ignored so they never
    end up in a file name. Raises ``ValueError`` when the URL has too few
    path segments to derive a name from.
    """
    path_only = url.split("#", 1)[0].split("?", 1)[0]
    segments = [part for part in path_only.split("/") if part]

    if is_problem:
        if len(segments) < 2:
            raise ValueError(f"Cannot derive problem ID and letter from URL: {url!r}")
        folder = "problems"
        problem_letter = segments[-1]  # e.g. "D"
        problem_id = segments[-2]  # e.g. "2029"
        # Contest-style URLs look like .../contest/2029/problem/D, where the
        # second-to-last segment is the literal "problem", not the ID.
        if problem_id == "problem" and len(segments) >= 3:
            problem_id = segments[-3]
        target_dir = os.path.join(folder, problem_id)
        stem = problem_letter
    else:
        if not segments:
            raise ValueError(f"Cannot derive an entry ID from URL: {url!r}")
        folder = "editorials"
        target_dir = folder
        stem = segments[-1]  # blog entry ID, e.g. "133516"

    return folder, target_dir, f"{target_dir}/{stem}.json", f"{target_dir}/{stem}.txt"


def scrape_codeforces(url):
    """Scrape a Codeforces problem or editorial page and save it to disk.

    The page is loaded in a headless Chrome instance, its raw HTML is saved
    to ``debug_page_source.html``, and the title, tags and main text are
    extracted with BeautifulSoup. URLs containing ``"problem"`` are stored
    under ``problems/<id>/<letter>.{json,txt}``; every other URL is treated
    as an editorial and stored under ``editorials/<entry_id>.{json,txt}``.

    Args:
        url: Address of the page to scrape.

    Raises:
        ValueError: If no output file name can be derived from ``url``.
    """
    max_json_chars = 25000  # cap on the content stored in the JSON file
    is_problem = "problem" in url

    # Resolve the output location first so a malformed URL fails before a
    # browser is launched.
    folder, target_dir, json_path, text_path = _resolve_output_paths(url, is_problem)

    driver = configure_driver()
    try:
        driver.get(url)
        time.sleep(10)  # Allow time for the page to load
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page failed.
        driver.quit()

    # Save page source for debugging
    save_page_source(page_source, "debug_page_source.html")

    soup = BeautifulSoup(page_source, "html.parser")

    title = _extract_div_text(soup, "title", "Title not found")
    content_class = "problem-statement" if is_problem else "ttypography"
    content = _extract_div_text(soup, content_class, "Content not found")
    # find_all returns an empty result when nothing matches, so no guard is needed.
    tags = [tag.text.strip() for tag in soup.find_all("span", class_="tag-box")]

    # Only mark the JSON content as truncated when it actually was.
    if len(content) > max_json_chars:
        json_content = content[:max_json_chars] + "..."
    else:
        json_content = content

    # Create the target directory here rather than relying on earlier setup.
    os.makedirs(target_dir, exist_ok=True)

    # Save content and metadata
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump({
            "url": url,
            "title": title,
            "tags": tags,
            "content": json_content,
        }, json_file, indent=4)
    with open(text_path, "w", encoding="utf-8") as text_file:
        text_file.write(content)

    print(f"Scraped data saved to '{folder}' folder.")
    print(f"JSON: {json_path}")
    print(f"Text: {text_path}")

# Main loop: keep prompting for URLs until the user types 'exit'
while True:
    # Strip surrounding whitespace so pasted URLs and " exit " behave as expected.
    url = input("\nEnter the Codeforces URL (or type 'exit' to quit): ").strip()
    if url.lower() == "exit":
        print("Exiting the scraper. Goodbye!")
        break
    if not url:
        # A blank line is not a URL; prompt again instead of launching the browser.
        continue
    scrape_codeforces(url)


Enter the Codeforces URL (or type 'exit' to quit): https://codeforces.com/problemset/problem/2029/A
Saved page source to 'debug_page_source.html'.
Scraped data saved to 'problems' folder.
JSON: problems/2029/A.json
Text: problems/2029/A.txt

Enter the Codeforces URL (or type 'exit' to quit): https://codeforces.com/blog/entry/133516
Saved page source to 'debug_page_source.html'.
Scraped data saved to 'editorials' folder.
JSON: editorials/133516.json
Text: editorials/133516.txt

Enter the Codeforces URL (or type 'exit' to quit): https://codeforces.com/problemset/problem/2029/B
Saved page source to 'debug_page_source.html'.
Scraped data saved to 'problems' folder.
JSON: problems/2029/B.json
Text: problems/2029/B.txt

Enter the Codeforces URL (or type 'exit' to quit): https://codeforces.com/problemset/problem/2029/C
Saved page source to 'debug_page_source.html'.
Scraped data saved to 'problems' folder.
JSON: problems/2029/C.json
Text: problems/2029/C.txt

Enter the Codeforces URL (or type