In [1]:
import json
import os
import time
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup, Comment

# Make sure both output folders exist before any scraping starts;
# an already-existing folder is left untouched.
for output_dir in ("problems", "metadata"):
    os.makedirs(output_dir, exist_ok=True)

# Seconds to wait on the network before giving up on a request.
_REQUEST_TIMEOUT = 30

# Tags treated as block boundaries inside a problem statement.
_BLOCK_TAGS = ["div", "p", "pre"]


def _collapse_whitespace(text):
    """Return *text* with every run of whitespace reduced to a single space."""
    return " ".join(text.split())


def _pre_text(pre):
    """Return the text of a ``<pre>`` element, keeping one line per row.

    When the element's direct children include ``<div>`` wrappers (presumably
    one per sample line -- confirm against the live markup), the wrappers'
    texts are joined with newlines; otherwise the raw text is returned as is.
    """
    line_divs = pre.find_all("div", recursive=False)
    if line_divs:
        return "\n".join(line.get_text().rstrip("\r\n") for line in line_divs)
    return pre.get_text()


def _append_statement_text(node, chunks):
    """Append the text found under *node* to *chunks*, in document order.

    Every piece of text is emitted exactly once: a container that holds
    further block tags is descended into instead of being dumped as a whole,
    which is what previously duplicated the statement.
    """
    for child in node.children:
        if isinstance(child, Comment):
            continue  # HTML comments are not part of the statement
        if isinstance(child, str):
            # Bare text node sitting directly inside a container.
            text = _collapse_whitespace(child)
        elif child.name == "div" and "header" in child.get("class", []):
            # The header is kept on a line of its own, fields space-separated.
            chunks.append(f"\n\n{child.get_text(' ', strip=True)}\n")
            continue
        elif child.name == "pre":
            chunks.append(f"\n{_pre_text(child)}\n")
            continue
        elif child.find(_BLOCK_TAGS) is not None:
            _append_statement_text(child, chunks)
            continue
        else:
            # Leaf block: normalise whitespace without gluing inline pieces.
            text = _collapse_whitespace(child.get_text())
        if text:
            chunks.append(f"{text} ")


def _limit_after_label(soup, label_text):
    """Return the text that follows the ``<div>`` labelled *label_text*.

    The value may be a bare text node or another tag, so all following
    siblings are inspected (``find_next_sibling()`` would skip text nodes).
    Returns ``"Unknown"`` when the label or its value is missing.
    """
    label = soup.find("div", string=label_text)
    if label is None:
        return "Unknown"
    pieces = []
    for sibling in label.next_siblings:
        if isinstance(sibling, Comment):
            continue
        raw = str(sibling) if isinstance(sibling, str) else sibling.get_text()
        raw = _collapse_whitespace(raw)
        if raw:
            pieces.append(raw)
    return " ".join(pieces) or "Unknown"


def scrape_problem(contest_id, problem_id):
    """Download one Codeforces problem and save its statement and metadata.

    Writes ``problems/{contest_id}_{problem_id}.txt`` and
    ``metadata/{contest_id}_{problem_id}.json``; both directories must
    already exist.

    Args:
        contest_id: Contest identifier used in the URL and the file names.
        problem_id: Problem label within the contest (e.g. ``"A"``).

    Returns:
        True when the problem was scraped and saved. False when no problem
        title was found or when any HTTP, network or other error occurred;
        errors are printed, never raised.
    """
    url = f"https://codeforces.com/contest/{contest_id}/problem/{problem_id}"
    print(f"Scraping: {url}")

    req = Request(
        url=url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
    )

    try:
        # The context manager closes the connection; the timeout keeps a
        # stalled server from hanging the whole run.
        with urlopen(req, timeout=_REQUEST_TIMEOUT) as response:
            page_content = response.read().decode("utf-8")
        soup = BeautifulSoup(page_content, "html.parser")

        # Extract problem title
        title_div = soup.find("div", class_="title")
        problem_title = title_div.get_text(strip=True) if title_div else None

        # If no problem title is found, consider the problem does not exist
        if not problem_title:
            print(f"No problem found for {problem_id}. Stopping.")
            return False  # Stop processing further problem IDs

        # Extract problem statement
        statement_div = soup.find("div", class_="problem-statement")
        chunks = []
        if statement_div:
            _append_statement_text(statement_div, chunks)

        # Remove the "$$$" math delimiters entirely (they are not converted
        # to a single "$").
        problem_statement = "".join(chunks).replace("$$$", "")

        # Extract metadata
        tags = [tag.get_text(strip=True) for tag in soup.find_all("span", class_="tag-box")]
        time_limit = _limit_after_label(soup, "time limit per test")
        memory_limit = _limit_after_label(soup, "memory limit per test")

        # Save problem statement
        with open(f"problems/{contest_id}_{problem_id}.txt", "w", encoding="utf-8") as f:
            f.write(f"Problem Title: {problem_title}\n")
            f.write(f"Time Limit: {time_limit}\n")
            f.write(f"Memory Limit: {memory_limit}\n")
            f.write(f"Tags: {', '.join(tags)}\n")
            f.write("\nProblem Statement:\n")
            f.write(problem_statement.strip())

        # Save metadata
        metadata = {
            "contest_id": contest_id,
            "problem_id": problem_id,
            "title": problem_title,
            "tags": tags,
            "time_limit": time_limit,
            "memory_limit": memory_limit,
        }
        with open(f"metadata/{contest_id}_{problem_id}.json", "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=4, ensure_ascii=False)

        print(f"Successfully scraped: {problem_title}")
        return True  # Problem successfully scraped

    except HTTPError as e:
        if e.code == 404:
            print(f"HTTP Error 404: Problem {problem_id} does not exist. Stopping.")
            return False  # Stop processing further problem IDs
        print(f"HTTP Error {e.code}: {url}")
        return False
    except URLError as e:
        print(f"URL Error: {e.reason}")
        return False
    except Exception as e:
        # Last-resort boundary: report parse/IO problems instead of crashing
        # the contest loop.
        print(f"An unexpected error occurred: {e}")
        return False

def scrape_contest(contest_id, problem_ids="ABCDEFG", delay=2):
    """Scrape the problems of one contest in order, stopping at the first failure.

    Args:
        contest_id: Contest identifier passed through to ``scrape_problem``.
        problem_ids: Iterable of problem labels to try, in order. The default
            covers A-G; pass e.g. ``["A", "B", "C1", "C2"]`` for contests
            that use other labels.
        delay: Seconds to wait between consecutive requests.

    Scraping stops as soon as ``scrape_problem`` returns False, whether the
    problem is missing or the request failed for another reason.
    """
    for index, problem_id in enumerate(problem_ids):
        if index:
            # Pause between requests only; no wait is needed after the last one.
            time.sleep(delay)
        if not scrape_problem(contest_id, problem_id):
            break  # Stop further scraping if no more problems are found

if __name__ == "__main__":
    contest_id = input("Enter a contest ID (e.g., 2043): ").strip()

    # The ID is interpolated into the request URL and into the output file
    # names, so anything other than plain ASCII digits is rejected up front.
    if contest_id and all(ch in "0123456789" for ch in contest_id):
        print(f"\nStarting to scrape contest: {contest_id}")
        scrape_contest(contest_id)
    else:
        print(f"Invalid contest ID: {contest_id!r} (expected digits only).")


Enter a contest ID (e.g., 2043): 2043

Starting to scrape contest: 2043
Scraping: https://codeforces.com/contest/2043/problem/A
Successfully scraped: A. Coin Transformation
Scraping: https://codeforces.com/contest/2043/problem/B
Successfully scraped: B. Digits
Scraping: https://codeforces.com/contest/2043/problem/C
Successfully scraped: C. Sums on Segments
Scraping: https://codeforces.com/contest/2043/problem/D
Successfully scraped: D. Problem about GCD
Scraping: https://codeforces.com/contest/2043/problem/E
Successfully scraped: E. Matrix Transformation
Scraping: https://codeforces.com/contest/2043/problem/F
Successfully scraped: F. Nim
Scraping: https://codeforces.com/contest/2043/problem/G
Successfully scraped: G. Problem with Queries
