In [2]:
import os
import json
import time
import requests
import re
from bs4 import BeautifulSoup
!pip install selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import shutil

class CodeforcesScraper:
    def __init__(self):
        self.base_url = "https://codeforces.com"
        self.setup_project_structure()
        self.setup_selenium()

    def setup_project_structure(self):
        # Define project structure
        self.project_dirs = {
            'root': os.getcwd(),
            'data': 'data',
            'problems': 'data/problems',
            'editorials': 'data/editorials',
            'metadata': 'data/metadata',
            'docs': 'docs',
            'samples': 'data/samples'
        }

        # Create directories
        for dir_path in self.project_dirs.values():
            os.makedirs(dir_path, exist_ok=True)

        # Create documentation
        self.create_documentation()

        # Create config file
        self.create_config()

    def create_documentation(self):
        readme_content = """# Codeforces Problem Scraper

## Overview
This tool scrapes problem statements and editorials from Codeforces, preserving LaTeX formatting and code blocks.

## Features
- Extracts problem statements with preserved LaTeX formatting
- Captures test cases and sample inputs/outputs
- Preserves code blocks with proper formatting
- Stores metadata in JSON format
- Includes editorial content with proper section handling

## Project Structure
```
project/
├── data/
│   ├── problems/
│   ├── editorials/
│   ├── metadata/
│   └── samples/
└── docs/
```

## Usage
Run `main.py` to start the scraper:
```bash
python main.py
```

## Configuration
Settings can be modified in `config.json`
"""
        with open(os.path.join(self.project_dirs['docs'], 'README.md'), 'w', encoding='utf-8') as f:
            f.write(readme_content)

    def create_config(self):
        config = {
            'rate_limit_delay': 2,
            'max_problems': 10,
            'paths': self.project_dirs,
            'selenium': {
                'headless': True,
                'timeout': 10
            }
        }
        with open('config.json', 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=4)

    def setup_selenium(self):
        """Launch Chrome with a fixed set of flags and keep it on ``self.driver``."""
        # Flags are applied in this order, exactly as listed.
        launch_flags = (
            "--headless",
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--log-level=3",
        )

        browser_options = Options()
        for flag in launch_flags:
            browser_options.add_argument(flag)

        self.driver = webdriver.Chrome(options=browser_options)

    def extract_latex(self, element):
        """Preserve LaTeX formatting in text."""
        if not element:
            return ""

        # Convert element to string while preserving LaTeX
        content = str(element)

        # Preserve inline LaTeX
        content = re.sub(r'\$([^$]+)\$', r'$\1$', content)

        # Preserve block LaTeX
        content = re.sub(r'\$\$([^$]+)\$\$', r'$$\1$$', content)

        # Convert to plain text while keeping LaTeX
        soup = BeautifulSoup(content, 'html.parser')
        return soup.get_text(separator='\n')

    def extract_code_blocks(self, element):
        """Extract and format code blocks."""
        code_blocks = []
        if element:
            for pre in element.find_all('pre'):
                code_blocks.append(pre.get_text())
        return code_blocks

    def get_test_cases(self, problem_statement):
        """Extract test cases from problem statement."""
        test_cases = []
        if problem_statement:
            input_specs = problem_statement.find_all('div', class_='input')
            output_specs = problem_statement.find_all('div', class_='output')

            for input_spec, output_spec in zip(input_specs, output_specs):
                test_case = {
                    'input': input_spec.find('pre').get_text() if input_spec.find('pre') else "",
                    'output': output_spec.find('pre').get_text() if output_spec.find('pre') else ""
                }
                test_cases.append(test_case)

        return test_cases

    def scrape_problem(self, problem_url):
        """Load a problem page in the browser and extract its parts.

        Args:
            problem_url: URL of the problem page to open.

        Returns:
            A dict with the keys ``title``, ``time_limit``, ``memory_limit``,
            ``content``, ``code_blocks``, ``test_cases``, ``tags`` and
            ``url``; or ``None`` when the ``problem-statement`` element does
            not appear within 10 seconds or is absent from the parsed page.
            Missing header fields yield empty strings rather than an error.
        """
        self.driver.get(problem_url)
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "problem-statement"))
            )
        except TimeoutException:
            return None

        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        problem_statement = soup.find("div", class_="problem-statement")

        if not problem_statement:
            return None

        header = problem_statement.find('div', class_='header')

        def header_text(css_class):
            """Stripped text of a header child div; "" if header or child is absent."""
            node = header.find('div', class_=css_class) if header else None
            return node.text.strip() if node else ""

        title = header_text('title')
        time_limit = header_text('time-limit')
        memory_limit = header_text('memory-limit')

        # The statement body is the first <div> sibling after the header.
        # Reuse the header found above and tolerate its absence; the previous
        # code looked it up again and dereferenced it unconditionally, which
        # raised AttributeError on pages without a header.
        content_div = header.find_next_sibling('div') if header else None
        content = self.extract_latex(content_div)

        # Get code blocks
        code_blocks = self.extract_code_blocks(problem_statement)

        # Get test cases
        test_cases = self.get_test_cases(problem_statement)

        # Get tags
        tags_div = soup.find('div', class_='tag-box')
        if tags_div:
            tags = [tag.text.strip() for tag in tags_div.find_all('span', class_='tag')]
        else:
            # NOTE(review): tags appear to be rendered as <span class="tag-box">
            # elements on current pages — confirm. This fallback only runs
            # when the original <div class="tag-box"> lookup finds nothing.
            tags = [tag.text.strip() for tag in soup.find_all('span', class_='tag-box')]

        return {
            'title': title,
            'time_limit': time_limit,
            'memory_limit': memory_limit,
            'content': content,
            'code_blocks': code_blocks,
            'test_cases': test_cases,
            'tags': tags,
            'url': problem_url
        }

    def scrape_editorial(self, problem_url):
        editorial_url = problem_url.replace("/problem/", "/tutorial/")
        try:
            self.driver.get(editorial_url)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "pageContent"))
            )

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            tutorial = soup.find('div', class_='ttypography')

            if not tutorial:
                return None

            # Extract content with preserved LaTeX
            content = self.extract_latex(tutorial)

            # Extract code blocks
            code_blocks = self.extract_code_blocks(tutorial)

            # Extract section headers
            sections = []
            for header in tutorial.find_all(['h1', 'h2', 'h3']):
                sections.append({
                    'level': int(header.name[1]),
                    'title': header.text.strip()
                })

            return {
                'content': content,
                'code_blocks': code_blocks,
                'sections': sections,
                'url': editorial_url
            }
        except:
            return None

    def save_problem(self, problem_data, problem_number):
        # Save problem statement
        problem_file = os.path.join(self.project_dirs['problems'], f'problem_{problem_number}.txt')
        with open(problem_file, 'w', encoding='utf-8') as f:
            f.write(f"Title: {problem_data['title']}\n")
            f.write(f"Time Limit: {problem_data['time_limit']}\n")
            f.write(f"Memory Limit: {problem_data['memory_limit']}\n\n")
            f.write("Problem Statement:\n")
            f.write(problem_data['content'])

            if problem_data['code_blocks']:
                f.write("\n\nCode Blocks:\n")
                for i, block in enumerate(problem_data['code_blocks'], 1):
                    f.write(f"\nBlock {i}:\n{block}\n")

        # Save test cases
        if problem_data['test_cases']:
            test_case_file = os.path.join(self.project_dirs['samples'], f'test_cases_{problem_number}.json')
            with open(test_case_file, 'w', encoding='utf-8') as f:
                json.dump(problem_data['test_cases'], f, indent=2)

        # Save metadata
        metadata_file = os.path.join(self.project_dirs['metadata'], f'problem_{problem_number}_metadata.json')
        metadata = {
            'title': problem_data['title'],
            'tags': problem_data['tags'],
            'url': problem_data['url'],
            'time_limit': problem_data['time_limit'],
            'memory_limit': problem_data['memory_limit']
        }
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

    def save_editorial(self, editorial_data, problem_number):
        if editorial_data:
            editorial_file = os.path.join(self.project_dirs['editorials'], f'editorial_{problem_number}.txt')
            with open(editorial_file, 'w', encoding='utf-8') as f:
                # Write sections
                if editorial_data['sections']:
                    f.write("Sections:\n")
                    for section in editorial_data['sections']:
                        f.write(f"{'#' * section['level']} {section['title']}\n")
                    f.write("\n")

                # Write main content
                f.write("Content:\n")
                f.write(editorial_data['content'])

                # Write code blocks
                if editorial_data['code_blocks']:
                    f.write("\n\nCode Blocks:\n")
                    for i, block in enumerate(editorial_data['code_blocks'], 1):
                        f.write(f"\nBlock {i}:\n{block}\n")

    def get_problem_list(self, num_problems=10):
        """Fetch the problem-set page and return up to *num_problems* problem URLs.

        Rows of ``table.problems`` after the first (header) row are scanned in
        order; the link in each row's second cell is joined onto
        ``self.base_url``. Rows without a usable link are skipped and do not
        count towards the limit.

        Args:
            num_problems: Maximum number of URLs to return. Values below 1
                yield an empty list.

        Returns:
            A list of absolute problem URLs; empty when the page could not be
            fetched successfully or contains no matching rows.

        Raises:
            requests.RequestException: On connection errors or when the
                server does not answer within the timeout.
        """
        if num_problems <= 0:
            return []

        url = f"{self.base_url}/problemset"
        # Bound the wait: without a timeout a stalled connection blocks forever.
        response = requests.get(url, timeout=30)
        if not response.ok:
            # Report the failure instead of silently parsing an error page.
            print(f"Could not fetch problem list (HTTP {response.status_code}): {url}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        problems = []
        for row in soup.select("table.problems tr")[1:]:
            problem_cells = row.select("td")
            if len(problem_cells) < 2:
                continue
            problem_link = problem_cells[1].find("a")
            problem_url = problem_link.get("href") if problem_link else None
            if not problem_url:
                continue
            problems.append(self.base_url + problem_url)
            if len(problems) >= num_problems:
                break

        return problems

    def run(self, num_problems=10):
        with open('config.json', 'r') as f:
            config = json.load(f)

        problems = self.get_problem_list(num_problems)

        for i, problem_url in enumerate(problems, 1):
            print(f"Scraping problem {i}/{len(problems)}: {problem_url}")

            problem_data = self.scrape_problem(problem_url)
            if problem_data:
                self.save_problem(problem_data, i)

                editorial_data = self.scrape_editorial(problem_url)
                if editorial_data:
                    self.save_editorial(editorial_data, i)

            time.sleep(config['rate_limit_delay'])

    def cleanup(self):
        self.driver.quit()

def main():
    """Scrape ten problems, always releasing the browser afterwards."""
    cf_scraper = CodeforcesScraper()
    try:
        cf_scraper.run(num_problems=10)
    finally:
        # Runs whether or not the scrape raised.
        cf_scraper.cleanup()

# Entry point: start the scraper only when this code runs as the main module.
if __name__ == "__main__":
    main()

Collecting selenium
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.28.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.28.0-py3-none-any.whl (486 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.3/486.3 kB[0m [31m30.4 MB/s