In [4]:
import copy
import json
import os

# Parameters
last_page = 1933  # highest page number to cover
base_path = 'scraping_notebooks'  # directory that receives the generated notebooks
step = 500  # number of pages assigned to each generated notebook


def _rewrite_page_bounds(source_lines, first_page, final_page):
    """Return a new list of source lines with the page bounds replaced.

    A line whose stripped text starts with ``start_page`` or ``end_page`` is
    swapped for a fresh assignment; every other line is passed through as is.

    Args:
        source_lines: Iterable of source lines from one code cell.
        first_page: Value written into the ``start_page`` assignment.
        final_page: Value written into the ``end_page`` assignment.

    Returns:
        A new list of lines; the input is not modified.
    """
    rewritten = []
    for line in source_lines:
        stripped = line.strip()
        # Prefix matching on the stripped line tolerates leading whitespace
        # and any spacing around "=".
        if stripped.startswith('start_page'):
            print(f"Found start_page in line: {line}")
            line = f'start_page = {first_page}\n'
        elif stripped.startswith('end_page'):
            print(f"Found end_page in line: {line}")
            line = f'end_page = {final_page}\n'
        rewritten.append(line)
    return rewritten


# Ensure the output directory exists
os.makedirs(base_path, exist_ok=True)

# Read the base notebook with UTF-8 encoding
try:
    with open('./base.ipynb', 'r', encoding='utf-8') as f:
        notebook_content = json.load(f)
except FileNotFoundError:
    print("Error: base.ipynb not found")
    # The bare `exit` helper is meant for interactive sessions; raising
    # SystemExit stops the run with the same exit status everywhere.
    raise SystemExit(1) from None

# Iterate over page ranges: each pass covers pages i+1 .. min(i+step, last_page)
for i in range(0, last_page, step):
    first_page = i + 1
    final_page = min(i + step, last_page)

    # Define the output path for the CSV (for reference only; nothing is
    # written to this path here)
    csv_path = f"{base_path}/data_avito_page_{first_page}_to_{final_page}.csv"
    print(f"Generating notebook for: {csv_path}")

    # Deep-copy the template: dict.copy() is shallow, so the cells would be
    # shared with notebook_content and every pass would edit the template.
    new_notebook = copy.deepcopy(notebook_content)

    # Modify the notebook cells
    for cell in new_notebook['cells']:
        if cell['cell_type'] != 'code':
            continue

        # Debug: Print the cell content to inspect
        print(f"Inspecting code cell: {cell['source']}")

        source = cell['source']
        if isinstance(source, str):
            # The notebook format also allows a cell's source to be a single
            # string; split it into lines, rewrite, and keep it a string.
            cell['source'] = ''.join(
                _rewrite_page_bounds(
                    source.splitlines(keepends=True), first_page, final_page
                )
            )
        else:
            cell['source'] = _rewrite_page_bounds(source, first_page, final_page)

    # Define the new notebook filename
    notebook_path = f"{base_path}/data_avito_notebook_{first_page}_to_{final_page}.ipynb"

    # Save the modified notebook with UTF-8 encoding
    with open(notebook_path, 'w', encoding='utf-8') as f:
        json.dump(new_notebook, f, indent=2, ensure_ascii=False)

    print(f"Created notebook: {notebook_path}")

Generating notebook for: scraping_notebooks/data_avito_page_1_to_500.csv
Inspecting code cell: ['import pandas as pd\n', 'import requests\n', 'from bs4 import BeautifulSoup\n', 'from datetime import datetime\n', 'import os\n', '\n', "data_base_path = '../scraped_data'\n", 'os.makedirs(data_base_path, exist_ok=True)\n', '\n', 'data = []\n', 'start_page = 0\n', 'end_page = 1\n', '\n', 'page_number = start_page\n', 'max_pages = end_page\n', '\n', 'prix_min = 1\n', '\n', 'headers = {\n', "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',\n", "    'Accept-Language': 'en-US,en;q=0.9',\n", '}\n', '\n', 'equipements_possibles = [\n', "    'ABS',\n", "    'Airbags',\n", "    'CD/MP3/Bluetooth',\n", "    'Caméra de recul',\n", "    'Climatisation',\n", "    'ESP',\n", "    'Jantes aluminium',\n", "    'Limiteur de vitesse',\n", "    'Ordinateur de bord',\n", "    'Radar de recul',\n", "    'Régulateur de vites