In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from datetime import datetime

BASE_URL = "https://quotes.toscrape.com/page/{}/"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
MAX_PAGES = 50
START_PAGE = 1
TIMEOUT = 10
MAX_RETRIES = 4
DELAY_MIN = 0.5
DELAY_MAX = 1.5
CSV_PATH = "quotes_dataset.csv"
CHECKPOINT_EVERY = 10

session = requests.Session()
session.headers.update(HEADERS)

def safe_get(url, max_retries=MAX_RETRIES, timeout=TIMEOUT):
    backoff = 1.0
    for attempt in range(1, max_retries + 1):
        try:
            resp = session.get(url, timeout=timeout)
            if resp.status_code == 200:
                return resp.text
            if 400 <= resp.status_code < 500:
                return None
        except requests.RequestException:
            pass
        time.sleep(backoff + random.random() * 0.5)
        backoff *= 2
    return None

def parse_quotes(html):
    soup = BeautifulSoup(html, "html.parser")
    records = []
    for q in soup.select("div.quote"):
        text_elem = q.select_one("span.text")
        author_elem = q.select_one("small.author")
        tags_elem = q.select("div.tags a.tag")
        text = text_elem.get_text(strip=True) if text_elem else ""
        author = author_elem.get_text(strip=True) if author_elem else ""
        tags = "|".join([t.get_text(strip=True) for t in tags_elem]) if tags_elem else ""
        records.append({"quote": text, "author": author, "tags": tags})
    return records

def has_next_page(html):
    soup = BeautifulSoup(html, "html.parser")
    next_btn = soup.select_one("li.next a")
    return bool(next_btn)

def save_checkpoint(records, path=CSV_PATH):
    df = pd.DataFrame(records)
    df.to_csv(path, index=False, encoding="utf-8-sig")

def main():
    all_records = []
    page = START_PAGE
    printed_html_preview = False
    start_time = datetime.utcnow()
    print("Starting scraper (English outputs)...")
    while page <= MAX_PAGES:
        url = BASE_URL.format(page)
        print(f"Fetching page {page}: {url}")
        html = safe_get(url)
        if not html:
            print(f"[!] Failed to fetch page {page}. Skipping.")
            page += 1
            time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))
            continue
        if not printed_html_preview:
            preview = re.sub(r"\s+", " ", html[:1000])
            print("\n--- HTML Preview (first 1000 chars) ---\n")
            print(preview)
            print("\n--- End of HTML Preview ---\n")
            printed_html_preview = True
        records = parse_quotes(html)
        if records:
            all_records.extend(records)
            print(f"Extracted {len(records)} quotes from page {page}. Total collected: {len(all_records)}")
        else:
            print(f"No quotes found on page {page}.")
        if page % CHECKPOINT_EVERY == 0:
            save_checkpoint(all_records)
            print(f"Checkpoint saved at page {page} ({len(all_records)} total).")
        if not has_next_page(html):
            print("No next page detected. Stopping.") #فضت الصفحات
            break
        page += 1
        time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))
    save_checkpoint(all_records)
    end_time = datetime.utcnow()
    duration = (end_time - start_time).total_seconds()
    print(f"\n Final for enregister to '{CSV_PATH}' completed.")
    print(f"Total quotes collected: {len(all_records)}")
    print(f"Duration (seconds): {duration:.2f}")
    if len(all_records) > 0:
        df = pd.DataFrame(all_records)
        print("\nPreview of first 10 rows:")
        print(df.head(10).to_string(index=False))
    else:
        print("No data collected.")
if __name__ == "__main__":
    main()


  start_time = datetime.utcnow()


Starting scraper (English outputs)...
Fetching page 1: https://quotes.toscrape.com/page/1/

--- HTML Preview (first 1000 chars) ---

<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <title>Quotes to Scrape</title> <link rel="stylesheet" href="/static/bootstrap.min.css"> <link rel="stylesheet" href="/static/main.css"> </head> <body> <div class="container"> <div class="row header-box"> <div class="col-md-8"> <h1> <a href="/" style="text-decoration: none">Quotes to Scrape</a> </h1> </div> <div class="col-md-4"> <p> <a href="/login">Login</a> </p> </div> </div> <div class="row"> <div class="col-md-8"> <div class="quote" itemscope itemtype="http://schema.org/CreativeWork"> <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span> <span>by <small class="auth

--- End of HTML Preview ---

Extracted 10 quotes from page 1. Total collected: 10
Fetching page 2: https://quotes.toscrape

  end_time = datetime.utcnow()
