In [63]:
import time, random, re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE = "https://books.toscrape.com/"

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Connection": "keep-alive",
})

def sleep():
    time.sleep(random.uniform(0.5, 1.2))

def get_soup(url):
    r = session.get(url, timeout=20)
    r.raise_for_status()
    return BeautifulSoup(r.text, "lxml")

RATING_MAP = {"One":1, "Two":2, "Three":3, "Four":4, "Five":5}

def parse_book_detail(url):
    soup = get_soup(url)
    main = soup.select_one("div.product_main")
    title = main.select_one("h1").get_text(strip=True)
    price_text = main.select_one("p.price_color").get_text(strip=True)
    price = float(re.sub(r"[^\d.]", "", price_text))

    avail_text = soup.select_one("p.instock.availability").get_text(" ", strip=True)
    m = re.search(r"(\d+)", avail_text)
    stock = int(m.group(1)) if m else None

    rating_tag = main.select_one("p.star-rating")
    rating = None
    if rating_tag:
        for cls in rating_tag.get("class", []):
            if cls in RATING_MAP:
                rating = RATING_MAP[cls]
                break

    desc = ""
    desc_anchor = soup.select_one("#product_description")
    if desc_anchor:
        p = desc_anchor.find_next("p")
        if p:
            desc = p.get_text(" ", strip=True)

    cat = soup.select_one("ul.breadcrumb li:nth-of-type(3) a")
    category = cat.get_text(strip=True) if cat else None

    img = soup.select_one("#product_gallery img")
    image_url = urljoin(url, img["src"]) if img and img.has_attr("src") else None

    # product info table (UPC, tax, etc.)
    info = {}
    for tr in soup.select("table.table.table-striped tr"):
        th = tr.find("th")
        td = tr.find("td")
        if th and td:
            info[th.get_text(strip=True)] = td.get_text(strip=True)

    return {
        "title": title,
        "price": price,
        "stock": stock,
        "rating": rating,
        "category": category,
        "description": desc,
        "image_url": image_url,
        "product_page": url,
        "upc": info.get("UPC"),
        "product_type": info.get("Product Type"),
        "price_excl_tax": info.get("Price (excl. tax)"),
        "price_incl_tax": info.get("Price (incl. tax)"),
        "tax": info.get("Tax"),
        "availability_text": avail_text
    }

def parse_list_page(url):
    soup = get_soup(url)
    cards = soup.select("article.product_pod")
    results = []
    for card in cards:
        a = card.select_one("h3 a")
        if not a:
            continue
        detail_url = urljoin(url, a["href"])
        try:
            sleep()
            data = parse_book_detail(detail_url)
            results.append(data)
        except Exception as e:
            print("failed:", detail_url, e)
    # find next page
    next_a = soup.select_one("li.next a")
    next_url = urljoin(url, next_a["href"]) if next_a else None
    return results, next_url

def scrape_all():
    url = BASE
    all_rows = []
    seen_pages = set()
    while url and url not in seen_pages:
        print("page:", url)
        seen_pages.add(url)
        rows, url = parse_list_page(url)
        all_rows.extend(rows)
        sleep()
    return all_rows




In [56]:
!pip install requests beautifulsoup4 lxml pandas




In [62]:
rows = scrape_all()
df = pd.DataFrame(rows)
df.head()

page: https://books.toscrape.com/
page: https://books.toscrape.com/catalogue/page-2.html
page: https://books.toscrape.com/catalogue/page-3.html
page: https://books.toscrape.com/catalogue/page-4.html
page: https://books.toscrape.com/catalogue/page-5.html
page: https://books.toscrape.com/catalogue/page-6.html
page: https://books.toscrape.com/catalogue/page-7.html
page: https://books.toscrape.com/catalogue/page-8.html
page: https://books.toscrape.com/catalogue/page-9.html
page: https://books.toscrape.com/catalogue/page-10.html
page: https://books.toscrape.com/catalogue/page-11.html
page: https://books.toscrape.com/catalogue/page-12.html
page: https://books.toscrape.com/catalogue/page-13.html
page: https://books.toscrape.com/catalogue/page-14.html
page: https://books.toscrape.com/catalogue/page-15.html
page: https://books.toscrape.com/catalogue/page-16.html
page: https://books.toscrape.com/catalogue/page-17.html
page: https://books.toscrape.com/catalogue/page-18.html
page: https://books.to

Unnamed: 0,title,price,stock,rating,category,description,image_url,product_page,upc,product_type,price_excl_tax,price_incl_tax,tax,availability_text
0,A Light in the Attic,51.77,22,3,Poetry,It's hard to imagine a world without A Light i...,https://books.toscrape.com/media/cache/fe/72/f...,https://books.toscrape.com/catalogue/a-light-i...,a897fe39b1053632,Books,Â£51.77,Â£51.77,Â£0.00,In stock (22 available)
1,Tipping the Velvet,53.74,20,1,Historical Fiction,"""Erotic and absorbing...Written with starling ...",https://books.toscrape.com/media/cache/08/e9/0...,https://books.toscrape.com/catalogue/tipping-t...,90fa61229261140a,Books,Â£53.74,Â£53.74,Â£0.00,In stock (20 available)
2,Soumission,50.1,20,1,Fiction,"Dans une France assez proche de la nÃ´tre, un ...",https://books.toscrape.com/media/cache/ee/cf/e...,https://books.toscrape.com/catalogue/soumissio...,6957f44c3847a760,Books,Â£50.10,Â£50.10,Â£0.00,In stock (20 available)
3,Sharp Objects,47.82,20,4,Mystery,"WICKED above her hipbone, GIRL across her hear...",https://books.toscrape.com/media/cache/c0/59/c...,https://books.toscrape.com/catalogue/sharp-obj...,e00eb4fd7b871a48,Books,Â£47.82,Â£47.82,Â£0.00,In stock (20 available)
4,Sapiens: A Brief History of Humankind,54.23,20,5,History,From a renowned historian comes a groundbreaki...,https://books.toscrape.com/media/cache/ce/5f/c...,https://books.toscrape.com/catalogue/sapiens-a...,4165285e1663650f,Books,Â£54.23,Â£54.23,Â£0.00,In stock (20 available)


In [64]:
df.to_csv("books_fast.csv", index=False)
print(df.shape) 


(1000, 14)
