In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = "https://books.toscrape.com/catalogue/"

def parse_book_details(book_url):
    response = requests.get(book_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    upc = soup.find('table', class_='table').find_all('tr')[0].find_all('td')[0].text
    return upc
    
def parse_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    books = []
    for book in soup.find_all('article', class_='product_pod'):
        book_name = book.h3.a['title']
        price = book.find('p', class_='price_color').text[1:]
        book_url = BASE_URL + book.h3.a['href']
        
        book_upc = parse_book_details(book_url)
        books.append({
            'id': book_upc,
            'book_name': book_name,
            'price': price,
            'url': book_url
        })
        return books
        
def scrape_books(max_books=1000):
    all_books = []
    page = 1
    while len(all_books) < max_books:
        url = f"https://books.toscrape.com/catalogue/page-{page}.html"
        books = parse_page(url)
        if not books:
            break
        all_books.extend(books)
        page += 1
    return all_books[:max_books]
    
books_data = scrape_books()

df = pd.DataFrame(books_data)

df.to_csv('books.csv', index=False)

print("Файл books.csv создан!")

Файл books.csv создан!


In [12]:
df.head(300)

Unnamed: 0,id,book_name,price,url
0,a897fe39b1053632,A Light in the Attic,51.77,https://books.toscrape.com/catalogue/a-light-i...
1,23356462d1320d61,In Her Wake,12.84,https://books.toscrape.com/catalogue/in-her-wa...
2,b4fd5943413e089a,Slow States of Collapse: Poems,57.31,https://books.toscrape.com/catalogue/slow-stat...
3,7d385d34d12e60ff,The Nameless City (The Nameless City #1),38.16,https://books.toscrape.com/catalogue/the-namel...
4,0fa6dceead7ce47a,"Princess Jellyfish 2-in-1 Omnibus, Vol. 01 (Pr...",13.61,https://books.toscrape.com/catalogue/princess-...
5,e4f74c16de34d440,Immunity: How Elie Metchnikoff Changed the Cou...,57.36,https://books.toscrape.com/catalogue/immunity-...
6,38d45839cb1c83c1,Algorithms to Live By: The Computer Science of...,30.81,https://books.toscrape.com/catalogue/algorithm...
7,d6361d16212664ed,The Shadow Hero (The Shadow Hero),33.14,https://books.toscrape.com/catalogue/the-shado...
8,efc3768127714ec3,The Bridge to Consciousness: I'm Writing the B...,32.0,https://books.toscrape.com/catalogue/the-bridg...
9,caf4fe9311f1dc59,Modern Romance,28.26,https://books.toscrape.com/catalogue/modern-ro...
