In [1]:
# --- Block summary: notebook setup - installs Selenium and the Chromium driver, then adjusts sys.path. ---
# Selenium / browser control: install the Selenium Python package (quiet output).
!pip install selenium --quiet
# Refresh the apt package index; stdout is discarded.
!apt-get update > /dev/null
# Install the chromium-chromedriver package non-interactively; stdout is discarded.
!apt install chromium-chromedriver --yes > /dev/null
# Copy the chromedriver binary into /usr/bin.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# Imports
import sys
# NOTE(review): sys.path is Python's module search path, and this entry is the
# chromedriver path rather than a package directory - confirm it is still needed.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hW: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


cp: '/usr/lib/chromium-browser/chromedriver' and '/usr/bin/chromedriver' are the same file


In [2]:
# --- Resumen del bloque: importa librerías necesarias | automatiza navegador con Selenium | manipula datos con pandas. ---
# Importaciones
import pandas as pd
# Selenium / control del navegador
from selenium.webdriver import Chrome
# Selenium / control del navegador
from selenium.webdriver.chrome.options import Options as ChromeOptions
# Selenium / control del navegador
from selenium.webdriver.common.by import By
# Selenium / control del navegador
from selenium.common.exceptions import NoSuchElementException
from urllib.parse import urljoin
# Importaciones
from tqdm import tqdm
# Importaciones
import time

In [3]:
# --- Block summary: scraping configuration. ---
# Start URL of the catalog; main() passes it to getAllBookUrls as the first page to load.
HOMEPAGE = "http://books.toscrape.com"

In [4]:
# --- Resumen del bloque: manipula datos con pandas | itera sobre colecciones o páginas | incluye pausas para evitar bloqueos/rate limit | maneja excepciones | guarda resultados en archivos | define funciones auxiliares. ---
# Función auxiliar: getAllBookUrls
def getAllBookUrls(driver, startUrl):
    """
    Phase 1: Walk the paginated catalog and gather the URL of every book.

    Args:
        driver: Selenium WebDriver used to load and query the catalog pages.
        startUrl: URL of the first catalog page.

    Returns:
        list[str]: Absolute book URLs in the order they were encountered.
    """
    print("Phase 1: Collecting all book URLs...")
    driver.get(startUrl)

    collectedUrls = []
    hasNextPage = True

    while hasNextPage:
        # Every product card on the current page exposes its detail link in the <h3>.
        productLinks = driver.find_elements(By.CSS_SELECTOR, 'article.product_pod h3 > a')
        collectedUrls.extend(
            urljoin(driver.current_url, link.get_attribute('href'))
            for link in productLinks
        )

        try:
            # Follow the pagination link; its absence marks the last page.
            driver.find_element(By.CSS_SELECTOR, '.next > a').click()
        except NoSuchElementException:
            hasNextPage = False
        else:
            # Short pause between pages so the server is not hammered.
            time.sleep(0.5)

    print(f"{len(collectedUrls)} book URLs found.")
    return collectedUrls

# Función auxiliar: scrapeBookDetails
def scrapeBookDetails(driver, bookUrl):
    """
    Phase 2: Visit a book's page and extract all detailed information.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        bookUrl: URL of the book's detail page.

    Returns:
        dict: One record with the keys 'Name', 'Category', 'UPC', 'Product Type',
        'Price (excl. tax)', 'Price (incl. tax)', 'Tax', 'Availability',
        '# Reviews', 'Rating', 'Has Description', 'Image URL' and 'Description'.
        Product-table fields are the raw cell text, or None when the row is
        missing. 'Rating' is an int 1-5, or 0 when no known rating class is found.

    Raises:
        NoSuchElementException: If the title, breadcrumb category, gallery image
            or star-rating element cannot be found on the page.
    """
    driver.get(bookUrl)

    ratingMap = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

    def getTableData(label):
        """Return the text of the table cell beside header `label`, or None if absent."""
        try:
            return driver.find_element(By.XPATH, f'//th[text()="{label}"]/following-sibling::td').text
        except NoSuchElementException:
            return None

    name = driver.find_element(By.TAG_NAME, 'h1').text
    category = driver.find_element(By.CSS_SELECTOR, '.breadcrumb li:nth-of-type(3) a').text
    relativeImageUrl = driver.find_element(By.CSS_SELECTOR, '#product_gallery img').get_attribute('src')
    absoluteImageUrl = urljoin(bookUrl, relativeImageUrl)

    # The description paragraph is optional; record its absence instead of failing.
    try:
        description = driver.find_element(By.XPATH, '//div[@id="product_description"]/following-sibling::p').text
        hasDescription = True
    except NoSuchElementException:
        description = ""
        hasDescription = False

    # The rating is encoded as a word in the element's class list (e.g. "star-rating Three").
    # Scan every class token rather than trusting its position, and tolerate an
    # empty/missing class attribute. Iterating from the end keeps the previous
    # "last token wins" result for all inputs the old code handled.
    ratingClasses = (driver.find_element(By.CSS_SELECTOR, 'p.star-rating').get_attribute('class') or "").split()
    rating = next(
        (ratingMap[token] for token in reversed(ratingClasses) if token in ratingMap),
        0,
    )

    upc = getTableData("UPC")
    productType = getTableData("Product Type")
    priceExclTax = getTableData("Price (excl. tax)")
    priceInclTax = getTableData("Price (incl. tax)")
    tax = getTableData("Tax")
    availability = getTableData("Availability")
    numberOfReviews = getTableData("Number of reviews")

    return {
        'Name': name,
        'Category': category,
        'UPC': upc,
        'Product Type': productType,
        'Price (excl. tax)': priceExclTax,
        'Price (incl. tax)': priceInclTax,
        'Tax': tax,
        'Availability': availability,
        '# Reviews': numberOfReviews,
        'Rating': rating,
        'Has Description': hasDescription,
        'Image URL': absoluteImageUrl,
        'Description': description
    }

# Función auxiliar: main
def main():
    """
    Orchestrate the entire scraping process and save the results.

    Starts a headless Chrome session, collects every book URL from HOMEPAGE
    (phase 1), scrapes each book page (phase 2), then writes the records to
    'detailed_books.csv' and prints the first rows.

    The browser is always shut down, even if scraping raises part-way through,
    so a failed run does not leave a Chrome process behind.
    """
    browserOptions = ChromeOptions()
    browserOptions.add_argument("--headless")
    browserOptions.add_argument("--no-sandbox")
    browserOptions.add_argument("--window-size=1920,1080")

    driver = Chrome(options=browserOptions)

    try:
        # Phase 1
        allUrls = getAllBookUrls(driver, HOMEPAGE)

        # Phase 2
        print("\nPhase 2: Extracting details for each book...")
        allBookDetails = []
        for url in tqdm(allUrls, desc="Processing books"):
            details = scrapeBookDetails(driver, url)
            allBookDetails.append(details)
    finally:
        # Release the browser regardless of how the scrape ended.
        driver.quit()

    print("\nCreating DataFrame and saving to CSV...")
    # Build the frame once from the list of per-book records.
    df = pd.DataFrame(allBookDetails)
    # Persist the results; written as 'utf-8-sig', i.e. UTF-8 with a leading BOM.
    df.to_csv("detailed_books.csv", index=False, encoding='utf-8-sig')

    print("\nProcess finished. Data saved to 'detailed_books.csv'")
    print("First 5 rows of the DataFrame:")
    print(df.head())

In [5]:
# --- Block summary: runs the full scraping/processing workflow. ---
# Calls main() when this code runs as the top-level module (as it does in a notebook cell).
if __name__ == '__main__':
    main()

Phase 1: Collecting all book URLs...
1000 book URLs found.

Phase 2: Extracting details for each book...


Processing books: 100%|██████████| 1000/1000 [15:27<00:00,  1.08it/s]



Creating DataFrame and saving to CSV...

Process finished. Data saved to 'detailed_books.csv'
First 5 rows of the DataFrame:
                                    Name            Category  \
0                   A Light in the Attic              Poetry   
1                     Tipping the Velvet  Historical Fiction   
2                             Soumission             Fiction   
3                          Sharp Objects             Mystery   
4  Sapiens: A Brief History of Humankind             History   

                UPC Product Type Price (excl. tax) Price (incl. tax)    Tax  \
0  a897fe39b1053632        Books            £51.77            £51.77  £0.00   
1  90fa61229261140a        Books            £53.74            £53.74  £0.00   
2  6957f44c3847a760        Books            £50.10            £50.10  £0.00   
3  e00eb4fd7b871a48        Books            £47.82            £47.82  £0.00   
4  4165285e1663650f        Books            £54.23            £54.23  £0.00   

              