In [None]:
# Environment setup cell: installs Selenium and a Chromium driver through
# shell escapes (`!`), so it only works where pip and apt are available.
# Install the Selenium Python bindings; --quiet trims pip's output.
!pip install selenium --quiet
# Refresh the apt package index; stdout is discarded to keep the cell quiet.
!apt-get update > /dev/null
# Install the chromium-chromedriver package without prompting for confirmation.
!apt install chromium-chromedriver --yes > /dev/null
# Copy the chromedriver binary into /usr/bin.
# NOTE(review): presumably so it is found on PATH — confirm that the source
# path exists on the target image.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
# Prepends the chromedriver path to Python's module search path.
# NOTE(review): sys.path controls module imports, not executable lookup, so
# this line likely has no effect on finding the driver — confirm before relying on it.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/9.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/9.6 MB[0m [31m51.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m6.8/9.6 MB[0m [31m93.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m9.6/9.6 MB[0m [31m103.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hW: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' 

In [None]:
import pandas as pd
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from urllib.parse import urljoin
from tqdm import tqdm
import time

In [None]:
# Start page of the crawl: main() passes this URL to getAllBookUrls().
HOMEPAGE = "http://books.toscrape.com"

In [None]:
def getAllBookUrls(driver, startUrl):
    """
    Phase 1: walk the paginated catalog and gather the URL of every book.

    Opens ``startUrl`` with ``driver``, reads the link of each book listed on
    the current page, then clicks the "next" link and repeats until no
    "next" link is found.

    Args:
        driver: Selenium-style driver exposing ``get``, ``current_url``,
            ``find_elements`` and ``find_element``.
        startUrl: URL of the first catalog page.

    Returns:
        list[str]: book URLs in the order they were encountered, each one
        resolved against the URL of the page it was found on.
    """
    print("Phase 1: Collecting all book URLs...")
    driver.get(startUrl)

    bookLinkSelector = 'article.product_pod h3 > a'
    nextLinkSelector = '.next > a'
    collected = []

    morePages = True
    while morePages:
        anchors = driver.find_elements(By.CSS_SELECTOR, bookLinkSelector)
        # current_url is re-read for every anchor, resolving each href
        # against the page that is currently displayed.
        collected.extend(
            urljoin(driver.current_url, anchor.get_attribute('href'))
            for anchor in anchors
        )

        try:
            driver.find_element(By.CSS_SELECTOR, nextLinkSelector).click()
            # Short pause after moving to the next page.
            time.sleep(0.5)
        except NoSuchElementException:
            # No "next" link: the last catalog page has been processed.
            morePages = False

    print(f"{len(collected)} book URLs found.")
    return collected

def scrapeBookDetails(driver, bookUrl):
    """
    Phase 2: open one book page and return its details as a flat dict.

    Args:
        driver: Selenium-style driver exposing ``get`` and ``find_element``.
        bookUrl: URL of the book's detail page.

    Returns:
        dict: one row with the keys 'Name', 'Category', 'UPC', 'Product Type',
        'Price (excl. tax)', 'Price (incl. tax)', 'Tax', 'Availability',
        '# Reviews', 'Rating', 'Has Description', 'Image URL' and
        'Description', in that order. Table values whose row is not found are
        ``None``; a star class not in the One..Five mapping gives rating 0; a
        missing description gives ``""`` with 'Has Description' set to False.

    Raises:
        NoSuchElementException: if the title, breadcrumb category, gallery
            image or star-rating element is not found (these lookups are not
            guarded).
    """
    driver.get(bookUrl)

    def readTableCell(label):
        # Text of the <td> that follows the <th> whose text equals `label`,
        # or None when that row is not found.
        cellXpath = f'//th[text()="{label}"]/following-sibling::td'
        try:
            cellText = driver.find_element(By.XPATH, cellXpath).text
        except NoSuchElementException:
            cellText = None
        return cellText

    title = driver.find_element(By.TAG_NAME, 'h1').text
    categoryName = driver.find_element(
        By.CSS_SELECTOR, '.breadcrumb li:nth-of-type(3) a'
    ).text
    imageSrc = driver.find_element(
        By.CSS_SELECTOR, '#product_gallery img'
    ).get_attribute('src')
    imageUrl = urljoin(bookUrl, imageSrc)

    descriptionXpath = '//div[@id="product_description"]/following-sibling::p'
    try:
        descriptionText = driver.find_element(By.XPATH, descriptionXpath).text
    except NoSuchElementException:
        descriptionText, descriptionFound = "", False
    else:
        descriptionFound = True

    # The last CSS class of the star-rating element carries the rating word.
    starClasses = driver.find_element(
        By.CSS_SELECTOR, 'p.star-rating'
    ).get_attribute('class').split()
    wordToStars = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
    stars = wordToStars.get(starClasses[-1], 0)

    # (output column, table header label) pairs, in output order.
    tableColumns = (
        ('UPC', "UPC"),
        ('Product Type', "Product Type"),
        ('Price (excl. tax)', "Price (excl. tax)"),
        ('Price (incl. tax)', "Price (incl. tax)"),
        ('Tax', "Tax"),
        ('Availability', "Availability"),
        ('# Reviews', "Number of reviews"),
    )

    record = {'Name': title, 'Category': categoryName}
    for column, headerLabel in tableColumns:
        record[column] = readTableCell(headerLabel)
    record['Rating'] = stars
    record['Has Description'] = descriptionFound
    record['Image URL'] = imageUrl
    record['Description'] = descriptionText
    return record

def main():
    """
    Orchestrates the entire scraping process and saves the results.

    Starts a headless Chrome session, collects every book URL reachable from
    HOMEPAGE (phase 1), scrapes each book page (phase 2), writes the rows to
    'detailed_books.csv' and prints the first five rows.

    The browser is shut down in a ``finally`` block, so an exception raised
    while crawling does not leave a Chrome process running. The exception
    itself still propagates to the caller.

    Returns:
        None
    """
    browserOptions = ChromeOptions()
    browserOptions.add_argument("--headless")
    browserOptions.add_argument("--no-sandbox")
    browserOptions.add_argument("--window-size=1920,1080")

    driver = Chrome(options=browserOptions)
    try:
        # Phase 1
        allUrls = getAllBookUrls(driver, HOMEPAGE)

        # Phase 2
        print("\nPhase 2: Extracting details for each book...")
        allBookDetails = []
        for url in tqdm(allUrls, desc="Processing books"):
            details = scrapeBookDetails(driver, url)
            allBookDetails.append(details)
    finally:
        # Always release the browser, including when a phase above raised.
        driver.quit()

    print("\nCreating DataFrame and saving to CSV...")
    df = pd.DataFrame(allBookDetails)
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv("detailed_books.csv", index=False, encoding='utf-8-sig')

    print("\nProcess finished. Data saved to 'detailed_books.csv'")
    print("First 5 rows of the DataFrame:")
    print(df.head())

In [None]:
# Entry point: run the scraper only when this code executes as the top-level
# module, not when it is imported.
if __name__ == '__main__':
    main()

Phase 1: Collecting all book URLs...
1000 book URLs found.

Phase 2: Extracting details for each book...


Processing books: 100%|██████████| 1000/1000 [10:34<00:00,  1.58it/s]



Creating DataFrame and saving to CSV...

Process finished. Data saved to 'detailed_books.csv'
First 5 rows of the DataFrame:
                                    Name            Category  \
0                   A Light in the Attic              Poetry   
1                     Tipping the Velvet  Historical Fiction   
2                             Soumission             Fiction   
3                          Sharp Objects             Mystery   
4  Sapiens: A Brief History of Humankind             History   

                UPC Product Type Price (excl. tax) Price (incl. tax)    Tax  \
0  a897fe39b1053632        Books            £51.77            £51.77  £0.00   
1  90fa61229261140a        Books            £53.74            £53.74  £0.00   
2  6957f44c3847a760        Books            £50.10            £50.10  £0.00   
3  e00eb4fd7b871a48        Books            £47.82            £47.82  £0.00   
4  4165285e1663650f        Books            £54.23            £54.23  £0.00   

              