In [2]:
pip install beautifulsoup4 requests pandas lxml fake_useragent


Collecting fake_useragent
  Downloading fake_useragent-2.0.3-py3-none-any.whl.metadata (17 kB)
Downloading fake_useragent-2.0.3-py3-none-any.whl (201 kB)
Installing collected packages: fake_useragent
Successfully installed fake_useragent-2.0.3
Note: you may need to restart the kernel to use updated packages.


In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re

# Amazon URL for Data Engineering Books
URL = "https://www.amazon.com/s?k=data+engineering+books"

# User-Agent list to avoid detection
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
]

# Headers to mimic a real browser request
HEADERS = {
    "User-Agent": random.choice(USER_AGENTS),
    "Accept-Language": "en-US,en;q=0.5"
}

def get_publication_date_from_product_page(book_url):
    """Fetches the publication date from the product page if missing in search results."""
    try:
        product_response = requests.get(book_url, headers=HEADERS)
        if product_response.status_code == 200:
            product_soup = BeautifulSoup(product_response.text, "html.parser")

            # **Check multiple locations in the product page**
            possible_locations = [
                "#detailBullets_feature_div",
                "#productDetailsTable",
                "#prodDetails"
            ]

            for location in possible_locations:
                details = product_soup.select_one(location)
                if details:
                    detail_text = details.get_text(strip=True)
                    date_match = re.search(r"(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}", detail_text)
                    if date_match:
                        return date_match.group()

            # **Fallback: Search for a year in the entire page**
            year_match = re.search(r"(19|20)\d{2}", product_soup.text)
            if year_match:
                return year_match.group()

    except Exception as e:
        print(f"Error fetching product page: {e}")
    
    return "N/A"

# Fetch the webpage
response = requests.get(URL, headers=HEADERS)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    # Lists to store extracted data
    titles, authors, pub_dates, ratings, prices = [], [], [], [], []

    # Locate book results
    books = soup.find_all("div", {"data-component-type": "s-search-result"})

    for book in books[:10]:  # Extract first 10 books
        # ** Extracting the Title **
        title_tag = book.find("h2", class_="a-size-base-plus a-spacing-none a-color-base a-text-normal")
        title = title_tag.text.strip() if title_tag else "N/A"

        # ** Extracting the Author **
        author_tag = book.find("div", class_="a-row a-size-base a-color-secondary")
        if author_tag:
            author_links = author_tag.find_all("a")  # Extract all author names
            author = ", ".join([a.text.strip() for a in author_links]) if author_links else "N/A"
        else:
            author = "N/A"

        # ** Extracting the Publication Date (from search results) **
        pub_date = "N/A"
        date_spans = book.find_all("span", class_="a-size-base a-color-secondary")
        for span in date_spans:
            span_text = span.get_text(strip=True)
            date_match = re.search(r"(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}", span_text)
            if date_match:
                pub_date = date_match.group()
                break

        # If still N/A, extract a year if available
        if pub_date == "N/A":
            year_match = re.search(r"(19|20)\d{2}", book.text)
            if year_match:
                pub_date = year_match.group()

        # ** Extracting the Rating **
        rating_tag = book.find("span", class_="a-icon-alt")
        rating = rating_tag.text.strip().split()[0] if rating_tag else "N/A"

        # ** Extracting the Price **
        price_tag = book.find("span", class_="a-price-whole")
        if not price_tag:
            price_tag = book.find("span", class_="a-offscreen")  # Alternative selector
        price = price_tag.text.strip() if price_tag else "N/A"

        # ** Extract Book URL to Scrape Product Page for More Details (if needed) **
        book_url_tag = book.find("a", class_="a-link-normal s-no-outline")
        book_url = "https://www.amazon.com" + book_url_tag["href"] if book_url_tag else None

        # ** Fetch product page if the publication date is missing **
        if pub_date == "N/A" and book_url:
            pub_date = get_publication_date_from_product_page(book_url)

        # Append to lists
        titles.append(title)
        authors.append(author)
        pub_dates.append(pub_date)
        ratings.append(rating)
        prices.append(price)

        # Simulate human-like browsing to avoid detection
        time.sleep(random.uniform(1, 3))

    # Create DataFrame
    df = pd.DataFrame({
        "Title": titles,
        "Author": authors,
        "Publication Date": pub_dates,
        "Rating": ratings,
        "Price": prices
    })

    # Print formatted output
    print("\nScraped Data:\n")
    print(df.to_string(index=False))

    # Save to CSV
    df.to_csv("amazon_data_engineering_books.csv", index=False)
    print("\nData successfully saved to 'amazon_data_engineering_books.csv'.")

else:
    print("Failed to retrieve webpage. Amazon may have blocked the request.")



Scraped Data:

                                                                                                                                              Title                 Author  Publication Date Rating Price
Cracking the Data Engineering Interview: Land your dream job with the help of resume-building tips, over 100 mock questions, and a unique portfolio         Kedeisha Bryan  November 7, 2023    4.0   26.
                               Data Engineering with AWS: Acquire the skills to design and build AWS-based data transformation pipelines like a pro           Gareth Eagar  October 31, 2023    4.3   24.
                                                         Data Engineering with Alteryx: Helping data engineers apply DataOps practices with Alteryx          Paul Houghton     June 30, 2022    4.3   31.
                                                                                            Hands-On Data Engineering with R, Python and PostgreSQL        Michel Ballings Decem