In [4]:
!pip install mysql-connector-python


Collecting mysql-connector-python
  Downloading mysql_connector_python-9.2.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (6.0 kB)
Downloading mysql_connector_python-9.2.0-cp312-cp312-macosx_14_0_arm64.whl (15.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.2.0


In [23]:
import csv
import getpass
import os
import random
import re
import time

import mysql.connector
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Amazon search-results page listing data engineering books
URL = "https://www.amazon.com/s?k=data+engineering+books"

# Candidate desktop Chrome User-Agent strings (Windows, macOS and Linux variants)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
]

# Browser-like request headers. The User-Agent is drawn once, when this
# statement runs, so every request that passes HEADERS reuses the same value.
HEADERS = dict(
    [
        ("User-Agent", random.choice(USER_AGENTS)),
        ("Accept-Language", "en-US,en;q=0.5"),
    ]
)

def get_publication_date_from_product_page(book_url, timeout=10):
    """Fetch the publication date from a book's product page.

    Lookup order, first hit wins:
      1. a full "Month D, YYYY" date inside one of the product-detail containers;
      2. a bare four-digit year (19xx/20xx) inside one of those containers;
      3. a bare four-digit year anywhere in the page text (last resort).

    Args:
        book_url: URL of the product page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        The matched date or year as a string, or None when the request
        fails, the response status is not 200, or nothing matches.
    """
    full_date_re = re.compile(
        r"(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}"
    )
    # Digit look-arounds keep the pattern from matching four digits buried
    # inside a longer number (for example an ISBN).
    year_re = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")
    detail_selectors = (
        "#detailBullets_feature_div",
        "#productDetailsTable",
        "#prodDetails",
    )

    try:
        product_response = requests.get(book_url, headers=HEADERS, timeout=timeout)
        if product_response.status_code != 200:
            return None

        product_soup = BeautifulSoup(product_response.text, "html.parser")
        detail_nodes = [
            node
            for node in (product_soup.select_one(sel) for sel in detail_selectors)
            if node
        ]

        # Pass 1: a complete date in any detail container.
        for node in detail_nodes:
            date_match = full_date_re.search(node.get_text(strip=True))
            if date_match:
                return date_match.group()

        # Pass 2: a bare year in a detail container. The " " separator stops
        # digits from neighbouring text nodes from fusing into one number.
        for node in detail_nodes:
            year_match = year_re.search(node.get_text(" ", strip=True))
            if year_match:
                return year_match.group()

        # Pass 3: any year on the page. This can pick up years unrelated to
        # publication, so it is only tried after the detail containers.
        year_match = year_re.search(product_soup.get_text(" "))
        if year_match:
            return year_match.group()
    except Exception as e:
        # Deliberately best-effort: a failed lookup for one book must not
        # abort the caller's loop over the remaining books.
        print(f"Error fetching product page: {e}")
    return None

# Fetch main search results page.
# The timeout stops a stalled connection from hanging the notebook indefinitely.
response = requests.get(URL, headers=HEADERS, timeout=10)

titles, authors, pub_dates, ratings, prices = [], [], [], [], []
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    books = soup.find_all("div", {"data-component-type": "s-search-result"})

    # Compiled once rather than re-parsed for every span of every result.
    full_date_re = re.compile(
        r"(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}"
    )
    # Digit look-arounds keep the year pattern from matching four digits
    # buried inside a longer number.
    year_re = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")

    # Only the first 25 search results are processed.
    for book in books[:25]:
        title_tag = book.find("h2", class_="a-size-base-plus a-spacing-none a-color-base a-text-normal")
        title = title_tag.text.strip() if title_tag else None

        author_tag = book.find("div", class_="a-row a-size-base a-color-secondary")
        author_links = author_tag.find_all("a") if author_tag else []
        author = ", ".join(a.text.strip() for a in author_links) if author_links else None

        # Prefer a full "Month D, YYYY" date shown on the result card itself.
        pub_date = None
        for span in book.find_all("span", class_="a-size-base a-color-secondary"):
            date_match = full_date_re.search(span.get_text(strip=True))
            if date_match:
                pub_date = date_match.group()
                break

        # Otherwise accept a bare year from the card. The " " separator stops
        # digits from neighbouring text nodes from fusing into one number.
        if pub_date is None:
            year_match = year_re.search(book.get_text(" "))
            if year_match:
                pub_date = year_match.group()

        # Keep only the first whitespace-separated token of the rating text;
        # an empty text yields None instead of raising IndexError.
        rating_tag = book.find("span", class_="a-icon-alt")
        rating_tokens = rating_tag.text.strip().split() if rating_tag else []
        rating = rating_tokens[0] if rating_tokens else None

        price_tag = book.find("span", class_="a-price-whole")
        if not price_tag:
            price_tag = book.find("span", class_="a-offscreen")
        price = price_tag.text.strip() if price_tag else None

        # .get() tolerates an anchor without an href attribute.
        book_url_tag = book.find("a", class_="a-link-normal s-no-outline")
        href = book_url_tag.get("href") if book_url_tag else None
        book_url = "https://www.amazon.com" + href if href else None

        # Last resort: download the product page and look for the date there.
        if pub_date is None and book_url:
            pub_date = get_publication_date_from_product_page(book_url)

        titles.append(title)
        authors.append(author)
        pub_dates.append(pub_date)
        ratings.append(rating)
        prices.append(price)
        # Random pause between results to avoid hammering the site.
        time.sleep(random.uniform(1, 3))

    df = pd.DataFrame({
        "Title": titles,
        "Author": authors,
        "Publication Date": pub_dates,
        "Rating": ratings,
        "Price": prices
    })

    df.to_csv("amazon_data_engineering_books.csv", index=False, quoting=csv.QUOTE_ALL)

    # MySQL Database Connection.
    # The password is never stored in the notebook: it is read from the
    # MYSQL_PASSWORD environment variable, with an interactive prompt as fallback.
    db_password = os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: ")
    conn = mysql.connector.connect(
        host="localhost",
        user="root",
        password=db_password,
        database="data_engineering_books"
    )
    try:
        cursor = conn.cursor()
        try:
            # Insert only rows in which every scraped field is present.
            # NOTE(review): re-running this cell inserts the same books again;
            # confirm whether the table should have a uniqueness constraint.
            insert_columns = ["Title", "Author", "Publication Date", "Rating", "Price"]
            complete_rows = list(
                df[insert_columns].dropna().itertuples(index=False, name=None)
            )
            if complete_rows:
                cursor.executemany("""
                    INSERT INTO books (title, author, publication_date, rating, price)
                    VALUES (%s, %s, %s, %s, %s)
                """, complete_rows)
            conn.commit()

            # Retrieve only 15 valid rows.
            # NOTE(review): there is no ORDER BY, so which 15 rows come back (and
            # their order) is up to the server even though the output file is
            # named "sorted_books.csv" — confirm the intended sort column.
            query = """
                SELECT * FROM books 
                WHERE title IS NOT NULL 
                AND author IS NOT NULL 
                AND publication_date IS NOT NULL 
                AND rating IS NOT NULL 
                AND price IS NOT NULL
                LIMIT 15;
            """
            cursor.execute(query)
            results = cursor.fetchall()
            df_result = pd.DataFrame(results, columns=["ID", "Title", "Author", "Publication Date", "Rating", "Price"])
            print(df_result)
            df_result.to_csv("sorted_books.csv", index=False)
        finally:
            # Release the cursor even when an insert or query fails.
            cursor.close()
    finally:
        # Always release the connection, including on errors.
        conn.close()
    print("Data successfully stored in MySQL and sorted data saved to CSV.")
else:
    print("Failed to retrieve webpage. Amazon may have blocked the request.")


    ID                                              Title  \
0    1  Data Engineering with Alteryx: Helping data en...   
1    2  Data Engineering with AWS: Acquire the skills ...   
2    3  Data Engineering with AWS Cookbook: A recipe-b...   
3    4  Getting Started with DuckDB: A practical guide...   
4    5  Fundamentals of Data Engineering: Plan and Bui...   
5    6  Designing Data-Intensive Applications: The Big...   
6    7  Data Engineering with AWS: Acquire the skills ...   
7    9  Financial Data Engineering: Design and Build D...   
8   10  AI Engineering: Building Applications with Fou...   
9   11  Data Pipelines Pocket Reference: Moving and Pr...   
10  12  Data Engineering Best Practices: Architect rob...   
11  14  Cracking the Data Engineering Interview: Land ...   
12  16  Data Engineering with Databricks Cookbook: Bui...   
13  17  Cracking the Data Engineering Interview: Land ...   
14  18                         Snowflake Data Engineering   

                       