In [4]:
!pip install mysql-connector-python


Collecting mysql-connector-python
  Downloading mysql_connector_python-9.2.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (6.0 kB)
Downloading mysql_connector_python-9.2.0-cp312-cp312-macosx_14_0_arm64.whl (15.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.1/15.1 MB 7.7 MB/s eta 0:00:00
Installing collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.2.0


In [27]:
import csv
import getpass
import os
import random
import re
import time

import mysql.connector
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Amazon search-results page for data engineering books.
URL = "https://www.amazon.com/s?k=data+engineering+books"

# Pool of desktop browser User-Agent strings; one is chosen at random each
# time this cell runs so the request does not always carry the same signature.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
]

HEADERS = {"User-Agent": random.choice(USER_AGENTS), "Accept-Language": "en-US,en;q=0.5"}

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block the notebook indefinitely on a stalled connection.
REQUEST_TIMEOUT = 15

# Fetch the search results page once. A timeout or connection failure raises a
# requests exception here instead of hanging the cell.
response = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)

# Column accumulators filled while parsing the page; one entry per result.
titles, authors, ratings, prices = [], [], [], []

# Maximum number of search results parsed from the downloaded page.
MAX_BOOKS = 25
# AuthorName and Title are created as VARCHAR(255) below; longer values are
# clipped to this length before they are inserted or looked up.
MAX_NAME_LEN = 255


def _parse_price(whole_text, fraction_text=None):
    """Build a float price from the whole-part and fraction-part price texts.

    Every non-digit character is dropped from both parts, so separators or a
    trailing decimal point in the whole-part text cannot corrupt the value.

    Args:
        whole_text: Text of the ``a-price-whole`` element, or None.
        fraction_text: Text of the ``a-price-fraction`` element, or None.

    Returns:
        The price as a float, or None when no digits are present in
        ``whole_text``.
    """
    if not whole_text:
        return None
    whole_digits = re.sub(r"\D", "", whole_text)
    if not whole_digits:
        return None
    fraction_digits = re.sub(r"\D", "", fraction_text or "")
    return float(f"{whole_digits}.{fraction_digits or '0'}")


def _parse_rating(alt_text):
    """Return the leading number of a rating label as a float.

    Args:
        alt_text: Text of the ``a-icon-alt`` element, or None.

    Returns:
        The leading decimal number as a float, or None when the text is empty
        or does not start with a number.
    """
    if not alt_text:
        return None
    match = re.match(r"\s*(\d+(?:\.\d+)?)", alt_text)
    return float(match.group(1)) if match else None


if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    books = soup.find_all("div", {"data-component-type": "s-search-result"})

    # The page is already downloaded and no request is made per result, so the
    # loop runs without any artificial delay.
    for book in books[:MAX_BOOKS]:
        title_tag = book.find("h2", class_="a-size-base-plus a-spacing-none a-color-base a-text-normal")
        title = title_tag.text.strip() if title_tag else None

        # NOTE(review): the first matching row is not guaranteed to be the
        # author byline -- a previous run stored a "Book 3 of 3: ..." series
        # label as the author. Confirm the selector against the live markup.
        author_tag = book.find("div", class_="a-row a-size-base a-color-secondary")
        author_links = author_tag.find_all("a") if author_tag else []
        author = ", ".join([a.text.strip() for a in author_links]) if author_links else None

        # Store the rating as a number (or None) so a non-numeric label never
        # reaches the FLOAT column.
        rating_tag = book.find("span", class_="a-icon-alt")
        rating = _parse_rating(rating_tag.text) if rating_tag else None

        # Look for the fraction next to the whole part so both pieces come
        # from the same price element.
        whole_tag = book.find("span", class_="a-price-whole")
        fraction_tag = (
            whole_tag.parent.find("span", class_="a-price-fraction")
            if whole_tag is not None and whole_tag.parent is not None
            else None
        )
        price = _parse_price(
            whole_tag.text if whole_tag else None,
            fraction_tag.text if fraction_tag else None,
        )

        titles.append(title)
        authors.append(author)
        ratings.append(rating)
        prices.append(price)

    df = pd.DataFrame({
        "Title": titles,
        "Author": authors,
        "Rating": ratings,
        "Price": prices
    })

    df.to_csv("amazon_books.csv", index=False, quoting=csv.QUOTE_ALL)

    # Credentials come from the environment (MYSQL_HOST, MYSQL_USER,
    # MYSQL_PASSWORD) so no secret is stored in the notebook. If the password
    # variable is unset, prompt for it interactively.
    db_password = os.environ.get("MYSQL_PASSWORD")
    if db_password is None:
        db_password = getpass.getpass("MySQL password: ")

    conn = mysql.connector.connect(
        host=os.environ.get("MYSQL_HOST", "localhost"),
        user=os.environ.get("MYSQL_USER", "root"),
        password=db_password,
    )
    cursor = None
    try:
        # A buffered cursor reads each result set completely, so a lookup that
        # matches several rows never leaves unread results on the connection.
        cursor = conn.cursor(buffered=True)

        # Create the database and tables on first run; later runs reuse them.
        cursor.execute("CREATE DATABASE IF NOT EXISTS bookstore_db")
        cursor.execute("USE bookstore_db")

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS authors (
                AuthorID INT AUTO_INCREMENT PRIMARY KEY,
                AuthorName VARCHAR(255) UNIQUE
            )
        """)

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS books (
                BookID INT AUTO_INCREMENT PRIMARY KEY,
                Title VARCHAR(255) UNIQUE
            )
        """)

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS bookstore (
                StoreID INT AUTO_INCREMENT PRIMARY KEY,
                BookID INT,
                AuthorID INT,
                Price FLOAT,
                Rating FLOAT,
                FOREIGN KEY (BookID) REFERENCES books(BookID),
                FOREIGN KEY (AuthorID) REFERENCES authors(AuthorID)
            )
        """)

        print("Database and tables are ready!")

        # Only rows with all four fields present are stored.
        complete_rows = df.dropna(subset=["Title", "Author", "Rating", "Price"])
        for row in complete_rows.itertuples(index=False):
            # Clip to the column width so the value inserted and the value
            # looked up afterwards are always the same string.
            author_name = row.Author[:MAX_NAME_LEN]
            book_title = row.Title[:MAX_NAME_LEN]
            # Pass plain Python floats to the connector.
            price_value = float(row.Price)
            rating_value = float(row.Rating)

            cursor.execute("INSERT IGNORE INTO authors (AuthorName) VALUES (%s)", (author_name,))
            cursor.execute("SELECT AuthorID FROM authors WHERE AuthorName = %s", (author_name,))
            author_id = cursor.fetchone()[0]

            cursor.execute("INSERT IGNORE INTO books (Title) VALUES (%s)", (book_title,))
            cursor.execute("SELECT BookID FROM books WHERE Title = %s", (book_title,))
            book_id = cursor.fetchone()[0]

            # Update the existing (book, author) entry if there is one, so
            # re-running the cell refreshes prices instead of piling up
            # duplicate rows.
            cursor.execute(
                "SELECT StoreID FROM bookstore WHERE BookID = %s AND AuthorID = %s LIMIT 1",
                (book_id, author_id),
            )
            existing = cursor.fetchone()
            if existing:
                cursor.execute(
                    "UPDATE bookstore SET Price = %s, Rating = %s WHERE StoreID = %s",
                    (price_value, rating_value, existing[0]),
                )
            else:
                cursor.execute("""
                    INSERT INTO bookstore (BookID, AuthorID, Price, Rating)
                    VALUES (%s, %s, %s, %s)
                """, (book_id, author_id, price_value, rating_value))

        # One commit for the whole batch instead of three per row.
        conn.commit()

        # Retrieve the 15 cheapest stored books.
        query = """
            SELECT authors.AuthorName, books.Title, bookstore.Rating, bookstore.Price
            FROM bookstore
            JOIN books ON bookstore.BookID = books.BookID
            JOIN authors ON bookstore.AuthorID = authors.AuthorID
            WHERE books.Title IS NOT NULL
            ORDER BY bookstore.Price ASC
            LIMIT 15;
        """

        cursor.execute(query)
        results = cursor.fetchall()
        df_result = pd.DataFrame(results, columns=["Author", "Title", "Rating", "Price"])
        print(df_result)
        df_result.to_csv("sorted_books.csv", index=False)
    finally:
        # Release the cursor and connection even when a statement above fails.
        if cursor is not None:
            cursor.close()
        conn.close()

    print("Data successfully stored in MySQL and sorted data saved to CSV.")
else:
    print("Failed to retrieve webpage. Amazon may have blocked the request.")


Database and tables are ready!
                                               Author  \
0   Book 3 of 3: The Innovators of AI and Data Series   
1                                      James Densmore   
2                             Cole Nussbaumer Knaflic   
3                                        Gareth Eagar   
4                                        Gareth Eagar   
5                                      Kedeisha Bryan   
6                                      Kedeisha Bryan   
7                                       Paul Houghton   
8                           Simon Aubury, Ned Letcher   
9                                       Paul Crickard   
10                                      Roberto Zagni   
11              Richard J. Schiller, David Larochelle   
12                                      Pulkit Chadha   
13                                      Vlad Riscutia   
14                             Joe Reis, Matt Housley   

                                                Title  R