In [1]:
!pip install mysql-connector-python

Collecting mysql-connector-python
  Downloading mysql_connector_python-9.2.0-cp312-cp312-win_amd64.whl.metadata (6.2 kB)
Downloading mysql_connector_python-9.2.0-cp312-cp312-win_amd64.whl (16.1 MB)
   ---------------------------------------- 0.0/16.1 MB ? eta -:--:--
   -------------- ------------------------- 5.8/16.1 MB 32.0 MB/s eta 0:00:01
   ------------------------------- -------- 12.8/16.1 MB 33.6 MB/s eta 0:00:01
   ---------------------------------------  16.0/16.1 MB 30.5 MB/s eta 0:00:01
   ---------------------------------------- 16.1/16.1 MB 24.1 MB/s eta 0:00:00
Installing collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.2.0


In [3]:
import requests  # makes http requests
from bs4 import BeautifulSoup  # parses html content
import pandas as pd  # data manipulation and storage
import time  # delays between requests
import random  # adds variability in request delays
import re  # pattern matching in text
import mysql.connector

# amazon url for data engineering books
URL = "https://www.amazon.com/s?k=data+engineering+books"

# user-agent list to avoid detection by amazon's anti-scraping measures
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
]

# headers to mimic a real browser request
HEADERS = {
    "User-Agent": random.choice(USER_AGENTS),  # randomly selects a user-agent to reduce detection risk
    "Accept-Language": "en-US,en;q=0.5"  # specifies the preferred language for the response
}

def get_publication_date_from_product_page(book_url):
    """fetches the publication date from a book's product page."""
    try:
        product_response = requests.get(book_url, headers=HEADERS)  # sends request to the book's product page
        if product_response.status_code == 200:  # checks if request was successful
            product_soup = BeautifulSoup(product_response.text, "html.parser")  # parses html content
            possible_locations = [  # defines possible locations where publication date might be found
                "#detailBullets_feature_div",
                "#productDetailsTable",
                "#prodDetails"
            ]
            for location in possible_locations:
                details = product_soup.select_one(location)  # selects the first matching html element
                if details:
                    detail_text = details.get_text(strip=True)  # extracts text and removes extra spaces
                    date_match = re.search(r"(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}", detail_text)
                    if date_match:
                        return date_match.group()  # returns the matched publication date

            year_match = re.search(r"(19|20)\d{2}", product_soup.text)  # searches for a 4-digit year in the text
            if year_match:
                return year_match.group()  # returns the matched year
    except Exception as e:
        print(f"error fetching product page: {e}")  # prints error message if fetching fails
    return None  # returns none if no publication date is found

response = requests.get(URL, headers=HEADERS)  # sends a request to amazon

if response.status_code == 200:  # checks if request was successful
    soup = BeautifulSoup(response.text, "html.parser")  # parses html content
    books = soup.find_all("div", {"data-component-type": "s-search-result"})  # finds all book listings

    # lists to store extracted book data
    titles, authors, pub_dates, ratings, prices = [], [], [], [], []

    for book in books[:50]:  # loops through the first 50 book listings
        title_tag = book.find("h2", class_="a-size-base-plus a-spacing-none a-color-base a-text-normal")  # finds book title
        title = title_tag.text.strip() if title_tag else "unknown"  # extracts title text or assigns "unknown"

        author_tag = book.find("div", class_="a-row a-size-base a-color-secondary")  # finds author information
        author_links = author_tag.find_all("a") if author_tag else []  # gets all author links if present
        author = ", ".join([a.text.strip() for a in author_links]) if author_links else "unknown"  # extracts author names

        pub_date = None  # initializes publication date variable
        date_spans = book.find_all("span", class_="a-size-base a-color-secondary")  # finds potential publication date elements
        for span in date_spans:
            span_text = span.get_text(strip=True)  # extracts text content
            date_match = re.search(r"(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}", span_text)
            if date_match:
                pub_date = date_match.group()  # assigns found publication date
                break

        if pub_date is None:
            year_match = re.search(r"(19|20)\d{2}", book.text)  # searches for a 4-digit year
            if year_match:
                pub_date = year_match.group()  # assigns found year

        rating_tag = book.find("span", class_="a-icon-alt")  # finds book rating
        rating = rating_tag.text.strip().split()[0] if rating_tag else "no rating"  # extracts rating value

        price_tag = book.find("span", class_="a-price-whole") or book.find("span", class_="a-offscreen")  # finds price
        price = price_tag.text.strip() if price_tag else "not available"  # extracts price or assigns "not available"

        book_url_tag = book.find("a", class_="a-link-normal s-no-outline")  # finds book url
        book_url = "https://www.amazon.com" + book_url_tag["href"] if book_url_tag else None  # constructs full url

        if pub_date is None and book_url:
            pub_date = get_publication_date_from_product_page(book_url)  # gets publication date from product page if missing

        # appends extracted data to lists
        titles.append(title)
        authors.append(author)
        pub_dates.append(pub_date)
        ratings.append(rating)
        prices.append(price)
        time.sleep(random.uniform(1, 3))  # adds random delay to avoid detection

    # creates a pandas dataframe with collected book data
    df = pd.DataFrame({
        "Title": titles,
        "Author": authors,
        "Publication Date": pub_dates,
        "Rating": ratings,
        "Price": prices
    })

    # saves data to csv file
    df.to_csv("amazon_data_engineering_books.csv", index=False)
    # confirmation message to user if data is successfully saved
    print("Data has been saved to 'data_engineering_books.csv'")


    
    try:
        # establishes database connection
        conn = mysql.connector.connect(
            host="localhost",
            user="root",
            password="jasonrules12",
            database="data_engineering_books"
        )
        # creates cursor to interact with database
        cursor = conn.cursor()

        for index, row in df.iterrows():  # iterates through dataframe rows
            if all(pd.notnull(val) and val != "unknown" for val in row):  # checks if row data is valid
                cursor.execute("""
                    INSERT INTO books (title, author, publication_date, rating, price)
                    VALUES (%s, %s, %s, %s, %s)
                """, (row['Title'], row['Author'], row['Publication Date'], row['Rating'], row['Price']))  # inserts data
        conn.commit()  # commits transaction

        # query that extract only 3 columns from the table and sorts table based on rating column
        query = """
            SELECT title, author, price
            FROM books
            ORDER BY rating DESC;
        """
        cursor.execute(query) # executes query
        results = cursor.fetchall() # fetches all results
        df_result = pd.DataFrame(results, columns=["Title", "Author", "Price", "Rating"]) # converts results into a dataframe
        print(df_result) # prints results

    # handles database errors
    except mysql.connector.Error as db_error:
        print(f"database error: {db_error}")

    finally:
        cursor.close()
        # closes connection to database
        conn.close()
        print("\nThese are the top-rated data engineering books our shoppers love ♡. \nHappy reads!")

else:
    # error handling message for user if parsing fails
    print("ⓘ Failed to retrieve webpage. Amazon may have blocked the request. :(")

database error: 1045 (28000): Access denied for user 'root'@'localhost' (using password: YES)


NameError: name 'cursor' is not defined