In [10]:
!pip install selenium beautifulsoup4 pandas webdriver-manager



In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

# 1. Setup Chrome WebDriver with options
options = Options()
options.add_argument('--headless')  # Run in background
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# 2. Open Amazon search results
search_query = "educational insights"
url = f"https://www.amazon.com/s?k={search_query.replace(' ', '+')}"
driver.get(url)
time.sleep(5)  # Wait for page to load

# 3. Parse the page
soup = BeautifulSoup(driver.page_source, 'html.parser')

# 4. Extract product data
products = soup.find_all("div", {"data-component-type": "s-search-result"})

data = []

for product in products:
    try:
        title_elem = product.h2.a
        description = title_elem.text.strip()
        product_url = "https://www.amazon.com" + title_elem['href']
    except:
        description = ""
        product_url = ""

    try:
        price_whole = product.find("span", class_="a-price-whole").text.strip().replace(",", "")
        price_frac = product.find("span", class_="a-price-fraction").text.strip()
        price = f"${price_whole}.{price_frac}"
    except:
        price = ""

    try:
        rating = product.find("span", class_="a-icon-alt").text.strip()
    except:
        rating = ""

    try:
        review_count = product.find("span", {"class": "a-size-base"}).text.strip()
    except:
        review_count = ""

    try:
        image = product.find("img", class_="s-image")['src']
    except:
        image = ""

    data.append({
        "Description": description,
        "Price": price,
        "Rating": rating,
        "Reviews Count": review_count,
        "URL": product_url,
        "Image link": image
    })

driver.quit()

In [13]:
# 5. Export to CSV
df = pd.DataFrame(data)
df.to_csv("amazon_educational_insights.csv", index=False)
df.head()

Unnamed: 0,Description,Price,Rating,Reviews Count,URL,Image link
0,,$53..19,4.6 out of 5 stars,Products highlighted as 'Overall Pick' are:,,https://m.media-amazon.com/images/I/71XC1NcMMy...
1,,$18..86,4.7 out of 5 stars,54,,https://m.media-amazon.com/images/I/71S9YuOg53...
2,,$31..99,4.6 out of 5 stars,298,,https://m.media-amazon.com/images/I/71OdEPAfiY...
3,,$43..49,4.5 out of 5 stars,263,,https://m.media-amazon.com/images/I/81pYVc69sx...
4,,$9..97,4.7 out of 5 stars,31155,,https://m.media-amazon.com/images/I/71v68G+xVA...


# Now Scraping the page sorting by the newest arrivals (1 page)

In [21]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import os

# 1) Configure headless Chrome
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
)

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# 2) Build URL with sort=newest
search = "educational insights"
base_url = "https://www.amazon.com/s"
params = f"?k={search.replace(' ', '+')}&s=date-desc-rank"
full_url = base_url + params

driver.get(full_url)

# 3) Wait for the results container to load
try:
    WebDriverWait(driver, 15).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'div[data-component-type="s-search-result"]')
        )
    )
except Exception:
    print("Timed out waiting for page to load")
    driver.quit()
    raise

# 4) Parse page source
soup = BeautifulSoup(driver.page_source, "html.parser")
items = soup.select('div[data-component-type="s-search-result"]')

data = []
for item in items:
    # Description & URL
    a = item.h2.a
    title = a.get_text(strip=True) if a else ""
    url = "https://www.amazon.com" + a["href"] if a and a.has_attr("href") else ""

    # Price
    price_whole = item.select_one("span.a-price-whole")
    price_frac  = item.select_one("span.a-price-fraction")
    if price_whole and price_frac:
        price = f"${price_whole.text.strip()}.{price_frac.text.strip()}"
    else:
        price = ""

    # Rating
    rate = item.select_one("span.a-icon-alt")
    rating = rate.text.strip() if rate else ""

    # Reviews count
    rev = item.select_one("span.a-size-base")
    reviews = rev.text.strip() if rev else ""

    # Image link
    img = item.select_one("img.s-image")
    img_url = img["src"] if img and img.has_attr("src") else ""

    data.append({
        "Description": title,
        "Price": price,
        "Rating": rating,
        "Reviews Count": reviews,
        "URL": url,
        "Image link": img_url
    })

driver.quit()

# 5) Export to CSV
df = pd.DataFrame(data)
out_file = "amazon_educational_insights2.csv"
df.to_csv(out_file, index=False)
print(f"Saved {len(df)} items to {out_file}")
print("You can find it here:", os.getcwd())


Saved 52 items to amazon_educational_insights2.csv
You can find it here: C:\Users\Ali Kazem


# Now Scraping the page sorting by the newest arrivals (3 pages)

In [23]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager

# Setup Chrome options
options = Options()
options.add_argument('--headless')  # comment this out if you want to see the browser
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("window-size=1920,1080")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")

# Initialize the driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Base Amazon URL
base_url = "https://www.amazon.com/s"
search_term = "educational insights"
params = f"?k={search_term.replace(' ', '+')}&s=date-desc-rank"  # sort by newest arrivals

data = []

# Scrape first 3 pages
for page_num in range(1, 4):
    paginated_url = f"{base_url}{params}&page={page_num}"
    driver.get(paginated_url)

    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-component-type="s-search-result"]'))
        )
    except Exception as e:
        print(f"Timeout or error on page {page_num}: {e}")
        continue

    soup = BeautifulSoup(driver.page_source, "html.parser")
    items = soup.select('div[data-component-type="s-search-result"]')

    print(f"Page {page_num}: Found {len(items)} items.")

    for item in items:
        a = item.h2.a
        title = a.get_text(strip=True) if a else ""
        url = "https://www.amazon.com" + a["href"] if a and a.has_attr("href") else ""

        price_whole = item.select_one("span.a-price-whole")
        price_frac = item.select_one("span.a-price-fraction")
        if price_whole and price_frac:
            price = f"${price_whole.text.strip()}.{price_frac.text.strip()}"
        else:
            price = ""

        rate = item.select_one("span.a-icon-alt")
        rating = rate.text.strip() if rate else ""

        rev = item.select_one("span.a-size-base")
        reviews = rev.text.strip() if rev else ""

        img = item.select_one("img.s-image")
        img_url = img["src"] if img and img.has_attr("src") else ""

        data.append({
            "Description": title,
            "Price": price,
            "Rating": rating,
            "Reviews Count": reviews,
            "URL": url,
            "Image link": img_url
        })


Page 1: Found 52 items.
Page 2: Found 52 items.
Page 3: Found 52 items.


In [None]:
# Save to CSV
df = pd.DataFrame(data)
df.to_csv("amazon_educational_insights3.csv", index=False)
print("Saved to amazon_educational_insights.csv")

# Close the browser
driver.quit()

# Export Images into Excel File (Visible as Embedded Images)

In [25]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.2.3-py3-none-any.whl.metadata (2.7 kB)
Downloading XlsxWriter-3.2.3-py3-none-any.whl (169 kB)
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.3


In [27]:
import pandas as pd
import requests
from io import BytesIO
import xlsxwriter

# Load your CSV
df = pd.read_csv("amazon_educational_insights.csv")

# Create a new workbook and worksheet
workbook = xlsxwriter.Workbook("amazon_products_with_images.xlsx")
worksheet = workbook.add_worksheet()

# Write headers
for col_num, header in enumerate(df.columns):
    worksheet.write(0, col_num, header)

# Write data with image preview
for row_num, row in df.iterrows():
    for col_num, value in enumerate(row):
        if df.columns[col_num] == "Image link":
            try:
                response = requests.get(value, timeout=10)
                image_data = BytesIO(response.content)
                worksheet.insert_image(row_num + 1, col_num, value, {"image_data": image_data, "x_scale": 0.3, "y_scale": 0.3})
            except Exception as e:
                worksheet.write(row_num + 1, col_num, "Image failed")
        else:
            # Convert NaN/inf to safe text
            if pd.isna(value) or isinstance(value, float) and not pd.isfinite(value):
                worksheet.write(row_num + 1, col_num, "")
            else:
                worksheet.write(row_num + 1, col_num, str(value))

workbook.close()
print("Saved to amazon_products_with_images.xlsx")


Saved to amazon_products_with_images.xlsx


# Add Data in the description column and additional column for Item link (Didn't work)

In [49]:
import time
import csv
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Set up headless Chrome
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--window-size=1920,1080")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

base_url = "https://www.amazon.com/s?k=educational+insights&i=toys-and-games&rh=n%3A166164011&dc&qid=1700000000&rnid=2941120011&ref=sr_pg_{}"

# Lists to store data
descriptions = []
prices = []
ratings = []
review_counts = []
image_links = []
sub_descriptions = []
item_links = []

# Loop through first 3 pages
for page in range(1, 4):
    url = base_url.format(page)
    driver.get(url)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    for item in items:
        # Description (short title)
        desc_tag = item.h2
        desc = desc_tag.text.strip() if desc_tag else ""
        descriptions.append(desc)

        # Price
        price_whole = item.find("span", class_="a-price-whole")
        price_fraction = item.find("span", class_="a-price-fraction")
        if price_whole and price_fraction:
            price = f"{price_whole.text.strip()}.{price_fraction.text.strip()}"
        else:
            price = ""
        prices.append(price)

        # Rating
        rating_tag = item.find("span", class_="a-icon-alt")
        rating = rating_tag.text.strip().split(" out")[0] if rating_tag else ""
        ratings.append(rating)

        # Reviews count
        review_tag = item.find("span", {"class": "a-size-base", "dir": "auto"})
        reviews = review_tag.text.strip() if review_tag else ""
        review_counts.append(reviews)

        # Image link
        image_tag = item.find("img", class_="s-image")
        image = image_tag["src"] if image_tag else ""
        image_links.append(image)

        # Sub Description
        sub_desc = ""
        h2_tag = item.find("h2")
        if h2_tag:
            span = h2_tag.find("span")
            if span:
                sub_desc = span.text.strip()
        sub_descriptions.append(sub_desc)

        # Item link
        item_link = ""
        if h2_tag:
            a_tag = h2_tag.find("a")
            if a_tag and a_tag.get("href"):
                item_link = "https://www.amazon.com" + a_tag["href"]
        item_links.append(item_link)

driver.quit()

# Save to CSV
df = pd.DataFrame({
    "Description": descriptions,
    "Price": prices,
    "Rating": ratings,
    "Reviews Count": review_counts,
    "Image link": image_links,
    "Sub Description": sub_descriptions,
    "Item link": item_links
})

df.to_csv("amazon_educational_insights2.csv", index=False)
print("Scraping complete. Data saved to 'amazon_educational_insights.csv'.")


Scraping complete. Data saved to 'amazon_educational_insights.csv'.
