## Trader Joe's Scraper Example ##

This notebook guides the user through the web-scraping process for extracting prices and recipes from the website of the grocery store Trader Joe's.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [3]:
# First attempt: fetch the produce category page with a plain HTTP GET.
url = "https://www.traderjoes.com/home/products/category/produce"

# requests has no default timeout; without one this cell could block forever
# if the server accepts the connection but never answers.
response = requests.get(url, timeout=30)

# Check that it worked (HTTP 200 = OK)
print(response.status_code)

# Display the first few hundred characters of the HTML
print(response.text[:500])


403
<HTML><HEAD>
<TITLE>Access Denied</TITLE>
</HEAD><BODY>
<H1>Access Denied</H1>
 
You don't have permission to access "http&#58;&#47;&#47;www&#46;traderjoes&#46;com&#47;home&#47;products&#47;category&#47;produce" on this server.<P>
Reference&#32;&#35;18&#46;8609c617&#46;1760809290&#46;35d68d6f
<P>https&#58;&#47;&#47;errors&#46;edgesuite&#46;net&#47;18&#46;8609c617&#46;1760809290&#46;35d68d6f</P>
</BODY>
</HTML>



In [4]:
import requests

# Second attempt: send browser-like headers in case the request is being
# rejected because of the default python-requests User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/122.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.traderjoes.com/"
}

url = "https://www.traderjoes.com/home/products/category/produce"

# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get(url, headers=headers, timeout=30)
print(response.status_code)


403


In [17]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Chrome options
options = Options()
# Uncomment to run Chrome without opening a window:
# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--window-size=1920,1080")

# Initialize WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Everything after the driver is created runs under try/finally so the Chrome
# process is shut down even if navigation or the page read fails.
try:
    # Navigate to the page
    url = "https://www.traderjoes.com/home/products/category/produce"
    driver.get(url)

    # Allow page to fully load
    time.sleep(3)

    # Get the HTML
    html = driver.page_source
    print(html[:1000])  # print first 1000 chars to confirm it loaded
finally:
    # Always quit when done
    driver.quit()


<html lang="en"><head>
    <meta charset="UTF-8">
    <title>Products | Trader Joe's</title><script src="https://static.klaviyo.com/onsite/js/klaviyo.js?company_id=YyBHyr" async="" defer=""></script>
    
    
    <meta name="template" content="product-category-template">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    

    
    


<script type="text/javascript" async="" src="https://www.google-analytics.com/analytics.js"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=G-2HMPBJHQ41&amp;cx=c&amp;gtm=4e5af1h1"></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-PK37XV6"></script><script defer="defer" type="text/javascript" src="/.rum/@adobe/helix-rum-js@%5E2/dist/rum-standalone.js"></script>
<link rel="canonical" href="/home/products/category.html">

    

    
    
    
<link rel="stylesheet" href="/etc.clientlibs/trjo/clientlibs/clientlib-base.lc-af48c5ed2ea1d900b5b89d68fe8fab26-lc

In [8]:
import requests
import json

# Third attempt: POST a GraphQL query straight to the site's /graphql endpoint.
url = "https://www.traderjoes.com/graphql"

# Product search query: one page of products for a store, with pagination info.
query = """
query SearchProducts($pageSize: Int, $currentPage: Int, $storeCode: String, $published: String = "1") {
  products(
    filter: {store_code: {eq: $storeCode}, published: {eq: $published}}
    pageSize: $pageSize
    currentPage: $currentPage
  ) {
    items {
      sku
      item_title
      item_description
      retail_price
      country_of_origin
      availability
      updated_at
    }
    total_count
    page_info {
      current_page
      total_pages
    }
  }
}
"""

# Values bound to the $pageSize / $currentPage / $storeCode query variables.
variables = {
    "pageSize": 50,
    "currentPage": 1,
    "storeCode": "701"
}

headers = {
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0"
}

# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.post(
    url,
    headers=headers,
    json={"query": query, "variables": variables},
    timeout=30,
)

if response.status_code == 200:
    data = response.json()
    print(json.dumps(data, indent=2))
else:
    # Show the status and raw body so the reason for the failure is visible.
    print("Error:", response.status_code, response.text)


Error: 403 <HTML><HEAD>
<TITLE>Access Denied</TITLE>
</HEAD><BODY>
<H1>Access Denied</H1>
 
You don't have permission to access "http&#58;&#47;&#47;www&#46;traderjoes&#46;com&#47;graphql" on this server.<P>
Reference&#32;&#35;18&#46;8609c617&#46;1760809777&#46;35fd0762
<P>https&#58;&#47;&#47;errors&#46;edgesuite&#46;net&#47;18&#46;8609c617&#46;1760809777&#46;35fd0762</P>
</BODY>
</HTML>



In [18]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Imports this cell needs beyond the Selenium ones above it; importing them here
# keeps the cell runnable without relying on names left over from earlier cells.
import time
from pprint import pprint

from selenium.common.exceptions import NoSuchElementException

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Maps each nutrition label to either a plain string (serving info) or a
# {"amount": ..., "%dv": ...} dict (rows of the nutrition table).
nutrition_data = {}

# All browser work runs inside a single try/finally so the Chrome session is
# closed exactly once, whether the scrape succeeds or fails.
try:
    driver.get("https://www.traderjoes.com/home/products/pdp/organic-milk-a2a2-080971")

    driver.implicitly_wait(10)
    time.sleep(5)

    # Extract product details (best effort: report the problem and carry on)
    try:
        product_title = driver.find_element(By.CSS_SELECTOR, "h1.ProductDetails_main__title__14Cnm").text
        product_price = driver.find_element(By.CSS_SELECTOR, "span.ProductPrice_productPrice__price__3-50j").text

        print("Product Title:", product_title)
        print("Price:", product_price)

    except Exception as e:
        print("Error extracting product details:", e)

    # 1. Find the nutrition container
    container = driver.find_element(By.CSS_SELECTOR, "div.NutritionFacts_nutritionFacts__1Nvz0")

    # 2. Extract characteristics (serving size, calories, etc.)
    characteristics = container.find_elements(By.CSS_SELECTOR, "div.Item_characteristics__item__2TgL-")
    for item in characteristics:
        title = item.find_element(By.CSS_SELECTOR, "div.Item_characteristics__title__7nfa8").text
        try:
            nutrition_data[title] = item.find_element(
                By.CSS_SELECTOR, "div.Item_characteristics__text__dcfEC"
            ).text
        except NoSuchElementException:
            # Some entries consist of a title only, with no separate value element.
            nutrition_data[title] = None

    # 3. Extract table data
    rows = container.find_elements(By.CSS_SELECTOR, "table.Item_table__2PMbE tbody tr")
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")
        # Only rows with exactly three cells (nutrient, amount, % daily value) are recorded.
        if len(cells) == 3:
            nutrient = cells[0].text
            amount = cells[1].text
            dv = cells[2].text
            nutrition_data[nutrient] = {"amount": amount, "%dv": dv}

    # Print results
    pprint(nutrition_data)

finally:
    # Close the WebDriver
    driver.quit()

Product Title: Organic Milk A2/A2
Price: $5.99
{'CALORIES PER SERVING': '150',
 'Calcium': {'%dv': '25%', 'amount': '300 mg'},
 'Cholesterol': {'%dv': '10%', 'amount': '30 mg'},
 'Dietary Fiber': {'%dv': '0%', 'amount': '0 g'},
 'Includes': {'%dv': '0%', 'amount': '0 g Added Sugars'},
 'Iron': {'%dv': '0%', 'amount': '0.0 mg'},
 'Potassium': {'%dv': '8%', 'amount': '370 mg'},
 'Protein': {'%dv': '', 'amount': '8 g'},
 'SERVES 8': None,
 'SERVING SIZE': '1 cup(240ml)',
 'Saturated Fat': {'%dv': '23%', 'amount': '4.5 g'},
 'Sodium': {'%dv': '4%', 'amount': '95 mg'},
 'Total Carbohydrate': {'%dv': '4%', 'amount': '11 g'},
 'Total Fat': {'%dv': '12%', 'amount': '9 g'},
 'Total Sugars': {'%dv': '', 'amount': '12 g'},
 'Trans Fat': {'%dv': '', 'amount': '0 g'},
 'Vitamin A': {'%dv': '2%', 'amount': '20 mcg'},
 'Vitamin D': {'%dv': '10%', 'amount': '2.4 mcg'}}


In [29]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
import pandas as pd
import time
import re

def scrape_category(url):
    """Scrape every page of a product category listing into a DataFrame.

    Starts its own Chrome session, loads ``url`` and follows the pagination
    control page by page, recording one row per product card. The browser is
    always closed before the function returns or raises.

    Args:
        url: Address of the category listing page to load.

    Returns:
        pandas.DataFrame built from one dict per product card with the keys
        ``name``, ``price``, ``unit``, ``url`` and ``page``. A value is ``None``
        when the corresponding element is not found on the card.

    Raises:
        selenium.common.exceptions.TimeoutException: if no product cards are
            present within 15 seconds of a page being shown.
    """
    # Imported locally because the cell-level imports do not bring it in.
    from selenium.common.exceptions import NoSuchElementException

    # Lookups on a card fail with one of these when a field is absent or the
    # card was re-rendered; both cases are recorded as a missing value.
    missing_element_errors = (NoSuchElementException, StaleElementReferenceException)

    card_selector = "section.ProductCard_card__4WAOg"
    selected_page_selector = "li.PaginationItem_paginationItem_selected__3BZC-"

    def _text_or_none(card, selector):
        """Return the stripped text of ``selector`` inside ``card``, or None if absent."""
        try:
            return card.find_element(By.CSS_SELECTOR, selector).text.strip()
        except missing_element_errors:
            return None

    chrome_options = Options()
    # Uncomment to run without a visible browser window:
    # chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    all_products = []
    try:
        driver.get(url)

        page = 1
        time.sleep(3)
        while True:
            print(f"Scraping page {page}...")
            WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
            )

            # --- scrape cards ---
            cards = driver.find_elements(By.CSS_SELECTOR, card_selector)
            for card in cards:
                try:
                    name_elem = card.find_element(By.CSS_SELECTOR, "h2 a")
                    name = name_elem.text.strip()
                    link = name_elem.get_attribute("href")
                except missing_element_errors:
                    name = None
                    link = None

                price = _text_or_none(card, "span.ProductPrice_productPrice__price__3-50j")
                unit = _text_or_none(card, "span.ProductPrice_productPrice__unit__2jvkA")

                all_products.append({
                    "name": name, "price": price, "unit": unit, "url": link, "page": page
                })

            # --- pagination handling ---
            try:
                current_page_elem = driver.find_element(By.CSS_SELECTOR, selected_page_selector)
                current_page_text = current_page_elem.text.strip()

                # Try extracting digits from text or aria-label
                match = re.search(r'\d+', current_page_text)
                if not match:
                    aria_label = current_page_elem.get_attribute("aria-label") or ""
                    match = re.search(r'\d+', aria_label)

                if not match:
                    print("⚠️ Could not determine current page number, stopping.")
                    break

                next_page_label = str(int(match.group()) + 1)

                pagination_items = driver.find_elements(
                    By.CSS_SELECTOR, "li.PaginationItem_paginationItem__2f87h"
                )

                # Find the pagination entry whose number is current page + 1.
                next_item = None
                for item in pagination_items:
                    label = item.text.strip() or (item.get_attribute("aria-label") or "")
                    m = re.search(r'\d+', label)
                    if m and m.group() == next_page_label:
                        next_item = item
                        break

                if not next_item:
                    print("✅ No more pages to scrape.")
                    break

                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", next_item)

                # Wait for the selected page to update visually
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, selected_page_selector),
                        next_page_label
                    )
                )

                page += 1
                time.sleep(1)

            except (TimeoutException, StaleElementReferenceException):
                print("⚠️ Pagination ended or element went stale.")
                break
    finally:
        # Release the browser on every exit path, including unexpected errors.
        driver.quit()

    return pd.DataFrame(all_products)


# ---- Run it ----
# Alternative category to try:
# url = "https://www.traderjoes.com/home/products/category/meat-seafood-plant-based-122"
url = "https://www.traderjoes.com/home/products/category/fresh-fruits-veggies-113"
df = scrape_category(url)

# Preview results
print(df.head())

# Save locally. The file name is derived from the URL's last path segment with
# its trailing numeric id removed, so it always matches the category scraped.
category_slug = re.sub(r"-\d+$", "", url.rstrip("/").rsplit("/", 1)[-1])
output_csv = f"traderjoes_{category_slug}_products.csv"
df.to_csv(output_csv, index=False)
print(f"✅ Saved {output_csv}")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
✅ No more pages to scrape.
                          name  price    unit  \
0    Organic Crimini Mushrooms  $2.69   /8 Oz   
1       Organic Sweet Potatoes  $4.99   /3 Lb   
2                 Fresh Fennel  $2.49  /20 Oz   
3                Trimmed Leeks  $2.49   /6 Oz   
4  Steamed & Peeled Baby Beets  $2.29   /8 Oz   

                                                 url  page  
0  https://www.traderjoes.com/home/products/pdp/o...     1  
1  https://www.traderjoes.com/home/products/pdp/o...     1  
2  https://www.traderjoes.com/home/products/pdp/f...     1  
3  https://www.traderjoes.com/home/products/pdp/t...     1  
4  https://www.traderjoes.com/home/products/pdp/s...     1  
✅ Saved traderjoes_fresh-fruits-veggies_products.csv


In [33]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time, re

# ---- Selenium setup ----
# A single Chrome session is created here at module level; scrape_category()
# below reads this `driver` name directly rather than creating its own browser.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# No Service/driver path is passed, so Selenium is left to locate the driver itself.
driver = webdriver.Chrome(options=options)

# ---- Helper function to scrape a single category ----
def scrape_category(url, category_name):
    """Scrape every page of one product category using the shared browser.

    Uses the module-level ``driver`` (it does not open or close a browser).
    Loads ``url`` and follows the pagination control page by page, recording
    one row per product card. Scraping is best effort: if a page shows no
    product cards in time, or pagination fails, a warning is printed and the
    rows collected so far are returned.

    Args:
        url: Address of the category listing page to load.
        category_name: Label stored in the ``category`` column and used in
            progress messages.

    Returns:
        pandas.DataFrame built from one dict per product card with the keys
        ``category``, ``product_name``, ``price``, ``unit`` and ``url``. A
        value is ``None`` when the corresponding element is not found.
    """
    # Imported locally because this cell's imports do not bring them in.
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
        TimeoutException,
    )

    # Lookups on a card fail with one of these when a field is absent or the
    # card was re-rendered; both cases are recorded as a missing value.
    missing_element_errors = (NoSuchElementException, StaleElementReferenceException)

    card_selector = "section.ProductCard_card__4WAOg"
    selected_page_selector = "li.PaginationItem_paginationItem_selected__3BZC-"

    def _text_or_none(card, selector):
        """Return the stripped text of ``selector`` inside ``card``, or None if absent."""
        try:
            return card.find_element(By.CSS_SELECTOR, selector).text.strip()
        except missing_element_errors:
            return None

    driver.get(url)
    time.sleep(3)

    products = []
    page = 1

    while True:
        print(f"Scraping {category_name} - page {page}...")
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
            )
        except TimeoutException:
            # Keep what was collected instead of aborting the whole multi-category run.
            print(f"⚠️ No product cards loaded for {category_name} (page {page}). Stopping.")
            break

        cards = driver.find_elements(By.CSS_SELECTOR, card_selector)

        for card in cards:
            try:
                link_elem = card.find_element(By.CSS_SELECTOR, "h2 a")
                name = link_elem.text.strip()
                link = link_elem.get_attribute("href")
            except missing_element_errors:
                name = None
                link = None

            price = _text_or_none(card, "span.ProductPrice_productPrice__price__3-50j")
            unit = _text_or_none(card, "span.ProductPrice_productPrice__unit__2jvkA")

            products.append({
                "category": category_name,
                "product_name": name,
                "price": price,
                "unit": unit,
                "url": link
            })

        # ---- Pagination ----
        try:
            current_page_elem = driver.find_element(By.CSS_SELECTOR, selected_page_selector)
            current_page_text = current_page_elem.text.strip()
            match = re.search(r'\d+', current_page_text)

            # Fall back to the aria-label when the visible text has no digits.
            if not match:
                aria_label = current_page_elem.get_attribute("aria-label") or ""
                match = re.search(r'\d+', aria_label)

            if not match:
                print("⚠️ Can't determine current page. Stopping pagination.")
                break

            current_page = int(match.group())

            # Find the pagination entry whose number is current page + 1.
            pagination_items = driver.find_elements(By.CSS_SELECTOR, "li.PaginationItem_paginationItem__2f87h")
            next_item = None
            for item in pagination_items:
                label = item.text.strip() or (item.get_attribute("aria-label") or "")
                m = re.search(r'\d+', label)
                if m and int(m.group()) == current_page + 1:
                    next_item = item
                    break

            if not next_item:
                print(f"✅ Finished {category_name}.")
                break

            # Click through JavaScript rather than a native click, then wait
            # until the selected pagination entry shows the new page number.
            driver.execute_script("arguments[0].click();", next_item)
            WebDriverWait(driver, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, selected_page_selector),
                    str(current_page + 1)
                )
            )

            page += 1
            time.sleep(1)

        except Exception as e:
            # Deliberately broad: any pagination failure ends this category only.
            print(f"⚠️ Pagination stopped for {category_name}: {e}")
            break

    return pd.DataFrame(products)


# ---- Categories to scrape ----
# Display name -> category listing URL.
categories = {
    "Bakery": "https://www.traderjoes.com/home/products/category/bakery-11",
    "Cheese": "https://www.traderjoes.com/home/products/category/cheese-29",
    "Dairy & Eggs": "https://www.traderjoes.com/home/products/category/dairy-eggs-44",
    "Dips, Sauces & Dressings": "https://www.traderjoes.com/home/products/category/dips-sauces-dressings-59",
    "Fresh Prepared Foods": "https://www.traderjoes.com/home/products/category/fresh-prepared-foods-80",
    "From the Freezer": "https://www.traderjoes.com/home/products/category/from-the-freezer-95",
    "Fresh Fruits & Veggies": "https://www.traderjoes.com/home/products/category/fresh-fruits-veggies-113",
    "Meat, Seafood & Plant-Based": "https://www.traderjoes.com/home/products/category/meat-seafood-plant-based-122",
    "For the Pantry": "https://www.traderjoes.com/home/products/category/for-the-pantry-137",
    "Snacks & Sweets": "https://www.traderjoes.com/home/products/category/snacks-sweets-167"
}

# ---- Run through all categories ----
all_data = []
try:
    for category, link in categories.items():
        df = scrape_category(link, category)
        all_data.append(df)
finally:
    # Release the shared browser even if a category fails part-way through;
    # previously driver.quit() was only reached when every category succeeded.
    driver.quit()

# ---- Combine & save ----
final_df = pd.concat(all_data, ignore_index=True)
final_df.to_csv("trader_joes_products.csv", index=False)

print("\n Finished scraping to 'trader_joes_products.csv'")


Scraping Bakery - page 1...
Scraping Bakery - page 2...
Scraping Bakery - page 3...
Scraping Bakery - page 4...
Scraping Bakery - page 5...
Scraping Bakery - page 6...
✅ Finished Bakery.
Scraping Cheese - page 1...
Scraping Cheese - page 2...
Scraping Cheese - page 3...
Scraping Cheese - page 4...
Scraping Cheese - page 5...
✅ Finished Cheese.
Scraping Dairy & Eggs - page 1...
Scraping Dairy & Eggs - page 2...
Scraping Dairy & Eggs - page 3...
✅ Finished Dairy & Eggs.
Scraping Dips, Sauces & Dressings - page 1...
Scraping Dips, Sauces & Dressings - page 2...
Scraping Dips, Sauces & Dressings - page 3...
Scraping Dips, Sauces & Dressings - page 4...
Scraping Dips, Sauces & Dressings - page 5...
Scraping Dips, Sauces & Dressings - page 6...
Scraping Dips, Sauces & Dressings - page 7...
✅ Finished Dips, Sauces & Dressings.
Scraping Fresh Prepared Foods - page 1...
Scraping Fresh Prepared Foods - page 2...
Scraping Fresh Prepared Foods - page 3...
Scraping Fresh Prepared Foods - page 4...
