## Trader Joe's Scraper Example ##

This notebook guides the user through the web scraping process for extracting product prices and recipes from the grocery store Trader Joe's.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [None]:
import time
from pprint import pprint

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Filled in below: characteristic title -> text (or None), and
# nutrient name -> {"amount": ..., "%dv": ...} for each three-cell table row.
nutrition_data = {}

# All browser work lives inside one try/finally so Chrome is shut down exactly
# once, whether the scrape succeeds, a lookup raises, or the page fails to load.
try:
    driver.get("https://www.traderjoes.com/home/products/pdp/organic-milk-a2a2-080971")

    # Element lookups poll for up to 10 s; the fixed pause is presumably there
    # to let the page finish rendering before the first lookup.
    driver.implicitly_wait(10)
    time.sleep(5)

    # Extract product details. This part is best-effort: a failure is reported
    # and the nutrition scrape below still runs.
    try:
        product_title = driver.find_element(By.CSS_SELECTOR, "h1.ProductDetails_main__title__14Cnm").text
        product_price = driver.find_element(By.CSS_SELECTOR, "span.ProductPrice_productPrice__price__3-50j").text

        print("Product Title:", product_title)
        print("Price:", product_price)

    except Exception as e:
        print("Error extracting product details:", e)

    # 1. Find the nutrition container (a missing container raises, as before)
    container = driver.find_element(By.CSS_SELECTOR, "div.NutritionFacts_nutritionFacts__1Nvz0")

    # 2. Extract characteristics (serving size, calories, etc.)
    characteristics = container.find_elements(By.CSS_SELECTOR, "div.Item_characteristics__item__2TgL-")
    for item in characteristics:
        title_elem = item.find_element(By.CSS_SELECTOR, "div.Item_characteristics__title__7nfa8")
        try:
            text_elem = item.find_element(By.CSS_SELECTOR, "div.Item_characteristics__text__dcfEC")
            nutrition_data[title_elem.text] = text_elem.text
        except NoSuchElementException:
            # A characteristic with a title but no text block is kept as None.
            nutrition_data[title_elem.text] = None

    # 3. Extract table data; rows that do not have exactly three cells are skipped
    rows = container.find_elements(By.CSS_SELECTOR, "table.Item_table__2PMbE tbody tr")
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) == 3:
            nutrient = cells[0].text
            amount = cells[1].text
            dv = cells[2].text
            nutrition_data[nutrient] = {"amount": amount, "%dv": dv}

    # Print results
    pprint(nutrition_data)

finally:
    # Close the WebDriver
    driver.quit()

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
import pandas as pd
import time
import re

def scrape_category(url):
    """Scrape every product card from a paginated Trader Joe's category page.

    Launches its own Chrome session, walks the numbered pagination one page
    at a time and records one row per product card. The browser is always
    shut down before the function returns or raises.

    Args:
        url: Address of the category listing to start from.

    Returns:
        A pandas DataFrame with one row per card and the columns ``name``,
        ``price``, ``unit``, ``url`` and ``page``. A field that could not be
        read from a card is ``None``. If the cards never appear on some page,
        scraping stops there and the rows collected so far are returned.
    """
    # Imported here so the function does not depend on which exception names
    # the surrounding cell happened to import.
    from selenium.common.exceptions import NoSuchElementException

    card_selector = "section.ProductCard_card__4WAOg"
    selected_selector = "li.PaginationItem_paginationItem_selected__3BZC-"
    lookup_errors = (NoSuchElementException, StaleElementReferenceException)

    def read_text(card, selector):
        """Return the stripped text of ``selector`` inside ``card``, or None."""
        try:
            return card.find_element(By.CSS_SELECTOR, selector).text.strip()
        except lookup_errors:
            return None

    chrome_options = Options()
    # Headless mode is left off; add "--headless=new" to run without a window.
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    all_products = []
    try:
        driver.get(url)
        page = 1
        time.sleep(3)

        while True:
            print(f"Scraping page {page}...")
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
                )
            except TimeoutException:
                # Keep what was already collected instead of losing it to an
                # uncaught timeout on a later page.
                print(f"⚠️ No product cards appeared on page {page}, stopping.")
                break

            # --- scrape cards ---
            for card in driver.find_elements(By.CSS_SELECTOR, card_selector):
                try:
                    name_elem = card.find_element(By.CSS_SELECTOR, "h2 a")
                    name = name_elem.text.strip()
                    link = name_elem.get_attribute("href")
                except lookup_errors:
                    name = None
                    link = None

                all_products.append({
                    "name": name,
                    "price": read_text(card, "span.ProductPrice_productPrice__price__3-50j"),
                    "unit": read_text(card, "span.ProductPrice_productPrice__unit__2jvkA"),
                    "url": link,
                    "page": page,
                })

            # --- pagination handling ---
            try:
                current_page_elem = driver.find_element(By.CSS_SELECTOR, selected_selector)

                # The page number is read from the visible text, falling back
                # to the aria-label when the text holds no digits.
                match = re.search(r'\d+', current_page_elem.text.strip())
                if not match:
                    aria_label = current_page_elem.get_attribute("aria-label") or ""
                    match = re.search(r'\d+', aria_label)

                if not match:
                    print("⚠️ Could not determine current page number, stopping.")
                    break

                next_page_label = str(int(match.group()) + 1)

                pagination_items = driver.find_elements(
                    By.CSS_SELECTOR, "li.PaginationItem_paginationItem__2f87h"
                )

                next_item = None
                for item in pagination_items:
                    label = item.text.strip() or (item.get_attribute("aria-label") or "")
                    m = re.search(r'\d+', label)
                    if m and m.group() == next_page_label:
                        next_item = item
                        break

                if not next_item:
                    print("✅ No more pages to scrape.")
                    break

                # Clicked through JavaScript rather than WebElement.click().
                driver.execute_script("arguments[0].click();", next_item)

                # Wait for the selected page to update visually
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, selected_selector),
                        next_page_label
                    )
                )

                page += 1
                time.sleep(1)

            except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
                # NoSuchElementException covers a listing with no selected
                # pagination item at all, so the collected rows are still returned.
                print("⚠️ Pagination ended or element went stale.")
                break
    finally:
        # Runs on every exit path so a failed scrape cannot leave Chrome running.
        driver.quit()

    return pd.DataFrame(all_products)


# ---- Run it ----
# Scrape one category listing into a DataFrame, preview it, and save it.
# To scrape a different department, swap in another category URL, e.g.
# "https://www.traderjoes.com/home/products/category/meat-seafood-plant-based-122".
url = "https://www.traderjoes.com/home/products/category/fresh-fruits-veggies-113"
df = scrape_category(url)

# Preview results (first rows only)
print(df.head())

# Save locally: the path is relative, so the file lands in the current working
# directory; index=False leaves the row index out of the CSV.
df.to_csv("traderjoes_fresh-fruits-veggies_products.csv", index=False)
print("✅ Saved traderjoes_fresh-fruits-veggies_products.csv")


In [15]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time, re

# ---- Site addresses ----
# Root of the site, and the recipe index page derived from it.
BASE_URL = "https://www.traderjoes.com"
RECIPES_URL = f"{BASE_URL}/home/recipes"

# ---- Shared browser session ----
# One module-level Chrome driver, started maximized, that the recipe-scraping
# functions in this cell use directly.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)

# ---- Helper: scrape ingredients from a recipe in a new tab ----
def scrape_ingredients_and_details(recipe_url):
    """Open a recipe in a temporary tab and read its ingredients and metadata.

    Uses the module-level ``driver``. The window that is current when the
    function is called is remembered and made current again before returning,
    and only tabs opened by this call are closed.

    Args:
        recipe_url: Address of the recipe page to open.

    Returns:
        A tuple ``(ingredients, serves, time_str)``. ``ingredients`` is a list
        of the stripped text of each ingredient list item (empty when the
        page could not be scraped). ``serves`` and ``time_str`` are the text of
        the detail items starting with "serves" / "time" (case-insensitive),
        or ``None`` when no such item was found.
    """
    serves = None
    time_str = None
    ingredients = []

    # Remember the listing window by handle instead of relying on the order of
    # driver.window_handles, which is not something to depend on.
    listing_handle = driver.current_window_handle
    handles_before = set(driver.window_handles)

    try:
        # Open in new tab
        driver.execute_script("window.open(arguments[0], '_blank');", recipe_url)
        opened = [h for h in driver.window_handles if h not in handles_before]
        if not opened:
            raise RuntimeError("the new tab did not open")
        driver.switch_to.window(opened[-1])

        # Wait for ingredient list
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "ul.IngredientsList_ingredientsList__1LoAJ li"))
        )

        # --- Ingredients ---
        ingredients_elems = driver.find_elements(By.CSS_SELECTOR, "ul.IngredientsList_ingredientsList__1LoAJ li")
        ingredients = [i.text.strip() for i in ingredients_elems]

        # --- Serves / Time ---
        try:
            meta_items = driver.find_elements(By.CSS_SELECTOR, "span.RecipeDetails_recipeDetails__complexityItem__2X49n")
            for item in meta_items:
                text = item.text.strip()
                if text.lower().startswith("serves"):
                    serves = text
                elif text.lower().startswith("time"):
                    time_str = text
        except Exception as e:
            # Missing metadata must not discard the ingredients already read.
            print(f"⚠️ Could not find serves/time info for {recipe_url}: {e}")

        time.sleep(1)

    except Exception as e:
        print(f"⚠️ Could not scrape ingredients/details from {recipe_url}: {e}")

    finally:
        # Close only the tab(s) this call opened. An unconditional
        # driver.close() would close the listing window itself whenever the
        # new tab failed to open.
        for handle in driver.window_handles:
            if handle not in handles_before:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(listing_handle)
        time.sleep(1)

    return ingredients, serves, time_str

# ---- Main: scrape all recipes ----
def _read_recipe_card(card, index):
    """Return title, category, url and image_url of one recipe card (None when unreadable)."""
    from selenium.common.exceptions import WebDriverException

    # Extract title
    try:
        title = card.find_element(By.CSS_SELECTOR, "h3.RecipeGridCard_recipe__title__3-8S-").text.strip()
    except WebDriverException:
        title = None
        print(f"No title found for card {index}")

    # Extract category
    try:
        category = card.find_element(By.CSS_SELECTOR, "p.RecipeGridCard_recipe__categories__3b5AM").text.strip()
    except WebDriverException:
        category = None

    # Extract link; a site-relative href is made absolute
    try:
        link = card.get_attribute("href")
        if link and link.startswith("/"):
            link = BASE_URL + link
    except WebDriverException:
        link = None

    # Extract image
    try:
        img_elem = card.find_element(By.CSS_SELECTOR, "div.RecipeGridCard_recipe__img__1hv4j img")
        img_url = img_elem.get_attribute("src")
    except WebDriverException:
        img_url = None

    return {"title": title, "category": category, "url": link, "image_url": img_url}


def _click_next_recipes_page(page):
    """Click through to the page after ``page``; return False when there is none."""
    # Try to find a "Next" button first
    next_buttons = driver.find_elements(By.CSS_SELECTOR, "li.PaginationItem_paginationItem__2f87h button")
    for btn in next_buttons:
        aria_label = btn.get_attribute("aria-label")
        if not (aria_label and "Next" in aria_label):
            continue
        # A disabled "Next" must not be clicked: the click would change
        # nothing and the caller would re-scrape the same page forever.
        # NOTE(review): how the site marks a disabled button is not visible
        # here, so both the enabled state and aria-disabled are checked.
        if btn.is_enabled() and btn.get_attribute("aria-disabled") != "true":
            driver.execute_script("arguments[0].click();", btn)
            return True
        break

    # Fallback: numbered pagination
    pagination_items = driver.find_elements(By.CSS_SELECTOR, "li.PaginationItem_paginationItem__2f87h")
    if not pagination_items:
        print("✅ No pagination found — done!")
        return False

    selected = None
    for p in pagination_items:
        # get_attribute may return None when the attribute is absent.
        classes = p.get_attribute("class") or ""
        if "PaginationItem_paginationItem_selected" in classes:
            selected = p
            break

    # The caller's own page counter is used when the selected item has no digits.
    text = selected.text.strip() if selected else ""
    match = re.search(r"\d+", text)
    current_page_num = int(match.group()) if match else page
    next_page_num = current_page_num + 1

    for item in pagination_items:
        match = re.search(r"\d+", item.text.strip())
        if match and int(match.group()) == next_page_num:
            driver.execute_script("arguments[0].click();", item)
            return True

    print("✅ No next page found — done!")
    return False


def scrape_all_recipes():
    """Scrape every recipe reachable from the recipe index page.

    Uses the module-level ``driver``: loads ``RECIPES_URL``, reads each recipe
    card on the page, opens each recipe through
    ``scrape_ingredients_and_details`` and then moves to the next page until
    no further page can be reached.

    Returns:
        A pandas DataFrame with one row per recipe card and the columns
        ``title``, ``category``, ``url``, ``image_url``, ``serves``, ``time``
        and ``ingredients``. If the cards never appear on some page, scraping
        stops there and the recipes collected so far are returned.
    """
    # Imported here because this cell's own imports do not include it.
    from selenium.common.exceptions import TimeoutException

    card_selector = "a.RecipeGridCard_recipe__1Wo__"

    driver.get(RECIPES_URL)
    time.sleep(3)

    recipes = []
    page = 1

    while True:
        print(f"📄 Scraping page {page}...")

        # Wait for recipe cards to load
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
            )
        except TimeoutException:
            # Return what has been gathered rather than losing it to a
            # timeout on a later page.
            print(f"⚠️ No recipe cards appeared on page {page} — stopping.")
            break

        card_count = len(driver.find_elements(By.CSS_SELECTOR, card_selector))

        # Cards are looked up by index, not iterated directly, because each
        # one is re-fetched after the previous recipe's tab has been opened
        # and closed, in case the DOM changed.
        for i in range(card_count):
            cards = driver.find_elements(By.CSS_SELECTOR, card_selector)
            if i >= len(cards):
                print(f"⚠️ Card index {i} out of range — stopping early.")
                break

            info = _read_recipe_card(cards[i], i)
            link = info["url"]

            # Scrape ingredients from link
            ingredients, serves, time_str = scrape_ingredients_and_details(link) if link else ([], None, None)

            recipes.append({
                **info,
                "serves": serves,
                "time": time_str,
                "ingredients": ingredients
            })

        # ---- Pagination ----
        try:
            time.sleep(2)
            if not _click_next_recipes_page(page):
                break
            page += 1
            print(f"➡️ Going to page {page}...")
            time.sleep(3)
        except Exception as e:
            print(f"⚠️ Pagination ended: {e}")
            break

    return pd.DataFrame(recipes)

# ---- Run scraper ----
# The try/finally guarantees the shared browser is shut down even when
# scraping or saving raises; otherwise the Chrome session would be left running.
try:
    recipes_df = scrape_all_recipes()

    # ---- Save to CSV ----
    recipes_df.to_csv("trader_joes_recipes.csv", index=False)
    print("✅ Saved recipes to trader_joes_recipes.csv")
finally:
    driver.quit()


📄 Scraping page 1...
➡️ Going to page 2...
📄 Scraping page 2...
➡️ Going to page 3...
📄 Scraping page 3...
➡️ Going to page 4...
📄 Scraping page 4...
➡️ Going to page 5...
📄 Scraping page 5...
➡️ Going to page 6...
📄 Scraping page 6...
➡️ Going to page 7...
📄 Scraping page 7...
➡️ Going to page 8...
📄 Scraping page 8...
➡️ Going to page 9...
📄 Scraping page 9...
➡️ Going to page 10...
📄 Scraping page 10...
➡️ Going to page 11...
📄 Scraping page 11...
➡️ Going to page 12...
📄 Scraping page 12...
➡️ Going to page 13...
📄 Scraping page 13...
➡️ Going to page 14...
📄 Scraping page 14...
➡️ Going to page 15...
📄 Scraping page 15...
➡️ Going to page 16...
📄 Scraping page 16...
➡️ Going to page 17...
📄 Scraping page 17...
➡️ Going to page 18...
📄 Scraping page 18...
➡️ Going to page 19...
📄 Scraping page 19...
➡️ Going to page 20...
📄 Scraping page 20...
➡️ Going to page 21...
📄 Scraping page 21...
➡️ Going to page 22...
📄 Scraping page 22...
➡️ Going to page 23...
📄 Scraping page 23...
➡️ Go