# Carrefour Product Scraper

This notebook scrapes product information from Carrefour's website, including:
- Product image
- Product link
- Product name
- Price (current and original)
- Labels/tags
- Product description
- Brand

In [27]:
# Import necessary libraries
# Standard library
import json
import logging
import os
import re
import time
from datetime import datetime
from urllib.parse import parse_qs, urljoin, urlparse

# Third-party
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [28]:
# Build the option set for the notebook-level headless Chrome instance
chrome_options = Options()
for _chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--window-size=1920,1080",
    "--disable-extensions",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    chrome_options.add_argument(_chrome_flag)

# Start the module-level browser; the scraping functions below read this `driver` name
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

In [29]:
# Single-page scraper with retry/backoff and a Selenium-based fallback extractor
def scrape_page(url, max_retries=3, initial_delay=3):
    """
    Scrape every product card found on a single listing page.

    Uses the module-level ``driver``. The page is loaded once, plus up to
    ``max_retries`` further times if loading or parsing raises; the pause
    between attempts starts at ``initial_delay`` seconds and grows by a
    factor of 1.5 after each failure.

    Args:
        url: Listing page to load.
        max_retries: Number of retries after the first attempt
            (total attempts = ``max_retries + 1``).
        initial_delay: Seconds to let the page render, and the first
            back-off delay.

    Returns:
        A list of product dicts with the keys ``link``, ``image_url``,
        ``name``, ``price``, ``old_price``, ``label``, ``brand`` and
        ``description`` (missing values are ``None``). If BeautifulSoup sees
        no cards but Selenium does, the result of
        ``extract_products_with_selenium`` is returned instead. An empty list
        is returned when the page has no products or every attempt failed.
    """
    card_class = "category-categoryItem-7pb"
    backoff_factor = 1.5
    current_delay = initial_delay
    total_attempts = max_retries + 1

    for attempt in range(1, total_attempts + 1):
        try:
            # Start retries from a clean cookie jar
            if attempt > 1:
                driver.delete_all_cookies()

            driver.get(url)

            # 10s on the first attempt, 15s on every retry
            wait_time = 10 if attempt == 1 else 15

            try:
                WebDriverWait(driver, wait_time).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, card_class))
                )
            except Exception:
                # No cards showed up: distinguish an empty/error page from a real failure
                body_text = driver.find_element(By.TAG_NAME, "body").text
                if "No products found" in body_text or "Please try again" in body_text:
                    print("Page indicates no products available")
                    return []
                raise  # Not an empty-results page: let the retry logic handle it

            # Let the page fully render and JS execute
            time.sleep(current_delay)

            # Scroll in two steps so lazy-loaded content is requested
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
            time.sleep(1)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            product_cards = soup.find_all("div", class_=card_class)

            # If the parsed HTML has no cards, double-check against the live DOM
            if not product_cards:
                try:
                    selenium_cards = driver.find_elements(By.CLASS_NAME, card_class)
                    if selenium_cards:
                        print(f"BeautifulSoup couldn't find cards but Selenium found {len(selenium_cards)}")
                        return extract_products_with_selenium(driver)
                except Exception as selenium_err:
                    print(f"Selenium fallback also failed: {selenium_err}")

            products = [_parse_listing_card(card) for card in product_cards]

            if products:
                print(f"Successfully scraped {len(products)} products")
                return products

            # No cards and no error: most likely past the last page
            print("No products found on page, might be end of pagination")
            return []

        except Exception as e:
            print(f"Error during page scraping (attempt {attempt}/{total_attempts}): {str(e)}")

            if attempt < total_attempts:
                print(f"Waiting {current_delay}s before retrying...")
                time.sleep(current_delay)
                current_delay *= backoff_factor  # Increase delay for next retry
            else:
                print(f"Failed to scrape after {total_attempts} attempts")
                return []

    # Only reached when max_retries is negative and no attempt was made
    return []


def _listing_card_price(container):
    """Format a price container as '<integer>.<decimal> <currency>', or None if any part is missing."""
    if container is None:
        return None
    parts = [
        container.find("span", class_=css_class)
        for css_class in ("item-miniInteger-NhR", "item-miniDecimal-Cwx", "item-miniCurrency-sAq")
    ]
    if any(part is None for part in parts):
        return None
    integer, decimal, currency = parts
    return f"{integer.text}.{decimal.text} {currency.text}"


def _parse_listing_card(card, site_root="https://www.carrefour.tn"):
    """Turn one BeautifulSoup product card into a dict that always has the same keys."""
    link_elem = card.find("a", class_="galleryItemExtend-images-Rt2")
    image_elem = card.find("img", class_="image-loaded-QS8")
    name_elem = card.find("span", class_="item-name-LPg")
    brand_elem = card.find("div", class_="item-carrefourLabel-AeJ")
    labels_container = card.find("div", class_="productLabels-root--7k")
    desc_elem = card.find("div", class_="item-description-oxA")
    rich_desc = desc_elem.find("div", class_="richContent-root-Ddk") if desc_elem is not None else None

    href = link_elem.get("href") if link_elem is not None else None
    src = image_elem.get("src") if image_elem is not None else None
    has_label = labels_container is not None and bool(
        labels_container.find_all("div", class_="label-root-K4m")
    )

    return {
        # urljoin copes with relative paths as well as already-absolute URLs
        'link': urljoin(site_root, href) if href else None,
        # The query string is dropped from the image URL
        'image_url': urljoin(site_root, src.split('?')[0]) if src else None,
        'name': name_elem.text.strip() if name_elem is not None else None,
        'price': _listing_card_price(card.find("div", class_="item-priceReduction-hBy")),
        'old_price': _listing_card_price(card.find("div", class_="item-oldPrice-x2a")),
        # Any label element inside the labels container is reported as "Promo"
        'label': "Promo" if has_label else None,
        'brand': brand_elem.text.strip() if brand_elem is not None else None,
        'description': rich_desc.text.strip() if rich_desc is not None else None,
    }

def extract_products_with_selenium(driver):
    """
    Fallback extraction that reads product cards from the live DOM via Selenium.

    Only ``link``, ``image_url`` and ``name`` are extracted here; a card is
    kept only when both a name and a link were found.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        A list of product dicts (possibly empty). Errors while locating the
        cards are printed and result in whatever was collected so far.
    """
    site_root = "https://www.carrefour.tn"
    products = []

    try:
        cards = driver.find_elements(By.CLASS_NAME, "category-categoryItem-7pb")

        for card in cards:
            product = {}

            # get_attribute('href'/'src') already yields a resolved absolute URL,
            # so join instead of blindly prefixing the site root.
            href = _selenium_card_value(card, "galleryItemExtend-images-Rt2", "href")
            if href:
                product['link'] = urljoin(site_root, href)

            src = _selenium_card_value(card, "image-loaded-QS8", "src")
            if src:
                product['image_url'] = urljoin(site_root, src.split('?')[0])

            name = _selenium_card_value(card, "item-name-LPg")
            if name is not None:
                product['name'] = name.strip()

            # NOTE(review): price, brand, description and label are not extracted
            # in this fallback path.

            if 'name' in product and 'link' in product:  # Only keep cards with name and link
                products.append(product)

    except Exception as e:
        print(f"Selenium extraction failed: {e}")

    return products


def _selenium_card_value(card, css_class, attribute=None):
    """Best-effort read of one field from a card element; None if it cannot be read."""
    try:
        element = card.find_element(By.CLASS_NAME, css_class)
        return element.get_attribute(attribute) if attribute else element.text
    except Exception:
        # Deliberately best-effort: a missing or stale sub-element just leaves the field out
        return None

In [30]:
def dismiss_cookie_banner(driver, max_attempts=3):
    """
    Find and dismiss the cookie banner/popup if present.

    Waits up to 5 seconds per attempt for the "continue without accepting"
    button to become clickable, then clicks it (native click first,
    JavaScript click as a fallback).

    Args:
        driver: Selenium WebDriver with the page already loaded.
        max_attempts: How many times to look for the button.

    Returns:
        True if the button was found and clicked, False otherwise.
    """
    for attempt in range(max_attempts):
        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "cookies-notacceptBtn-wQ1"))
            )

            print("Found cookie consent banner, attempting to dismiss...")

            # Bring the button into view before clicking
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", cookie_button)
            time.sleep(1)

            try:
                cookie_button.click()
            except Exception:
                # Native click failed (e.g. intercepted): fall back to a JavaScript click
                driver.execute_script("arguments[0].click();", cookie_button)

            print("Successfully dismissed cookie banner")
            time.sleep(2)  # Wait for banner to disappear
            return True

        except Exception:
            # Button not found or click failed: the banner may simply not be present
            if attempt == max_attempts - 1:
                print("No cookie banner found or failed to dismiss it")
            else:
                time.sleep(1)  # Pause only when another attempt follows

    return False

In [31]:
def click_load_more_button(driver, max_attempts=2):
    """
    Click the "PRODUITS SUIVANTS" (load more) button and wait for new cards.

    The number of product cards is recorded immediately before the click and
    the function then waits (up to 5 seconds) until the page shows more cards
    than that.

    Args:
        driver: Selenium WebDriver positioned on a category page.
        max_attempts: How many times to try locating and clicking the button.

    Returns:
        True if the button was clicked and the card count grew; False if the
        located button does not carry the "PRODUITS SUIVANTS" text or every
        attempt failed.
    """
    card_class = "category-categoryItem-7pb"

    for attempt in range(max_attempts):
        try:
            load_more_button = WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.CLASS_NAME, "category-buttonsPag-jTw"))
            )

            if "PRODUITS SUIVANTS" not in load_more_button.text:
                return False

            # Baseline taken right before the click, so the wait below really
            # measures growth caused by this click.
            count_before = len(driver.find_elements(By.CLASS_NAME, card_class))

            # JavaScript click avoids overlay/intercept issues with native clicks
            driver.execute_script("arguments[0].click();", load_more_button)

            WebDriverWait(driver, 5).until(
                lambda d: len(d.find_elements(By.CLASS_NAME, card_class)) > count_before
            )
            return True

        except Exception:
            if attempt < max_attempts - 1:
                time.sleep(1)

    return False

In [None]:
def save_progress(category_name, products, current_attempt, save_dir):
    """
    Write the products collected so far and a small progress record to disk.

    Creates ``save_dir`` if needed. When ``products`` is non-empty it is
    written to ``<category_name>_products.csv``; ``progress.json`` is always
    (re)written with the category, timestamp, attempt number, product count
    and the status ``"in_progress"``.
    """
    os.makedirs(save_dir, exist_ok=True)
    product_total = len(products)

    # The CSV is only produced when there is something to store
    if products:
        csv_target = os.path.join(save_dir, f"{category_name}_products.csv")
        pd.DataFrame(products).to_csv(csv_target, index=False)

    snapshot = dict(
        category=category_name,
        last_updated=datetime.now().isoformat(),
        current_attempt=current_attempt,
        products_count=product_total,
        status="in_progress",
    )

    progress_target = os.path.join(save_dir, "progress.json")
    with open(progress_target, 'w') as handle:
        handle.write(json.dumps(snapshot, indent=2))

    print(f"Progress saved: {product_total} products from {category_name} (attempt {current_attempt})")

In [32]:
def scrape_category_with_load_more(base_url, category_name=None, max_attempts=100):
    """
    Scrape a whole category by repeatedly clicking the "load more" button.

    Uses the module-level ``driver``. After each successful click only the
    cards beyond the previously seen count are parsed, and products are
    de-duplicated by link. Progress is written with ``save_progress`` every
    fifth round and once more when the function finishes (also on error).

    Args:
        base_url: Category listing URL.
        category_name: Name used for the output folder and files; derived
            from the URL's file name when omitted.
        max_attempts: Upper bound on the number of "load more" clicks.

    Returns:
        List of product dicts with the keys ``link``, ``image_url``, ``name``,
        ``price``, ``old_price``, ``label``, ``brand`` and ``description``
        (missing values are ``None``). An empty list is returned if the first
        page never shows a product card.
    """
    card_class = "category-categoryItem-7pb"

    if not category_name:
        category_name = os.path.splitext(os.path.basename(urlparse(base_url).path))[0]

    print(f"\n{'='*50}")
    print(f"Starting to scrape category: {category_name}")
    print(f"{'='*50}")

    save_dir = os.path.join("carrefour_data", category_name)
    os.makedirs(save_dir, exist_ok=True)

    driver.get(base_url)
    all_products = []
    current_attempt = 0
    previous_count = 0
    no_new_products_streak = 0
    product_links_set = set()  # Links already collected, for O(1) duplicate checks

    # The cookie banner is only handled once, right after the first load
    dismiss_cookie_banner(driver)

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, card_class))
        )

        # Initial scroll to trigger lazy loading
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

        print("Initial page loaded successfully")
    except Exception as e:
        print(f"Error loading initial page: {e}")
        return []

    try:
        while current_attempt < max_attempts:
            current_product_count = len(driver.find_elements(By.CLASS_NAME, card_class))
            print(f"Found {current_product_count} products on the page")

            # Stop after two consecutive rounds without growth
            if current_product_count <= previous_count:
                no_new_products_streak += 1
                if no_new_products_streak >= 2:
                    print("No new products after multiple attempts. All products likely scraped.")
                    break
            else:
                no_new_products_streak = 0

            # Parse only when the page grew, or when nothing has been collected yet
            if current_product_count > previous_count or not all_products:
                start_idx = min(previous_count, current_product_count)

                soup = BeautifulSoup(driver.page_source, 'html.parser')
                product_cards = soup.find_all("div", class_=card_class)

                new_products = []
                for card in product_cards[start_idx:]:
                    product = _parse_category_card(card)
                    link = product['link']

                    # Cards without a link cannot be de-duplicated and are skipped
                    if link is None or link in product_links_set:
                        continue

                    product_links_set.add(link)
                    new_products.append(product)

                if new_products:
                    all_products.extend(new_products)
                    print(f"Added {len(new_products)} new unique products")
                else:
                    no_new_products_streak += 1  # The page grew but yielded nothing new

            # Periodic checkpoint: every fifth round
            if current_attempt % 5 == 0:
                save_progress(category_name, all_products, current_attempt + 1, save_dir)

            if click_load_more_button(driver):
                print("Successfully clicked the 'Load More' button")
                previous_count = current_product_count
                current_attempt += 1

                # Alternate between a 1s and a 2s pause
                delay = 1 + (current_attempt % 2)
                print(f"Waiting {delay} seconds before continuing...")
                time.sleep(delay)
            else:
                print("Could not click 'Load More' button or it doesn't exist. Ending scraping.")
                break

    except Exception as e:
        print(f"Error during category scraping: {e}")
    finally:
        # Always save final progress
        save_progress(category_name, all_products, current_attempt + 1, save_dir)

    print(f"Completed scraping category {category_name}: {len(all_products)} total products")
    return all_products


def _category_card_price(container):
    """Format a price container as '<integer>.<decimal> <currency>', or None if any part is missing."""
    if container is None:
        return None
    parts = [
        container.find("span", class_=css_class)
        for css_class in ("item-miniInteger-NhR", "item-miniDecimal-Cwx", "item-miniCurrency-sAq")
    ]
    if any(part is None for part in parts):
        return None
    integer, decimal, currency = parts
    return f"{integer.text}.{decimal.text} {currency.text}"


def _parse_category_card(card, site_root="https://www.carrefour.tn"):
    """Turn one BeautifulSoup product card into a dict that always has the same keys."""
    link_elem = card.find("a", class_="galleryItemExtend-images-Rt2")
    image_elem = card.find("img", class_="image-loaded-QS8")
    name_elem = card.find("span", class_="item-name-LPg")
    brand_elem = card.find("div", class_="item-carrefourLabel-AeJ")
    labels_container = card.find("div", class_="productLabels-root--7k")
    desc_elem = card.find("div", class_="item-description-oxA")
    rich_desc = desc_elem.find("div", class_="richContent-root-Ddk") if desc_elem is not None else None

    href = link_elem.get("href") if link_elem is not None else None
    src = image_elem.get("src") if image_elem is not None else None
    has_label = labels_container is not None and bool(
        labels_container.find_all("div", class_="label-root-K4m")
    )

    return {
        # urljoin copes with relative paths as well as already-absolute URLs
        'link': urljoin(site_root, href) if href else None,
        # The query string is dropped from the image URL
        'image_url': urljoin(site_root, src.split('?')[0]) if src else None,
        'name': name_elem.text.strip() if name_elem is not None else None,
        'price': _category_card_price(card.find("div", class_="item-priceReduction-hBy")),
        'old_price': _category_card_price(card.find("div", class_="item-oldPrice-x2a")),
        # Any label element inside the labels container is reported as "Promo"
        'label': "Promo" if has_label else None,
        'brand': brand_elem.text.strip() if brand_elem is not None else None,
        'description': rich_desc.text.strip() if rich_desc is not None else None,
    }

In [33]:
# Main execution cell: category list, per-category driver, and summary report
# List of category URLs to scrape
# Category pages live either directly under the site root or under /maison-et-loisirs/
_SITE_ROOT = "https://www.carrefour.tn"

_ROOT_CATEGORY_SLUGS = (
    "soins-solaires",
    "nos-recettes",
    "le-marche",
    "surgeles",
    "cremerie-et-produits-laitiers",
    "boissons",
    "epicerie-sucree",
    "bio-sans-gluten-et-dietetique",
    "entretien-et-nettoyage",
    "hygiene-et-beaute",
    "bebe",
    "animalerie",
    "jeux-et-jouets",
)

_HOME_AND_LEISURE_SLUGS = (
    "cuisine",
    "gros-electromenager",
    "image-et-son",
    "informatique",
    "smartphones-et-objets-connectes",
    "entretien-de-la-maison",
    "beaute-et-sante",
    "maison-et-decoration",
    "brico-et-auto",
    "jardin-et-amenagement-dexterieur",
    "mode-et-bagagerie",
    "bebe-11",
    "sport-et-loisirs",
    "culture-et-fournitures-scolaires",
)

# Same URLs, same order: root-level categories first, then the home & leisure ones
category_urls = [
    f"{_SITE_ROOT}/{slug}.html" for slug in _ROOT_CATEGORY_SLUGS
] + [
    f"{_SITE_ROOT}/maison-et-loisirs/{slug}.html" for slug in _HOME_AND_LEISURE_SLUGS
]

# Main execution with browser restart after each category
def main():
    """
    Scrape every URL in ``category_urls``, one fresh browser per category.

    Categories already marked ``"completed"`` in
    ``carrefour_data/summary.json`` are skipped. For every other category a
    new headless Chrome is started and bound to the module-level ``driver``
    name (the one ``scrape_category_with_load_more`` reads) for the duration
    of that category; afterwards the previous binding is restored and the
    per-category browser is closed. The summary file is rewritten after each
    category, whether it succeeded or failed.

    Returns:
        Dict mapping category name to its summary record (``url``,
        ``status``, ``last_updated`` and either ``products_count`` or
        ``error``).
    """
    # The scraping helpers look up `driver` at module level, so it must be rebound there
    global driver

    os.makedirs("carrefour_data", exist_ok=True)

    logging.basicConfig(
        filename="carrefour_scraper.log",
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    summary_file = os.path.join("carrefour_data", "summary.json")
    all_categories_data = {}

    if os.path.exists(summary_file):
        try:
            with open(summary_file, 'r') as f:
                all_categories_data = json.load(f)
            print(f"Loaded previous summary data with {len(all_categories_data)} categories")
        except Exception as e:
            print(f"Error loading summary file: {e}")

    def _write_summary():
        """Persist the current per-category summary."""
        with open(summary_file, 'w') as f:
            json.dump(all_categories_data, f, indent=2)

    # Remember whatever browser the notebook had before, to restore it afterwards
    notebook_driver = globals().get("driver")

    start_time = datetime.now()
    logging.info(f"Starting scraping job at {start_time}")
    print(f"Starting scraping job at {start_time}")

    total_products = 0

    for i, url in enumerate(category_urls):
        category_name = os.path.splitext(os.path.basename(urlparse(url).path))[0]

        # Skip if already completed successfully
        previous_record = all_categories_data.get(category_name)
        if previous_record is not None and previous_record.get("status") == "completed":
            print(f"Skipping {category_name} - already completed")
            total_products += previous_record.get("products_count", 0)
            continue

        print(f"\nProcessing category {i+1}/{len(category_urls)}: {category_name}")
        logging.info(f"Starting category: {category_name} ({url})")

        category_driver = None
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-notifications")
            # NOTE(review): --disable-features is passed twice; confirm whether Chrome
            # honours both or only the last occurrence.
            chrome_options.add_argument("--disable-features=NetworkService")
            chrome_options.add_argument("--disable-features=VizDisplayCompositor")

            service = Service(ChromeDriverManager().install())
            category_driver = webdriver.Chrome(service=service, options=chrome_options)
            category_driver.set_page_load_timeout(30)

            # Make the fresh browser the one the scraping functions actually use
            driver = category_driver

            products = scrape_category_with_load_more(url, category_name)

            all_categories_data[category_name] = {
                "url": url,
                "products_count": len(products),
                "last_updated": datetime.now().isoformat(),
                "status": "completed"
            }

            total_products += len(products)
            _write_summary()

            print(f"Completed category: {category_name} with {len(products)} products")
            logging.info(f"Completed category: {category_name} with {len(products)} products")

        except Exception as e:
            error_msg = f"Error processing category {category_name}: {str(e)}"
            print(error_msg)
            logging.error(error_msg)

            all_categories_data[category_name] = {
                "url": url,
                "status": "failed",
                "error": str(e),
                "last_updated": datetime.now().isoformat()
            }

            # Save updated summary even after error
            _write_summary()

        finally:
            # Restore the previous binding, then close this category's browser
            driver = notebook_driver
            if category_driver is not None:
                try:
                    category_driver.quit()
                except Exception:
                    # Best-effort shutdown: a browser that already died must not abort the run
                    pass

            # Brief pause between categories
            time.sleep(5)

    end_time = datetime.now()
    duration = end_time - start_time
    summary_msg = f"Scraping job completed at {end_time}. Total duration: {duration}. Total products: {total_products}"
    print(summary_msg)
    logging.info(summary_msg)

    return all_categories_data

# Run the main execution
try:
    results = main()

    # Per-category report followed by overall totals
    divider = "-" * 50
    print("\nScraping Summary:")
    print(divider)

    total_products = completed = failed = 0

    for category, data in results.items():
        # Anything not explicitly marked "completed" is reported as failed
        if data.get("status", "unknown") != "completed":
            failed += 1
            print(f"✗ {category}: Failed - {data.get('error', 'Unknown error')}")
            continue

        count = data.get("products_count", 0)
        completed += 1
        total_products += count
        print(f"✓ {category}: {count} products")

    print(divider)
    for summary_line in (
        f"Total categories: {len(results)}",
        f"Completed: {completed}",
        f"Failed: {failed}",
        f"Total products scraped: {total_products}",
    ):
        print(summary_line)

except Exception as e:
    print(f"Fatal error in main execution: {e}")

Loaded previous summary data with 27 categories
Starting scraping job at 2025-05-06 18:20:07.149965
Skipping soins-solaires - already completed
Skipping nos-recettes - already completed
Skipping le-marche - already completed
Skipping surgeles - already completed
Skipping cremerie-et-produits-laitiers - already completed
Skipping boissons - already completed
Skipping epicerie-sucree - already completed
Skipping bio-sans-gluten-et-dietetique - already completed
Skipping entretien-et-nettoyage - already completed
Skipping hygiene-et-beaute - already completed
Skipping bebe - already completed
Skipping animalerie - already completed
Skipping jeux-et-jouets - already completed
Skipping cuisine - already completed
Skipping gros-electromenager - already completed
Skipping image-et-son - already completed
Skipping informatique - already completed
Skipping smartphones-et-objets-connectes - already completed
Skipping entretien-de-la-maison - already completed
Skipping beaute-et-sante - already c