In [1]:
#pip install selenium selenium-wire selenium-stealth undetected-chromedriver webdriver-manager

In [2]:
import random
import time
from selenium_stealth import stealth
from selenium import webdriver
# --- 1. User-Agent Setup ---
# Pool of desktop Chrome user agents; one is picked at random for this session.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
]
random_user_agent = random.choice(USER_AGENTS)

# The platform reported to pages has to agree with the operating system
# advertised in the user agent; a "Macintosh" user agent that reports "Win32"
# is a self-contradictory fingerprint.
stealth_platform = "MacIntel" if "Macintosh" in random_user_agent else "Win32"

# --- 2. Proxy Setup ---
# Proxying is currently disabled: the options below are commented out and the
# driver is created with plain Selenium. If a proxy is added later, load its
# credentials from the environment rather than hard-coding them here.
#proxy_options = {
#    'proxy': {
#        'no_proxy': 'localhost,127.0.0.1' # Bypasses proxy for local traffic
#    }
#}

# --- 3. Chrome Options Setup ---
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument(f"--user-agent={random_user_agent}")
chrome_options.add_argument("--start-maximized")
# chrome_options.add_argument("--headless") # Commented out to see the browser
chrome_options.add_argument("--lang=en-US,en;q=0.9")

# Basic anti-detection options
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)

# Load only DOM content
#chrome_options.page_load_strategy = 'eager' 

# Initialize the driver (plain Selenium; the selenium-wire options stay disabled)
driver = webdriver.Chrome(
    options=chrome_options,
    #seleniumwire_options=proxy_options
)

# --- 4. Stealth Setup ---
stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform=stealth_platform,
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )

# --- 5. Run the Scraper ---
print("Scraper is running with a stealth configuration!")
# No proxy is configured, so only the user agent is reported.
print(f"Using User-Agent: {random_user_agent}")

Scraper is running with a stealth configuration!
Using IP from proxy and User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import json
import pandas as pd
from selenium.webdriver.common.keys import Keys

In [4]:

def get_filtered_products(products_details, word_to_search):
    """
    Keep only the products whose name mentions the search term.

    The check is a case-insensitive substring match of `word_to_search`
    against each product's "name" entry.
    Returns: list of matching products, in their original order
    """
    needle = word_to_search.lower()
    return [item for item in products_details if needle in item["name"].lower()]



In [5]:
#### Refactored store scraping functions with try/except error handling ####



def scrape_al_fateh(driver, word_to_search, wait_time=10):
    """
    Scrape Al-Fatah store for products

    Args:
        driver: Selenium WebDriver used to load the search page.
        word_to_search: Raw search term. It is percent-encoded before being
            placed in the URL and is also used to filter results by name.
        wait_time: Seconds to wait for the product cards to appear.

    Returns: list of products (store, name, product-link, price, image_url)
        whose name contains the search term, or empty list on error
    """
    from urllib.parse import quote

    store_name = "Al-Fateh"
    try:
        # Percent-encode the term so spaces and reserved characters
        # (&, #, ?, /) cannot corrupt the query string.
        encoded_term = quote(word_to_search, safe="")
        AL_FATEH_GROCERY_URL = f"https://alfatah.pk/search?q={encoded_term}"

        driver.get(AL_FATEH_GROCERY_URL)
        wait = WebDriverWait(driver, wait_time)

        product_cards = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, ".col-6.col-sm-4.col-md-3.col-lg-2")
        ))

        products_details = []
        for product in product_cards:
            # A card that is missing any expected element is logged and skipped.
            try:
                a_element = product.find_element(By.CSS_SELECTOR, "a[class='product-title-ellipsis']")
                product_link = a_element.get_attribute("href")
                product_name = a_element.text
                product_price = product.find_element(By.CLASS_NAME, "product-price").text
                image_container = product.find_element(By.CLASS_NAME, "image")
                image_url = image_container.find_element(By.TAG_NAME, "img").get_attribute("src")

                products_details.append({
                    "store": store_name,
                    "name": product_name,
                    "product-link": product_link,
                    "price": product_price,
                    "image_url": image_url
                })
            except Exception as e:
                print(f"[{store_name}] Error extracting product: {str(e)}")
                continue

        # Filter for relevance
        filtered_products = get_filtered_products(products_details, word_to_search)

        print(f"[{store_name}] Found {len(filtered_products)} relevant products")
        return filtered_products

    except Exception as e:
        print(f"[{store_name}] Error during scraping: {str(e)}")
        return []


def scrape_metro(driver, word_to_search, wait_time=10):
    """
    Scrape Metro store for products

    Args:
        driver: Selenium WebDriver used to load the search page.
        word_to_search: Raw search term. It is percent-encoded before being
            placed in the URL and is also used to filter results by name.
        wait_time: Seconds to wait for the product cards to appear.

    Returns: list of products (store, name, product-link, price, image_url)
        whose name contains the search term, or empty list on error
    """
    from urllib.parse import quote

    store_name = "Metro"
    try:
        # The term appears both as a path segment and as a query value, so it
        # is encoded with no "safe" characters: a "/" must not add a path level
        # and "&"/"#"/"?" must not cut the query short.
        encoded_term = quote(word_to_search, safe="")
        METRO_GROCERY_URL = f"https://www.metro-online.pk/search/{encoded_term}?searchText={encoded_term}"

        driver.get(METRO_GROCERY_URL)
        wait = WebDriverWait(driver, wait_time)

        product_cards = wait.until(EC.presence_of_all_elements_located(
            (By.CLASS_NAME, "CategoryGrid_product_card__FUMXW")
        ))

        products_details = []

        for product_card in product_cards:
            # A card that is missing any expected element is logged and skipped.
            try:
                product_link = product_card.find_element(By.TAG_NAME, "a").get_attribute("href")
                name = product_card.find_element(By.CLASS_NAME, "CategoryGrid_product_name__3nYsN").text
                price = product_card.find_element(By.CLASS_NAME, "CategoryGrid_product_price__Svf8T").text

                image_container = product_card.find_element(By.CLASS_NAME, "CategoryGrid_productImg_container__Ga1ll")
                image_url = image_container.find_element(By.TAG_NAME, "img").get_attribute("src")
                products_details.append({
                    "store": store_name,
                    "name": name,
                    "product-link": product_link,
                    "price": price,
                    "image_url": image_url
                })
            except Exception as e:
                print(f"[{store_name}] Error extracting product details: {str(e)}")
                continue

        # Filter for relevance
        filtered_products = get_filtered_products(products_details, word_to_search)

        print(f"[{store_name}] Found {len(filtered_products)} products")
        return filtered_products

    except Exception as e:
        print(f"[{store_name}] Error during scraping: {str(e)}")
        return []


def scrape_jalalsons(driver, word_to_search, wait_time=10):
    """
    Scrape Jalal Sons store for products with name and price

    The page may show a banner and a delivery-branch dialog before the
    results; both are handled on a best-effort basis and scraping continues
    whether or not they appear.

    Args:
        driver: Selenium WebDriver used to load the search page.
        word_to_search: Raw search term. It is percent-encoded before being
            placed in the URL and is also used to filter results by name.
        wait_time: Seconds used for each explicit wait (banner, branch
            dropdown, product cards).

    Returns: list of products with store name, or empty list on error
    """
    from urllib.parse import quote
    from selenium.webdriver.support.ui import Select

    store_name = "Jalal Sons"
    try:
        # Percent-encode the term so spaces and reserved characters
        # (&, #, ?, /) cannot corrupt the query string.
        encoded_term = quote(word_to_search, safe="")
        JALALSONS_GROCERY_URL = f"https://jalalsons.com.pk/shop?query={encoded_term}"

        driver.get(JALALSONS_GROCERY_URL)
        wait = WebDriverWait(driver, wait_time)

        # Close banner if present
        try:
            banner_close_button = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, ".cursor-pointer.ms-auto")
            ))
            banner_close_button.click()
        except TimeoutException:
            print(f"[{store_name}] No banner appeared")

        # Select location from dropdown
        try:
            location_dropdown = wait.until(EC.presence_of_element_located(
                (By.ID, "selectDeliveryBranch")
            ))
            select_object = Select(location_dropdown)
            all_options = select_object.options

            # Skip disabled entries and the empty-valued placeholder option.
            enabled_options = [
                opt for opt in all_options
                if opt.is_enabled() and opt.get_attribute('value') != ""
            ]

            if enabled_options:
                # Any available branch will do, so one is picked at random.
                random_option = random.choice(enabled_options)
                select_object.select_by_visible_text(random_option.text)

                try:
                    submit_button = driver.find_element(By.CLASS_NAME, "current_loc_pop_btn")
                    submit_button.click()
                except Exception as e:
                    print(f"[{store_name}] No button to confirm location selection: {str(e)}")
        except TimeoutException:
            print(f"[{store_name}] No location box appeared")
        except Exception as e:
            # Still best-effort, but the real cause is reported and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"[{store_name}] Could not select location: {str(e)}")

        # Get products
        product_cards = wait.until(EC.presence_of_all_elements_located(
            (By.CLASS_NAME, "single_product_theme")
        ))

        products_details = []

        for product_card in product_cards:
            # A card that is missing any expected element is logged and skipped.
            try:
                product_link = product_card.find_element(By.TAG_NAME, "a").get_attribute("href")
                name = product_card.find_element(By.CLASS_NAME, "product_name_theme").text

                # Currency and amount live in separate elements.
                currency = product_card.find_element(By.CLASS_NAME, "item-currency").text
                value = product_card.find_element(By.CLASS_NAME, "price-value").text
                price = f"{currency} {value.strip()}"
                image_url = product_card.find_element(By.TAG_NAME, "img").get_attribute("src")

                products_details.append({
                    "store": store_name,
                    "name": name,
                    "product-link": product_link,
                    "price": price,
                    "image_url": image_url
                })
            except Exception as e:
                print(f"[{store_name}] Error extracting product details: {str(e)}")
                continue

        filtered_products = get_filtered_products(products_details, word_to_search)
        print(f"[{store_name}] Found {len(filtered_products)} products")
        return filtered_products

    except Exception as e:
        print(f"[{store_name}] Error during scraping: {str(e)}")
        return []


def scrape_carrefour(driver, word_to_search, wait_time=10):
    """
    Scrape Carrefour store for products with name and price

    Every link found in the results container is collected first, then each
    link is opened in turn to read the name and price from its own page.

    Args:
        driver: Selenium WebDriver used to load the pages.
        word_to_search: Raw search term. It is percent-encoded before being
            placed in the URL and is also used to filter results by name.
        wait_time: Seconds to wait for the results container to appear.

    Returns: list of products with store name, or empty list on error.
        Each product carries its URL under "product-link" (the key used by
        the other store scrapers) and under the legacy key "link".
    """
    from urllib.parse import quote

    store_name = "Carrefour"
    try:
        # Percent-encode the term so spaces and reserved characters
        # (&, #, ?, /) cannot corrupt the query string.
        encoded_term = quote(word_to_search, safe="")
        CAREFOUR_GROCERY_URL = f"https://www.carrefour.pk/mafpak/en/search?keyword={encoded_term}"

        driver.get(CAREFOUR_GROCERY_URL)
        wait = WebDriverWait(driver, wait_time)
        driver.refresh()
        product = wait.until(EC.presence_of_element_located(
            (By.XPATH, "/html/body/div[1]/main/div/div[2]/div[2]/div/div[2]/div/div")
        ))

        # Collect every href before navigating away (the elements stop being
        # usable once another page is loaded). dict.fromkeys removes
        # duplicates while keeping page order, so runs are reproducible.
        hrefs = (link.get_attribute("href") for link in product.find_elements(By.TAG_NAME, "a"))
        product_links = list(dict.fromkeys(href for href in hrefs if href))

        products_details = []
        for link in product_links:
            # A page that lacks the expected elements is logged and skipped.
            try:
                driver.get(link)
                name = driver.find_element(
                    By.XPATH, "/html/body/div[1]/main/div/div[3]/div/div[2]/h1"
                ).text
                price = driver.find_element(
                    By.XPATH, "/html/body/div[1]/main/div/div[3]/div/div[3]/div[1]/div[1]"
                ).text

                products_details.append({
                    "store": store_name,
                    "name": name,
                    # Same key as the other scrapers, so the combined CSV has
                    # a single link column.
                    "product-link": link,
                    # Legacy key, kept so existing consumers keep working.
                    "link": link,
                    "price": price
                })
            except Exception as e:
                print(f"[{store_name}] Error extracting product details: {str(e)}")
                continue

        filtered_products = get_filtered_products(products_details, word_to_search)
        print(f"[{store_name}] Found {len(filtered_products)} products")
        return filtered_products

    except Exception as e:
        print(f"[{store_name}] Error during scraping: {str(e)}")
        return []


def scrape_imtiaz(driver, word_to_search, wait_time=5):
    """
    Scrape Imtiaz store for products with pagination

    After the optional location dialog is dismissed, every results page is
    scraped and the "Next" button is followed until it is disabled or absent.

    Args:
        driver: Selenium WebDriver used to load the search page.
        word_to_search: Raw search term. It is percent-encoded before being
            placed in the URL and is also used to filter results by name.
        wait_time: Seconds used for each explicit wait.

    Returns: list of products with store name, or empty list on error
    """
    from urllib.parse import quote

    store_name = "Imtiaz"
    product_item_locator = (By.CLASS_NAME, "hazle-product-item_product_item__FSm1N")
    try:
        # Percent-encode the term so spaces and reserved characters
        # (&, #, ?, /) cannot corrupt the query string.
        encoded_term = quote(word_to_search, safe="")
        IMTIAZ_GROCERY_URL = f"https://shop.imtiaz.com.pk/search?q={encoded_term}"
        driver.get(IMTIAZ_GROCERY_URL)
        wait = WebDriverWait(driver, wait_time)

        products_details = []

        # Select location
        try:
            area = wait.until(EC.presence_of_element_located(
                (By.XPATH, "/html/body/div[2]/div[3]/div/div/div/div/div[3]/div[3]/div/div/input")
            ))
            # Keyboard-driven selection in the location input.
            area.send_keys(Keys.ENTER)
            area.send_keys(Keys.DOWN)
            area.send_keys(Keys.DOWN)
            area.send_keys(Keys.ENTER)

            submit_button = wait.until(EC.presence_of_element_located(
                (By.XPATH, "/html/body/div[2]/div[3]/div/div/div/div/div[3]/button")
            ))
            submit_button.click()
        except TimeoutException:
            print(f"[{store_name}] No location box appeared")

        # Get initial products
        try:
            # This first wait only decides whether there are results at all;
            # a timeout here is reported as "No products found" below.
            wait.until(EC.presence_of_all_elements_located(product_item_locator))

            # Extract products and handle pagination
            while True:
                try:
                    # Wait for products to load
                    products = wait.until(EC.presence_of_all_elements_located(product_item_locator))

                    # Extract all products on current page
                    for product in products:
                        try:
                            product_text_container = product.find_element(By.CLASS_NAME, "hazle-product-item_product_item_text_container__Apuq1")

                            product_name = product_text_container.find_element(By.CLASS_NAME, "hazle-product-item_product_item_description__ejRDa").text.strip()
                            product_price = product_text_container.find_element(By.CLASS_NAME, "hazle-product-item_product_item_price_label__ET_we").text.strip()

                            # The card's id attribute is used to build the product URL.
                            product_link_id = product.get_attribute("id")
                            product_link = f"https://shop.imtiaz.com.pk/product/{product_link_id}"

                            image_url = product.find_element(By.TAG_NAME, "img").get_attribute("src")

                            products_details.append({
                                "store": store_name,
                                "name": product_name,
                                "product-link": product_link,
                                "price": product_price,
                                "image_url": image_url
                            })
                        except Exception as e:
                            print(f"[{store_name}] Error extracting product info: {str(e)}")
                            continue

                    # Try to find and click Next button
                    try:
                        button = driver.find_element(By.XPATH, "//button[normalize-space()='Next']")

                        if button.get_attribute("disabled"):
                            print(f"[{store_name}] Reached last page")
                            break
                        else:
                            button.click()
                            time.sleep(2)  # Wait for page to load
                    except NoSuchElementException:
                        print(f"[{store_name}] Last page reached")
                        break

                except Exception as e:
                    # Any failure while paging ends the loop; products
                    # collected so far are still returned.
                    print(f"[{store_name}] Error in pagination loop: {str(e)}")
                    break

            filtered_products = get_filtered_products(products_details, word_to_search)
            print(f"[{store_name}] Found {len(filtered_products)} products")
            return filtered_products

        except TimeoutException:
            print(f"[{store_name}] No products found")
            return []

    except Exception as e:
        print(f"[{store_name}] Error during scraping: {str(e)}")
        return []


# ===== MAIN SCRAPING FUNCTION =====
def scrape_all_stores(driver, word_to_search, scrapers=None):
    """
    Scrape several stores and combine results into a single list

    Args:
        driver: Selenium WebDriver passed to every store scraper.
        word_to_search: Search term passed to every store scraper.
        scrapers: Optional list of (store_label, scraper_func) pairs, where
            scraper_func(driver, word_to_search) returns a list of products.
            Defaults to all 5 stores (Al-Fateh, Metro, Jalal Sons, Carrefour,
            Imtiaz); pass a shorter list to scrape only some of them.

    Returns: list of all products from all stores with store names. A scraper
        that raises is reported and skipped; the remaining stores still run.
    """
    if scrapers is None:
        # Built lazily so that a custom list can be used on its own.
        scrapers = [
            ("Al-Fateh", scrape_al_fateh),
            ("Metro", scrape_metro),
            ("Jalal Sons", scrape_jalalsons),
            ("Carrefour", scrape_carrefour),
            ("Imtiaz", scrape_imtiaz),
        ]

    all_products = []
    banner = "=" * 60

    print(f"\n{banner}")
    print(f"Starting scraping for: '{word_to_search}'")
    print(f"{banner}\n")

    # Scrape each store
    for store_label, scraper_func in scrapers:
        print(f"\n[SCRAPING {store_label.upper()}]")
        try:
            all_products.extend(scraper_func(driver, word_to_search))
        except Exception as e:
            # One failing store must not abort the others.
            print(f"FATAL ERROR for {store_label}: {str(e)}")

    print(f"\n{banner}")
    print("Scraping Complete!")
    print(f"Total products collected: {len(all_products)}")
    print(f"{banner}\n")

    return all_products



In [6]:


def save_products_to_csv(products_list, filename="all_products.csv"):
    """
    Write the scraped products to a CSV file, one row per product.

    `products_list` is a list of product dictionaries; their keys become the
    CSV columns. When the list is empty nothing is written and a notice is
    printed instead. The file is encoded as UTF-8 with a byte-order mark.
    """
    if products_list:
        frame = pd.DataFrame(products_list)
        frame.to_csv(filename, index=False, encoding="utf-8-sig")
        print(f"✅ Saved {len(frame)} products to '{filename}'")
    else:
        print("No products to save!")

In [7]:

# Scrape every store for the search term "pepsi" using the shared driver.
all_products = scrape_all_stores(driver, "pepsi")

# Save the combined results to CSV (relative path, so it lands in the working directory).
save_products_to_csv(all_products, "searched-products.csv")


Starting scraping for: 'pepsi'


[SCRAPING AL-FATEH]
[Al-Fateh] Found 15 relevant products

[SCRAPING METRO]
[Metro] Found 8 products

[SCRAPING JALAL SONS]
[Jalal Sons] No banner appeared
[Jalal Sons] Found 6 products

[SCRAPING CARREFOUR]
[Carrefour] Error during scraping: Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff6c92aa235
	0x7ff6c9002630
	0x7ff6c8d916dd
	0x7ff6c8dea27e
	0x7ff6c8dea58c
	0x7ff6c8e3ed77
	0x7ff6c8e3baba
	0x7ff6c8ddb0ed
	0x7ff6c8ddbf63
	0x7ff6c92d5d60
	0x7ff6c92cfe8a
	0x7ff6c92f1005
	0x7ff6c901d71e
	0x7ff6c9024e1f
	0x7ff6c900b7c4
	0x7ff6c900b97f
	0x7ff6c8ff18e8
	0x7ff9bd82e8d7
	0x7ff9bfaec53c


[SCRAPING IMTIAZ]
[Imtiaz] Last page reached
[Imtiaz] Found 5 products

Scraping Complete!
Total products collected: 34

✅ Saved 34 products to 'searched-products.csv'


In [8]:
import os
# Print the current working directory, which is where the relative CSV path above is written.
print(os.getcwd())

D:\UNIVERSITY\Semester 7\DS\project
