In [1]:
#!pip install selenium
#!pip install pyvirtualdisplay
#!apt-get install xvfb
#!pip install undetected-chromedriver


In [8]:
import re
import os
import csv
import time
from urllib.parse import urljoin, quote_plus
from datetime import date

import undetected_chromedriver as uc
from bs4 import BeautifulSoup


# =======================
# ✅ ONLY THIS LIST
# =======================
SEARCH_MAP = {
      

  

    "Home_Appliances": [
       ["vacuum cleaner"]
    ],

    "TV_and_Screens": [
        ["smart tv"], ["tv"], ["led tv"], ["uhd tv"], ["android tv"]
    ]
}


# =======================
# Noon Settings
# =======================
BASE = "https://www.noon.com"
BASE_SEARCH = "https://www.noon.com/egypt-en/search/?page={page}&q={query}"

# =======================
# Output: one file per day in Data/
# =======================
DATA_DIR = "Data"
TODAY_STR = date.today().isoformat()  # YYYY-MM-DD
OUT_CSV = os.path.join(DATA_DIR, f"{TODAY_STR}.csv")

# =======================
# ✅ LIMIT: max pages per SEARCH TERM (e.g., oil = max 10 pages total)
# =======================
MAX_PAGES_PER_TERM = 10


# =======================
# Helpers
# =======================
def ensure_dir(path: str):
    os.makedirs(path, exist_ok=True)

def safe_text(el):
    return el.get_text(strip=True) if el else None

def clean_space(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()

def detect_total_pages_from_html(html: str) -> int:
    """
    أدق من جمع كل page= من الصفحة: يحاول يقرأ أرقام pagination فقط.
    """
    soup = BeautifulSoup(html, "html.parser")
    nums = []

    for a in soup.select('a[href*="page="]'):
        href = a.get("href", "")
        m = re.search(r"[?&]page=(\d+)", href)
        if m:
            nums.append(int(m.group(1)))

    # fallback
    if not nums:
        for m in re.finditer(r"[?&]page=(\d+)", html):
            try:
                nums.append(int(m.group(1)))
            except:
                pass

    return max(nums) if nums else 1

def detect_stock(txt: str):
    """
    listing page مش دايمًا بيعرض stock صريح، فده best-effort detection
    """
    t = (txt or "").lower()

    out_words = ["sold out", "out of stock", "نفدت", "غير متوفر", "غير متاح"]
    in_words = ["add to cart", "add to bag", "اضف", "أضف", "متوفر"]

    for w in out_words:
        if w in t:
            return "out_of_stock", w

    for w in in_words:
        if w in t:
            return "in_stock", w

    return "unknown", None


# =======================
# ✅ Undetected Chrome Driver
# =======================
def open_driver(headless: bool = True):
    options = uc.ChromeOptions()

    if headless:
        options.add_argument("--headless=new")

    options.add_argument("--window-size=1400,900")
    options.add_argument("--lang=en-US")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # anti-detection tweaks
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-extensions")

    driver = uc.Chrome(
        options=options,
        use_subprocess=True,
        version_main=None
    )
    return driver


def get_page_with_retry(driver, url: str, retries: int = 5, base_wait: float = 3.5) -> str:
    last_err = None
    for attempt in range(1, retries + 1):
        try:
            driver.get(url)
            time.sleep(base_wait)

            # scroll يساعد lazy load
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.6);")
            time.sleep(1.0)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1.3)

            html = driver.page_source
            if html and "<html" in html.lower():
                return html

            last_err = "HTML not loaded properly"
        except Exception as e:
            last_err = str(e)

        time.sleep(1.5 * attempt)

    raise RuntimeError(f"Failed to load: {url}. Last error: {last_err}")


def parse_products(
    html: str,
    category: str,
    search_group: str,
    search_term: str,
    query_variant: str,
    page_no: int
):
    soup = BeautifulSoup(html, "html.parser")
    rows = []

    titles = soup.select('h2[data-qa="plp-product-box-name"]')
    for h2 in titles:
        title = h2.get("title") or safe_text(h2)
        title = clean_space(title)

        # climb up to card container
        card = h2
        for _ in range(30):
            if not card:
                break
            if card.name == "div" and (
                card.select_one('[data-qa="plp-product-box-price"]')
                or card.select_one("a[href]")
            ):
                break
            card = card.parent

        if not card:
            continue

        a = h2.find_parent("a")
        if not a:
            a = card.select_one("a[href]")
        product_url = urljoin(BASE, a["href"]) if a and a.get("href") else None

        # price box
        price_box = card.select_one('[data-qa="plp-product-box-price"]')
        currency = safe_text(price_box.select_one("span")) if price_box else None
        price = safe_text(price_box.select_one("strong")) if price_box else None
        old_price = safe_text(price_box.select_one('[class*="oldPrice"]')) if price_box else None

        txt = clean_space(card.get_text(" ", strip=True))

        rating = None
        m = re.search(r"\b(\d\.\d)\b", txt)
        if m:
            rating = m.group(1)

        reviews = None
        count_span = card.select_one('[class*="countCtr"] span')
        if count_span:
            reviews = safe_text(count_span)
        else:
            m2 = re.search(r"\b(\d+(?:\.\d+)?[KkMm])\b", txt)
            if m2:
                reviews = m2.group(1)

        stock_status, stock_text = detect_stock(txt)

        rows.append({
            "date": TODAY_STR,
            "category": category,
            "search_group": search_group,
            "search_term": search_term,
            "query_variant": query_variant,
            "page": page_no,

            "title": title,
            "url": product_url,
            "currency": currency,
            "price": price,
            "old_price": old_price,
            "rating": rating,
            "reviews": reviews,
            "stock_status": stock_status,
            "stock_text": stock_text
        })

    return rows


def append_rows_to_csv(path: str, fieldnames: list, rows: list):
    file_exists = os.path.exists(path)
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            w.writeheader()
        w.writerows(rows)


def fetch_rows_for_page(
    driver,
    url: str,
    category: str,
    search_group: str,
    search_term: str,
    query_variant: str,
    page_no: int,
    tries: int = 3
):
    last_rows = []
    last_html = None

    for t in range(1, tries + 1):
        last_html = get_page_with_retry(driver, url)
        last_rows = parse_products(last_html, category, search_group, search_term, query_variant, page_no)

        if len(last_rows) > 0:
            return last_rows, last_html, True

        time.sleep(1.2 * t)
        try:
            driver.refresh()
        except:
            pass
        time.sleep(0.8)

    return last_rows, last_html, False


# =======================
# Main scrape (one daily file)
# =======================
def run_all(headless: bool = False):
    """
    ✅ كل نتائج اليوم في ملف واحد: Data/YYYY-MM-DD.csv
    ✅ الحد الأقصى للصفحات لكل SEARCH TERM (مثل oil) = MAX_PAGES_PER_TERM
       يعني لو oil ليها variants (oil + cooking oil) هنقسّم الـ 10 صفحات عليهم بالترتيب.
    """
    ensure_dir(DATA_DIR)

    fieldnames = [
        "date",
        "category",
        "search_group",
        "search_term",
        "query_variant",
        "page",
        "title",
        "url",
        "currency",
        "price",
        "old_price",
        "rating",
        "reviews",
        "stock_status",
        "stock_text"
    ]

    driver = open_driver(headless=headless)

    try:
        for category, terms in SEARCH_MAP.items():
            for item in terms:
                # item: ["oil","cooking oil"] or ["rice"]
                search_term = item[0]
                variants = item  # variants list

                # ✅ صفحات هذا الـ term كله (مش لكل variant)
                pages_left_for_term = MAX_PAGES_PER_TERM

                for query_variant in variants:
                    if pages_left_for_term <= 0:
                        print(f"[{category}] term '{search_term}' -> reached MAX_PAGES_PER_TERM={MAX_PAGES_PER_TERM} (skip remaining variants)")
                        break

                    # page 1
                    first_url = BASE_SEARCH.format(page=1, query=quote_plus(query_variant))
                    rows1, html1, ok1 = fetch_rows_for_page(
                        driver, first_url,
                        category=category,
                        search_group=category,
                        search_term=search_term,
                        query_variant=query_variant,
                        page_no=1,
                        tries=3
                    )

                    if not ok1 or len(rows1) == 0:
                        print(f"[{category}] '{query_variant}' -> page 1 returned 0 products (skipped)")
                        continue

                    total_pages = detect_total_pages_from_html(html1)

                    # ✅ limit pages for this variant based on remaining pages for the term
                    pages_for_this_variant = min(total_pages, pages_left_for_term)

                    # save page 1
                    append_rows_to_csv(OUT_CSV, fieldnames, rows1)
                    pages_left_for_term -= 1
                    print(f"[{category}] term='{search_term}' variant='{query_variant}' page 1/{pages_for_this_variant} -> {len(rows1)} rows appended (pages_left_for_term={pages_left_for_term})")

                    if pages_for_this_variant <= 1:
                        continue

                    # باقي الصفحات بالتسلسل (حتى pages_for_this_variant)
                    for p in range(2, pages_for_this_variant + 1):
                        if pages_left_for_term <= 0:
                            print(f"[{category}] term '{search_term}' -> reached MAX_PAGES_PER_TERM={MAX_PAGES_PER_TERM} (stop term)")
                            break

                        url = BASE_SEARCH.format(page=p, query=quote_plus(query_variant))
                        rows, _html, ok = fetch_rows_for_page(
                            driver, url,
                            category=category,
                            search_group=category,
                            search_term=search_term,
                            query_variant=query_variant,
                            page_no=p,
                            tries=3
                        )

                        if not ok or len(rows) == 0:
                            print(f"[{category}] term='{search_term}' variant='{query_variant}' page {p}/{pages_for_this_variant} -> 0 products (stop this variant)")
                            break

                        append_rows_to_csv(OUT_CSV, fieldnames, rows)
                        pages_left_for_term -= 1
                        print(f"[{category}] term='{search_term}' variant='{query_variant}' page {p}/{pages_for_this_variant} -> {len(rows)} rows appended (pages_left_for_term={pages_left_for_term})")

                        time.sleep(1.0)

        print(f"\nDONE ✅ Daily file created/updated: {OUT_CSV}")

    finally:
        try:
            driver.quit()
        except:
            pass


if __name__ == "__main__":
    # headless=False عشان تشوفي اللي بيحصل، لو عايزة headless خليها True
    run_all(headless=False)


[Home_Appliances] term='vacuum cleaner' variant='vacuum cleaner' page 1/10 -> 50 rows appended (pages_left_for_term=9)
[Home_Appliances] term='vacuum cleaner' variant='vacuum cleaner' page 2/10 -> 54 rows appended (pages_left_for_term=8)
[Home_Appliances] term='vacuum cleaner' variant='vacuum cleaner' page 3/10 -> 54 rows appended (pages_left_for_term=7)
[Home_Appliances] term='vacuum cleaner' variant='vacuum cleaner' page 4/10 -> 54 rows appended (pages_left_for_term=6)
[Home_Appliances] term='vacuum cleaner' variant='vacuum cleaner' page 5/10 -> 54 rows appended (pages_left_for_term=5)
[Home_Appliances] term='vacuum cleaner' variant='vacuum cleaner' page 6/10 -> 54 rows appended (pages_left_for_term=4)
[Home_Appliances] term='vacuum cleaner' variant='vacuum cleaner' page 7/10 -> 54 rows appended (pages_left_for_term=3)
[Home_Appliances] term='vacuum cleaner' variant='vacuum cleaner' page 8/10 -> 54 rows appended (pages_left_for_term=2)
[Home_Appliances] term='vacuum cleaner' variant=

RuntimeError: Failed to load: https://www.noon.com/egypt-en/search/?page=4&q=smart+tv. Last error: Message: invalid session id
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x11512d3
	0x1151314
	0xf3e52b
	0xf7cf57
	0xf7e1d4
	0x13a5314
	0x13a08cb
	0x13bd1aa
	0x116b1d8
	0x11731dd
	0x11595d8
	0x1159799
	0x1143b28
	0x76e45d49
	0x76fad5db
	0x76fad561
