In [None]:
import os
import csv
import time
import random
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import date

# ================== General settings ==================
SOURCE_NAME = "jumia"
BASE = "https://www.jumia.com.eg"
SEARCH_URL = f"{BASE}/catalog/"
RUN_DATE = date.today().isoformat()

# A single output folder only: "Data" under the current working directory.
DATA_DIR = os.path.join(os.getcwd(), "Data")
os.makedirs(DATA_DIR, exist_ok=True)

# One file per day, named like prices_2025-12-27.csv
OUTPUT_FILE = os.path.join(DATA_DIR, "prices_" + RUN_DATE + ".csv")

# HTTP headers sent with every search request (desktop Chrome user agent,
# English preferred with Arabic as a fallback language).
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36",
            "(KHTML, like Gecko)",
            "Chrome/123.0.0.0",
            "Safari/537.36",
        )
    ),
    "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
}

# Category name -> list of keyword groups. Each group is itself a list of
# one or more search terms (e.g. ["oil", "cooking oil"]).
SEARCH_MAP = {
    "Grocery": [
        ["oil", "cooking oil"],
        ["rice"], ["sugar"], ["pasta"], ["tea"], ["coffee"], ["milk"],
        ["tuna"], ["tomato paste"], ["detergent"], ["dishwashing liquid"],
        ["diapers"],
    ],
    "Electronics": [
        ["smartphone"], ["mobile phone"], ["laptop"], ["tablet"],
        ["smart watch"], ["earbuds"], ["power bank"], ["headphones"],
    ],
    "Home_Appliances": [
        ["air fryer"], ["microwave"], ["coffee machine"],
        ["electric kettle"], ["blender"], ["vacuum cleaner"],
    ],
    "TV_and_Screens": [
        ["smart tv"], ["tv"], ["led tv"], ["uhd tv"], ["android tv"],
    ],
}

# ================== Helpers ==================
def clean_text(x: str):
    """Collapse every run of whitespace in *x* into a single space.

    Returns None when *x* is falsy (None or ""). A string made only of
    whitespace yields "".
    """
    if not x:
        return None
    # split() with no argument drops leading/trailing whitespace as well,
    # so the joined tokens need no further stripping.
    tokens = x.split()
    return " ".join(tokens)

def parse_price_egp(text: str):
    """Convert a price label such as "EGP 1,299.00" into a float.

    The "EGP" marker and thousands separators are removed before parsing.
    When the label is a range ("EGP 100.00 - EGP 200.00") the lower bound
    is returned instead of giving up on the whole value.

    Args:
        text: Price label text, or None/"" when the card has no price.

    Returns:
        The parsed price as a float, or None when *text* is empty or no
        number can be parsed from it.
    """
    if not text:
        return None
    cleaned = text.replace("EGP", "").replace(",", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        pass
    # Only reached when the plain parse failed, so a leading minus sign on
    # a single number is never mistaken for a range separator.
    low, sep, _ = cleaned.partition("-")
    if not sep:
        return None
    try:
        return float(low.strip())
    except ValueError:
        return None

def fetch_html(query: str, page: int):
    """Download one page of search results and return its HTML text.

    The "page" query parameter is sent only for pages after the first.
    Raises whatever `requests` raises on a network failure, and an HTTP
    error via `raise_for_status()` on a non-success status code.
    """
    query_params = {"q": query, "page": page} if page > 1 else {"q": query}
    response = requests.get(
        SEARCH_URL,
        params=query_params,
        headers=HEADERS,
        timeout=30,
    )
    response.raise_for_status()
    return response.text

def parse_product_card(a_tag):
    """Extract one product record from a search-result card.

    Args:
        a_tag: The card's anchor element as produced by the caller's
            BeautifulSoup `select()`; only `.get()` and `.select_one()`
            are used on it.

    Returns:
        A dict with the CSV columns. `category`, `search_query` and `page`
        are left as None for the caller to fill in. Any field whose
        element or attribute is missing is None.
    """
    def text_of(node, selector):
        # Cleaned text of the first match under `node`, or None if absent.
        el = node.select_one(selector)
        return clean_text(el.get_text()) if el else None

    href = a_tag.get("href")
    product_url = urljoin(BASE, href) if href else None
    product_name = text_of(a_tag, "h3.name")
    price = parse_price_egp(text_of(a_tag, "div.prc"))
    old_price = parse_price_egp(text_of(a_tag, "div.old"))
    discount = text_of(a_tag, "div.bdg._dsct")

    rating = None
    reviews_count = None
    rev = a_tag.select_one("div.rev")
    if rev:
        # clean_text() returns None for an empty stars element, so guard
        # before calling .replace() to avoid an AttributeError that would
        # abort the whole page in the caller.
        stars_text = text_of(rev, "div.stars")
        if stars_text is not None:
            rating = stars_text.replace("out of 5", "").strip()
        txt = clean_text(rev.get_text()) or ""
        if "(" in txt and ")" in txt:
            # The review count is the content of the last "(...)" group.
            inside = txt.split("(")[-1].split(")")[0]
            try:
                reviews_count = int(inside.replace(",", "").strip())
            except ValueError:
                reviews_count = None

    img_url = None
    img = a_tag.select_one("img.img")
    if img:
        # Prefer data-src, falling back to src when it is absent or empty.
        img_url = img.get("data-src") or img.get("src")

    brand = a_tag.get("data-gtm-brand") or a_tag.get("data-ga4-item_brand")
    item_id = a_tag.get("data-gtm-id") or a_tag.get("data-ga4-item_id")
    category_path = a_tag.get("data-gtm-category") or a_tag.get("data-ga4-item_category")
    return {
        "date": RUN_DATE,
        "source": SOURCE_NAME,
        "category": None,
        "search_query": None,
        "page": None,
        "item_id": item_id,
        "brand": brand,
        "category_path": category_path,
        "product_name": product_name,
        "price": price,
        "old_price": old_price,
        "discount": discount,
        "rating": rating,
        "reviews_count": reviews_count,
        "image_url": img_url,
        "product_url": product_url,
    }

def scrape_one_keyword(keyword: str, category: str, max_pages: int = 30, sleep_range=(1.5, 3.0)):
    """Collect de-duplicated product rows for one search keyword.

    Pages 1..max_pages are fetched in order. Paging stops early when a page
    has no product cards, when a page adds no product not already seen, or
    when any exception occurs (the error is printed and the rows gathered
    so far are returned). A random pause drawn from *sleep_range* follows
    every page that contributed new rows.

    Products are identified by item id, falling back to product URL; cards
    with neither are skipped.
    """
    collected = []
    known_keys = set()
    page = 1
    while page <= max_pages:
        try:
            soup = BeautifulSoup(fetch_html(keyword, page), "html.parser")
            cards = soup.select("a.core[href*='.html']")
            if not cards:
                break
            added = 0
            for card in cards:
                record = parse_product_card(card)
                key = record["item_id"] or record["product_url"]
                if key and key not in known_keys:
                    known_keys.add(key)
                    record.update(category=category, search_query=keyword, page=page)
                    collected.append(record)
                    added += 1
            if not added:
                break
            time.sleep(random.uniform(*sleep_range))
        except Exception as e:
            print(f"خطأ في صفحة {page} للكلمة '{keyword}': {e}")
            # On an error in a page, stop this keyword so the caller can
            # continue with the remaining keywords.
            break
        page += 1
    return collected

def save_daily_file(path: str, rows: list):
    """Append product rows to the CSV file at *path*.

    The header row is written when the file does not exist yet or exists
    but is empty, so the resulting file always starts with a header.

    Args:
        path: Destination CSV path; opened in append mode as utf-8-sig.
        rows: List of dicts keyed by the column names below. An empty list
            prints a notice and writes nothing.

    Raises:
        ValueError: From `csv.DictWriter` if a row contains a key that is
            not one of the columns.
    """
    if not rows:
        print("No data collected.")
        return

    fieldnames = [
        "date", "source", "category", "search_query", "page",
        "item_id", "brand", "category_path",
        "product_name", "price", "old_price", "discount",
        "rating", "reviews_count",
        "image_url", "product_url"
    ]

    # A pre-existing zero-byte file still needs its header row.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, "a", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(rows)
    print(f"تم حفظ/إضافة {len(rows)} منتج في: {path}")

def run_all(max_pages_per_query: int = 30):
    """Scrape every search term in SEARCH_MAP and append results to OUTPUT_FILE.

    Each keyword group in SEARCH_MAP is a list of search terms. Every term
    is scraped on its own, because `scrape_one_keyword` expects a single
    string. A product already collected by an earlier term of the same
    group is not saved again.

    Rows are written to disk once, straight after each term is scraped.
    Nothing is buffered across terms, so an error part-way through loses
    no saved data and no row is written more than once per run.

    Args:
        max_pages_per_query: Maximum number of result pages per search term.
    """
    try:
        for category, keyword_groups in SEARCH_MAP.items():
            for group in keyword_groups:
                # Accept a bare string as a one-term group as well.
                terms = [group] if isinstance(group, str) else list(group)
                seen_in_group = set()
                for term in terms:
                    print(f"جاري جمع [{category}] → '{term}' ...")
                    rows = scrape_one_keyword(term, category, max_pages=max_pages_per_query)

                    # Same identity rule scrape_one_keyword uses per term.
                    fresh = []
                    for row in rows:
                        key = row.get("item_id") or row.get("product_url")
                        if key in seen_in_group:
                            continue
                        seen_in_group.add(key)
                        fresh.append(row)

                    print(f"    → تم جمع {len(fresh)} منتج")
                    # Save immediately so a later failure cannot lose these rows.
                    if fresh:
                        save_daily_file(OUTPUT_FILE, fresh)

        print(f"\nتم الانتهاء بنجاح! الملف: {OUTPUT_FILE}")

    except Exception as e:
        # Top-level boundary: report the failure. Everything scraped before
        # it has already been appended to the file.
        print(f"\nحصل خطأ كبير: {e}")
        print("لكن تم حفظ كل اللي اتجمع لحد دلوقتي!")
    finally:
        print(f"الملف النهائي: {OUTPUT_FILE}")

# Script entry point: run the full scrape only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run_all(max_pages_per_query=30)

[Grocery] ['oil', 'cooking oil'] -> 421 items
[Grocery] ['rice'] -> 678 items
[Grocery] ['sugar'] -> 450 items
[Grocery] ['pasta'] -> 375 items
[Grocery] ['tea'] -> 1200 items
[Grocery] ['coffee'] -> 1183 items
[Grocery] ['milk'] -> 1200 items
[Grocery] ['tuna'] -> 54 items
[Grocery] ['tomato paste'] -> 4 items
[Grocery] ['detergent'] -> 391 items
[Grocery] ['dishwashing liquid'] -> 68 items
[Grocery] ['diapers'] -> 919 items
[Electronics] ['smartphone'] -> 1178 items
