In [None]:
# Creamy — Web Scraping (com padronização de benefícios via models)

import sys, subprocess, os, re, csv, json, time
from urllib.parse import urljoin, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup
import unicodedata
sys.path.append(os.path.abspath(".."))

from skin import (
    SKIN_TYPE_CANONICAL_ORDER,
    SKIN_TYPE_SYNONYMS_PT,
)

from exclude import (
    EXCLUDE_KEYWORDS,
)

from ingredient import (
    INGREDIENTES_VALIDOS,
)

from benefits import (
    BENEFIT_SYNONYMS_PT,
    BENEFIT_CANONICAL_ORDER,
)

# --- Scraper configuration -------------------------------------------------
# Site root; relative hrefs/srcs found in pages are resolved against it.
BASE_URL = "https://www.creamy.com.br/"
# Paginated product listing; ``{page}`` is filled with a 1-based page number.
LISTING_URL_TEMPLATE = "https://www.creamy.com.br/produtos?page={page}"
# Highest listing page number that will be requested (inclusive).
MAX_PAGES = 9


# Output artefact names (relative to the current working directory).
OUT_JSON = "creamy_products.json"
OUT_CSV  = "creamy_products.csv"
IMG_DIR  = "imagens"
# NOTE: executed at import time — the image directory is created as a side effect.
os.makedirs(IMG_DIR, exist_ok=True)

# One shared HTTP session reused by every request made in this file.
# A desktop-browser User-Agent is sent instead of the default requests one.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
})

def strip_accents(s: str) -> str:
    """Return *s* without combining diacritical marks.

    The value is decomposed with Unicode NFD normalisation and every
    character in category ``Mn`` (non-spacing mark) is dropped, so
    ``"ação"`` becomes ``"acao"``. Non-string input is converted with
    ``str()`` before processing.
    """
    text = s if isinstance(s, str) else str(s)
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

def _strip_accents_lower(s: str) -> str:
    """Normalise *s* for matching: no accents, lowercase, trimmed.

    Falsy input (``None``, ``""``) is treated as the empty string.
    """
    unaccented = strip_accents(s or "")
    lowered = unaccented.lower()
    return lowered.strip()

def normalize_space(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def slugify(text: str) -> str:
    """Build a lowercase, dash-separated ASCII slug from *text*.

    Accents are removed, every run of characters outside ``a-z0-9`` becomes
    a single dash, and leading/trailing dashes are trimmed. When nothing
    survives, the placeholder ``"produto"`` is returned.
    """
    ascii_lower = strip_accents(text.lower())
    dashed = re.sub(r"[^a-z0-9]+", "-", ascii_lower)
    slug = re.sub(r"-+", "-", dashed).strip("-")
    if slug:
        return slug
    return "produto"

def get_soup(url, max_retries=3, timeout=25):
    """Fetch *url* with the shared session and parse it with the lxml parser.

    Args:
        url: Address to download.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout passed to ``SESSION.get`` (seconds).

    Returns:
        A ``BeautifulSoup`` document for the first HTTP 200 response, or
        ``None`` when every attempt failed (non-200 status or an exception).
    """
    for attempt in range(max_retries):
        try:
            r = SESSION.get(url, timeout=timeout)
            if r.status_code == 200:
                return BeautifulSoup(r.text, "lxml")
            print(f"[WARN] {url} -> status {r.status_code}")
        except Exception as e:
            # Best-effort boundary: report the failure and try again.
            print(f"[ERROR] {url} -> {e}")
        # Linear back-off between attempts; sleeping after the final attempt
        # would only delay the ``None`` result, so it is skipped.
        if attempt < max_retries - 1:
            time.sleep(1.1 * (attempt + 1))
    return None

def looks_excluded(text: str) -> bool:
    """Tell whether *text* contains any keyword from ``EXCLUDE_KEYWORDS``.

    Matching is a case- and accent-insensitive substring test. The keywords
    are normalised the same way as the text, so an entry written with
    accents or capitals still matches. Empty keywords are ignored, because
    an empty string is a substring of every text.

    Args:
        text: Product name or URL to inspect; ``None``/``""`` never match.

    Returns:
        ``True`` when at least one keyword occurs in the normalised text.
    """
    haystack = strip_accents((text or "").lower())
    for kw in EXCLUDE_KEYWORDS:
        needle = strip_accents(kw).lower()
        if needle and needle in haystack:
            return True
    return False

def parse_price_to_str(price_text: str) -> str:
    """Convert a Brazilian-formatted price label to a ``"1234.56"`` string.

    The currency marker (``R$``/``r$``) and all whitespace — including
    non-breaking spaces, which HTML price widgets commonly emit — are removed.
    Dots are then treated as thousands separators and the comma as the
    decimal separator.

    Args:
        price_text: Raw label such as ``"R$ 1.234,56"``; may be ``None``.

    Returns:
        The first number found, formatted with two decimals, or ``""`` when
        the input is empty or contains no digits.
    """
    if not price_text:
        return ""
    t = re.sub(r"(?i)r\$", "", price_text)
    t = re.sub(r"\s+", "", t)
    t = t.replace(".", "").replace(",", ".")
    m = re.search(r"[0-9]+(?:\.[0-9]{1,2})?", t)
    if m is None:
        return ""
    # The pattern only admits digits with an optional fractional part, so
    # float() cannot fail here and no exception handler is needed.
    return f"{float(m.group(0)):.2f}"

def split_list_candidates(text: str):
    """Split free text into trimmed, non-empty list items.

    ``<br>`` variants are first turned into ``;``; the text is then split on
    ``;``, ``•``, ``|``, ``/``, newline and ``,``. Each piece has its
    whitespace normalised and empty pieces are discarded.
    """
    if not text:
        return []
    for br_tag in ("<br>", "<br/>", "<br />"):
        text = text.replace(br_tag, ";")
    pieces = (normalize_space(chunk) for chunk in re.split(r"[;•|/\n,]", text))
    return [piece for piece in pieces if piece]

def filter_ingredients_creamy(ings_raw):
    """Reduce raw ingredient strings to known names from ``INGREDIENTES_VALIDOS``.

    For each raw string, the first valid ingredient whose lowercase form (or
    accent-free lowercase form) occurs as a substring is selected. Selected
    names are de-duplicated, kept in order of first appearance, and returned
    as a single ``"; "``-joined string of lowercase names.
    """
    # (lowercase name, accent-free lowercase name) for every valid ingredient.
    catalog = [
        (valid.lower(), strip_accents(valid.lower()))
        for valid in INGREDIENTES_VALIDOS
    ]
    selected = []
    for raw in ings_raw:
        lowered = raw.lower()
        plain = strip_accents(lowered)
        hit = next(
            (
                canon
                for canon, canon_plain in catalog
                if canon in lowered or canon_plain in plain
            ),
            None,
        )
        if hit and hit not in selected:
            selected.append(hit)
    return "; ".join(selected)

def find_all_text(soup, selectors):
    """Return the text of the first selector that yields a non-empty node.

    Selectors are tried in order with ``select_one``; the winning node's text
    is returned with its fragments joined by single spaces. Returns ``""``
    when no selector matches a node containing text.
    """
    for css in selectors:
        hit = soup.select_one(css)
        if not hit:
            continue
        if hit.get_text(strip=True):
            return hit.get_text(" ", strip=True)
    return ""

# ==== NOVO: padronização de benefícios ====
def padroniza_beneficios(textos_beneficios):
    """Map raw benefit snippets to canonical benefit names.

    Each snippet is normalised (accent-free, lowercase, trimmed) and compared
    against the normalised synonyms in ``BENEFIT_SYNONYMS_PT``; a canonical
    name is selected when any of its synonyms is a substring of any snippet.

    Args:
        textos_beneficios: Iterable of raw text snippets; may be empty/``None``.

    Returns:
        Canonical names ordered by their position in
        ``BENEFIT_CANONICAL_ORDER``. Names missing from that order come last,
        alphabetically, so the result does not depend on set iteration order.
    """
    if not textos_beneficios:
        return []

    # Normalise synonyms once per call. Filtering happens *after*
    # normalisation: a whitespace-only synonym becomes "" and would
    # otherwise match every snippet.
    norm_syn = {}
    for canonico, patt_list in BENEFIT_SYNONYMS_PT.items():
        normalised = [_strip_accents_lower(s) for s in patt_list if s]
        norm_syn[canonico] = [patt for patt in normalised if patt]

    encontrados = set()
    for txt in textos_beneficios:
        n = _strip_accents_lower(txt)
        for canonico, padds in norm_syn.items():
            if any(patt in n for patt in padds):
                encontrados.add(canonico)

    order_map = {name: i for i, name in enumerate(BENEFIT_CANONICAL_ORDER or ())}
    unknown_rank = len(order_map)
    # The name is a tie-breaker so equal ranks sort deterministically.
    return sorted(encontrados, key=lambda x: (order_map.get(x, unknown_rank), x))

def extract_benefits(soup):
    """Collect short list/table texts from the page and standardise them.

    Candidate snippets are the whitespace-normalised texts of every ``li``
    inside a ``ul``/``ol`` and of every ``th``/``td``, limited to 1–120
    characters. Duplicates are dropped (first occurrence wins) and the
    remainder is passed to ``padroniza_beneficios``; the canonical names it
    returns are joined with ``"; "``.
    """
    candidates = []

    # Raw collection: list items first, then table cells.
    for list_node in soup.select("ul, ol"):
        for li in list_node.select("li"):
            entry = normalize_space(li.get_text(" ", strip=True))
            if 0 < len(entry) <= 120:
                candidates.append(entry)
    for cell in soup.select("th, td"):
        entry = normalize_space(cell.get_text(" ", strip=True))
        if 0 < len(entry) <= 120:
            candidates.append(entry)

    # Order-preserving de-duplication (dict keys keep insertion order).
    unique = list(dict.fromkeys(candidates))

    # Standardise against the canonical benefit vocabulary.
    return "; ".join(padroniza_beneficios(unique))

def extract_ingredients(soup):
    """Extract the known ingredients mentioned on a product page.

    Every ``div``/``section``/``table``/``article``/``ul``/``ol``/``p`` whose
    text contains an ingredient-section label is collected. The blocks are
    de-duplicated, processed from shortest to longest, split into candidate
    items (at most 100 characters each) and filtered through
    ``filter_ingredients_creamy``.

    Returns:
        ``"; "``-joined string of recognised ingredient names (may be empty).
    """
    possible_labels = (
        "ingredientes", "composição", "composicao", "fórmula", "formula",
        "ingredients", "active ingredients",
    )

    text_blocks = []
    for el in soup.select("div, section, table, article, ul, ol, p"):
        txt = el.get_text(" ", strip=True)
        low = txt.lower()
        if any(lbl in low for lbl in possible_labels):
            text_blocks.append(txt)

    # De-duplicate while keeping document order, then sort by length.
    # ``sorted`` is stable, so equal-length blocks keep document order; going
    # through a ``set`` made that order vary from run to run.
    ordered_blocks = sorted(dict.fromkeys(text_blocks), key=len)

    raw = [
        piece
        for block in ordered_blocks
        for piece in split_list_candidates(block)
        if len(piece) <= 100
    ]
    return filter_ingredients_creamy(raw)

def extract_size_from_text(text: str) -> str:
    """Find the first ``<number><unit>`` size in *text* (units: ml, g, l).

    Matching is case-insensitive. The number is returned with ``.`` as the
    decimal separator and the unit in upper case, e.g. ``"30ML"``, ``"1.5L"``.

    Args:
        text: Product name or description; may be empty/``None``.

    Returns:
        The normalised size, or ``""`` when no size is present.
    """
    if not text:
        return ""
    m = re.search(r"(\d+[\.,]?\d*)\s*(ml|g|l)\b", text.lower())
    if m is None:
        return ""
    # The pattern tolerates a separator with no digits after it ("30, ml");
    # strip it so the result is "30ML" rather than "30.ML".
    val = m.group(1).replace(",", ".").rstrip(".")
    return f"{val}{m.group(2).upper()}"

def extract_tipos_pele(soup):
    """Extract the skin types mentioned on the page, in canonical order.

    The whole page text is lowercased and accent-stripped, then searched for
    each keyword as a substring, so accented spellings such as "sensível" or
    "acnéica" are recognised too. When no skin type is mentioned, the default
    set ``mista; oleosa; seca; sensivel`` is assumed.

    Returns:
        ``"; "``-joined skin-type names ordered by ``SKIN_TYPE_CANONICAL_ORDER``;
        names missing from that order come last, alphabetically, so the
        output is stable between runs.
    """
    tipos_mapeamento = {
        "oleosa": "oleosa",
        "seca": "seca",
        "mista": "mista",
        "sensivel": "sensivel",
        "sensível": "sensivel",
        "normal": "normal",
        "acneica": "acneica",
        "madura": "madura",
    }

    # Compare accent-free text against accent-free keywords.
    page_text = strip_accents(soup.get_text(" ", strip=True).lower())
    tipos_encontrados = {
        tipo
        for palavra, tipo in tipos_mapeamento.items()
        if strip_accents(palavra) in page_text
    }

    if not tipos_encontrados:
        tipos_encontrados = {"mista", "oleosa", "seca", "sensivel"}

    order_map = {name: i for i, name in enumerate(SKIN_TYPE_CANONICAL_ORDER or ())}
    unknown_rank = len(order_map)
    # The name is a tie-breaker so equal ranks sort deterministically.
    tipos_list = sorted(
        tipos_encontrados, key=lambda x: (order_map.get(x, unknown_rank), x)
    )
    return "; ".join(tipos_list)

def download_image(soup, product_name):
    """Download the product image into ``IMG_DIR`` and return its file name.

    Selectors are tried in order. For the first node that carries a usable
    address, ``src`` is preferred over ``data-src``. Inline ``data:`` URIs are
    skipped because they cannot be fetched over HTTP; the next attribute or
    selector is tried instead. The file is named after the slugified product
    name plus the extension found in the URL path (``.jpg`` when there is
    none).

    Args:
        soup: Parsed product page.
        product_name: Product name used to build the file name.

    Returns:
        The saved file name (relative to ``IMG_DIR``), or ``""`` when no image
        address was found or the download/write failed.
    """
    selectors = (
        "img.vtex-store-components-3-x-productImageTag",
        "img.product-image",
        "img[src*='/arquivos/']",
        "img[src*='cdn']",
    )

    src = None
    for sel in selectors:
        node = soup.select_one(sel)
        if not node:
            continue
        candidates = (node.get("src"), node.get("data-src"))
        src = next(
            (c for c in candidates if c and not c.startswith("data:")),
            None,
        )
        if src:
            break
    if not src:
        return ""

    img_url = src if src.startswith("http") else urljoin(BASE_URL, src)
    ext = os.path.splitext(urlparse(img_url).path)[1] or ".jpg"
    fname = f"{slugify(product_name)}{ext}"
    fpath = os.path.join(IMG_DIR, fname)

    try:
        r = SESSION.get(img_url, timeout=25)
        if r.status_code == 200 and r.content:
            with open(fpath, "wb") as f:
                f.write(r.content)
            return fname
        # Report unusable responses instead of failing silently.
        print(f"[IMG] {img_url} -> status {r.status_code}")
    except Exception as e:
        # Best-effort: a missing image must not abort the product scrape.
        print(f"[IMG] Falha ao baixar {img_url}: {e}")
    return ""

def guess_category(url: str, name: str) -> str:
    """Guess a product category from its name, falling back to its URL.

    The lowercased name is checked against an ordered keyword table and the
    category of the first keyword found as a substring is returned. When no
    keyword matches, the first two dash-separated tokens of the last URL path
    segment (with ``.p`` and ``.html`` removed) are used instead; the result
    may be ``""``.
    """
    lowered_name = name.lower()
    # Order matters: earlier keywords take precedence.
    keyword_to_category = (
        ("protetor", "protetor solar"),
        ("fps", "protetor solar"),
        ("bastão", "bastão"),
        ("bastao", "bastão"),
        ("sérum", "sérum"),
        ("serum", "sérum"),
        ("creme", "creme"),
        ("hidratante", "hidratante"),
        ("gel de limpeza", "gel de limpeza"),
        ("sabonete", "sabonete"),
        ("tônico", "tônico"),
        ("tonico", "tônico"),
        ("óleo", "óleo"),
        ("oleo", "óleo"),
        ("emulsão", "emulsão"),
        ("emulsao", "emulsão"),
        ("lip balm", "lip balm"),
        ("bálsamo", "bálsamo"),
        ("peel", "peel"),
        ("esfoliante", "esfoliante"),
        ("limpador", "limpador"),
        ("fragrância", "fragrância"),
        ("body", "creme corporal"),
        ("corporal", "creme corporal"),
        ("capilar", "capilar"),
    )

    by_name = next(
        (
            category
            for keyword, category in keyword_to_category
            if keyword in lowered_name
        ),
        None,
    )
    if by_name is not None:
        return by_name

    # Fallback: derive a label from the product slug in the URL.
    last_segment = urlparse(url).path.strip("/").split("/")[-1]
    stem = last_segment.replace(".p", "").replace(".html", "")
    cat = " ".join(stem.split("-")[:2]).strip()
    if not cat:
        return ""
    return normalize_space(cat)

def parse_product_page(url, fallback_category=""):
    """Scrape one product page into a flat record.

    Args:
        url: Product page address.
        fallback_category: Category to use as-is; when empty, the category is
            guessed from the name/URL.

    Returns:
        A dict with the keys ``site``, ``categoria``, ``nome``, ``subtitulo``,
        ``preco``, ``beneficios``, ``ingredientes``, ``tamanho``,
        ``tipos_pele``, ``imagem`` and ``_source_url``; or ``None`` when the
        page could not be fetched, has no name, or is excluded by keyword.
        The product image is downloaded as a side effect.
    """
    soup = get_soup(url)
    if soup is None:
        return None

    # --- name (with <title> as a last resort) -----------------------------
    name_selectors = [
        "h1.vtex-store-components-3-x-productName",
        "h1.productName",
        "h1",
        "div.product-name h1",
    ]
    name = find_all_text(soup, name_selectors)
    if not name and soup.title and soup.title.string:
        name = soup.title.string.split("|")[0].strip()
    if not name:
        return None

    if looks_excluded(name) or looks_excluded(url):
        print(f"[SKIP] Produto excluído: {name}")
        return None

    # --- subtitle: only short blurbs are kept ------------------------------
    subtitle_selectors = [
        "span.vtex-product-summary-2-x-description-short div",
        "span.vtex-product-summary-2-x-description-short",
        "div.vtex-rich-text-0-x-container p",
        "div.productDescription",
        "div.product-brief",
    ]
    subtitle = find_all_text(soup, subtitle_selectors)
    if not subtitle or len(subtitle) > 220:
        subtitle = ""

    # --- price -------------------------------------------------------------
    price_selectors = [
        "span.vtex-product-price-1-x-sellingPriceValue",
        "span.selling-price",
        "span.price",
    ]
    price = parse_price_to_str(find_all_text(soup, price_selectors))

    # --- derived attributes --------------------------------------------------
    beneficios = extract_benefits(soup)
    ingredientes = extract_ingredients(soup)
    tipos_pele = extract_tipos_pele(soup)

    # --- size: name first, then subtitle, then the description block -------
    size = extract_size_from_text(name) or extract_size_from_text(subtitle)
    if not size:
        description_selectors = [
            "div.vtex-store-components-3-x-productDescriptionText",
            "div.productDescription",
            "section#descricao",
        ]
        size = extract_size_from_text(find_all_text(soup, description_selectors))

    categoria = fallback_category or guess_category(url, name)
    img_name = download_image(soup, name)

    record = {
        "site": "creamy",
        "categoria": categoria,
        "nome": name.strip(),
        "subtitulo": subtitle or "",
        "preco": price or "",
        "beneficios": beneficios.lower() if beneficios else "",
        "ingredientes": ingredientes.lower() if ingredientes else "",
        "tamanho": size or "",
        "tipos_pele": tipos_pele,
        "imagem": img_name,
        "_source_url": url,
    }
    return record

def listing_get_product_links(page_url: str):
    """Return the sorted, de-duplicated product URLs linked from a listing page.

    Every ``a[href]`` is made absolute against ``BASE_URL`` (unless it
    already starts with ``http``) and kept when its URL contains ``/p`` at the
    very end or immediately before a ``?``. Returns ``[]`` when the page
    cannot be fetched.
    """
    soup = get_soup(page_url)
    if soup is None:
        return []

    found = set()
    for anchor in soup.select("a[href]"):
        href = anchor.get("href")
        if href:
            absolute = href if href.startswith("http") else urljoin(BASE_URL, href)
            if re.search(r"/p($|\?)", absolute):
                found.add(absolute)
    return sorted(found)

def run_scraper():
    """Crawl listing pages 1..``MAX_PAGES`` and scrape every product found.

    Each product URL is processed at most once per run, whether or not it
    yields an item. Listing pages can repeat the same links, and a URL that
    already failed or was skipped would otherwise be downloaded again on
    every later page. URLs matching the exclusion keywords are never fetched.

    Returns:
        List of product records as produced by ``parse_product_page``.
    """
    visited = set()
    items = []

    for page in range(1, MAX_PAGES + 1):
        url = LISTING_URL_TEMPLATE.format(page=page)
        print(f"[LIST] {url}")
        prod_links = listing_get_product_links(url)
        print(f"  - {len(prod_links)} links")

        for purl in prod_links:
            if purl in visited or looks_excluded(purl):
                continue
            # Mark before parsing so failures are not retried on later pages
            # (get_soup already retries each request internally).
            visited.add(purl)
            item = parse_product_page(purl)
            if item:
                items.append(item)
                print(f"  [+] {item['nome']} :: {item['preco']}")
            # Fixed pause between product requests.
            time.sleep(0.6)
    return items

def save_outputs(items, json_path=None, csv_path=None):
    """Write the scraped records to a JSON file and a CSV file.

    Only the public columns are exported; any other key in a record (such as
    ``_source_url``) is dropped and missing columns are filled with ``""``.

    Args:
        items: Iterable of product dicts.
        json_path: Destination of the JSON file; defaults to ``OUT_JSON``.
        csv_path: Destination of the CSV file; defaults to ``OUT_CSV``.
    """
    # Fall back to the module-level output names instead of duplicating them.
    if json_path is None:
        json_path = OUT_JSON
    if csv_path is None:
        csv_path = OUT_CSV

    cols = [
        "site", "categoria", "nome", "subtitulo", "preco", "beneficios",
        "ingredientes", "tamanho", "tipos_pele", "imagem",
    ]
    clean = [{k: it.get(k, "") for k in cols} for it in items]

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(clean, f, ensure_ascii=False, indent=2)
    print(f"[OK] JSON salvo em {json_path} ({len(clean)} itens)")

    # newline="" lets the csv module control line endings itself.
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        w.writerows(clean)
    print(f"[OK] CSV salvo em {csv_path} ({len(clean)} linhas)")

# Run the scraper: crawl the listings, persist the results, then show a sample.
if __name__ == "__main__":
    items = run_scraper()
    print(f"Total coletado: {len(items)}")
    save_outputs(items)

    # Print the first record (including internal keys) as a quick sanity check.
    if items:
        print("Prévia do primeiro item:")
        print(json.dumps(items[0], ensure_ascii=False, indent=2))


[LIST] https://www.creamy.com.br/produtos?page=1
  - 10 links
  [+] Ácido Glicólico :: 84.20
  [+] Ácido Lático :: 73.67
  [+] Ácido Mandélico :: 84.20
  [+] Ácido Salicílico :: 94.73
  [+] Calming Cream :: 52.62
  [+] Gel de Limpeza :: 63.15
  [+] Protetor  Solar FPS 60 Watery Lotion :: 63.15
  [+] Sérum Facial Retinal :: 136.83
  [+] Vitamina C Gold :: 126.31
  [+] Vitamina C :: 105.25
[LIST] https://www.creamy.com.br/produtos?page=2
  - 11 links
  [+] Sérum Hidratante Facial :: 63.15
  [+] Lip Balm Incolor :: 31.49
  [+] Niacinamide B Complex 20% - 30ml :: 94.73
[LIST] https://www.creamy.com.br/produtos?page=3
  - 11 links
  [+] Retinol :: 105.25
[LIST] https://www.creamy.com.br/produtos?page=4
  - 11 links
[LIST] https://www.creamy.com.br/produtos?page=5
  - 11 links
[LIST] https://www.creamy.com.br/produtos?page=6
  - 11 links
[LIST] https://www.creamy.com.br/produtos?page=7
  - 11 links
[LIST] https://www.creamy.com.br/produtos?page=8
  - 11 links
  [+] Glicointense Peel :: 105.2