In [1]:
# -*- coding: utf-8 -*-
# === NIVEA (Rosto) Scraper - célula única para Jupyter/IPYNB ===
import sys, os, re, json, time, unicodedata, io
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin, urlparse, urlencode, parse_qs, urlunparse

import requests
import pandas as pd
from bs4 import BeautifulSoup

# ===== seus dicionários / regras =====
sys.path.append(os.path.abspath("/home/usuario/Área de trabalho/Dados/models"))
from skin import SKIN_TYPE_CANONICAL_ORDER, SKIN_TYPE_SYNONYMS_PT
from exclude import EXCLUDE_KEYWORDS
from ingredient import INGREDIENTES_VALIDOS
from benefits import BENEFIT_SYNONYMS_PT, BENEFIT_CANONICAL_ORDER
from category import CATEGORY_CANONICAL_ORDER, CATEGORY_HINTS

# ================== Config ==================
# Site root, used to turn root-relative links found in pages into absolute URLs.
BASE_URL = "https://www.nivea.com.br"
# Category listing page where link collection starts.
LISTING_URL = "https://www.nivea.com.br/produtos/rosto"

# Output files written by main(); relative paths, so they land in the working directory.
JSON_PATH = Path("nivea_products.json")
CSV_PATH  = Path("nivea_products.csv")

# Folder that receives downloaded product images; created at import time if missing.
IMG_DIR = Path("./images/nivea")
IMG_DIR.mkdir(parents=True, exist_ok=True)

# Shared HTTP session used by the fetch helpers below; these headers are sent with its requests.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
    "Accept-Language": "pt-BR,pt;q=0.9,en;q=0.1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
})

# Non-breaking space; _norm() swaps it for a regular space.
NBSP = "\xa0"
# Brazilian currency amount such as "R$ 1.234,56"; group 1 captures the numeric part.
CURRENCY_RE = re.compile(r"R\$\s*([\d\.]+,\d{2})")
# Number (optional decimal part) followed by a unit; group 1 = number, group 2 = unit. Case-insensitive.
QTY_RE      = re.compile(r"(\d+(?:[.,]\d+)?)\s*(ml|g|l|kg|un|unid|unidades)\b", re.I)
# One srcset candidate with a width descriptor ("<url> <n>w"); group 1 = URL, group 2 = width.
SRCSET_RE   = re.compile(r"\s*(\S+)\s+(\d+)w\s*")

# ================== Utils ==================
def _strip(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()

def _norm(s: str) -> str:
    """Return a comparison key for *s*: accent-free, lowercase, single-spaced.

    Non-breaking spaces become plain spaces, the text is NFKD-decomposed and
    its combining marks dropped, and every run of whitespace, ``_``, ``/``,
    ``,`` or ``-`` is replaced by one space. A falsy *s* yields ``""``.
    """
    text = (s or "").replace(NBSP, " ")
    decomposed = unicodedata.normalize("NFKD", text)
    # Combining class 0 means "not a combining mark"; keep only those characters.
    base_chars = "".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)
    spaced = re.sub(r"[\s_/,-]+", " ", base_chars)
    return spaced.strip().lower()

def sanitize_filename(s: str) -> str:
    """Turn *s* into a filesystem-friendly slug.

    The text is normalized with ``_norm``; every run of characters outside
    ``a-z0-9._-`` becomes a single ``-``, repeated dashes are merged and
    leading/trailing dashes removed. Returns ``"img"`` when nothing is left.
    """
    slug = re.sub(r"[^a-z0-9\._-]+", "-", _norm(s))
    slug = re.sub(r"-{2,}", "-", slug).strip("-")
    if not slug:
        return "img"
    return slug

def br_money_to_float_str(text: str) -> Optional[str]:
    """Extract the first ``R$`` amount in *text* as a two-decimal string.

    Returns ``None`` when *text* is empty, holds no amount matching
    ``CURRENCY_RE``, or the captured digits cannot be parsed as a float.
    """
    match = CURRENCY_RE.search(text) if text else None
    if match is None:
        return None
    # Brazilian format: "." groups thousands and "," marks decimals.
    digits = match.group(1).replace(".", "").replace(",", ".")
    try:
        amount = float(digits)
    except ValueError:
        return None
    return f"{amount:.2f}"

def extract_quantity(text: str) -> Optional[str]:
    """Return the first quantity found in *text*, e.g. ``"100g"`` or ``"2 un"``.

    The decimal comma becomes a dot and the unit is lowercased; ``unid`` and
    ``unidades`` are folded into ``un``. Measurement units (ml, g, l, kg) are
    glued to the number, any other unit is separated by a space. Returns
    ``None`` when *text* is empty or ``QTY_RE`` finds nothing.
    """
    found = QTY_RE.search(text) if text else None
    if found is None:
        return None
    number = found.group(1).replace(",", ".")
    unit = found.group(2).lower()
    if unit in ("unid", "unidades"):
        unit = "un"
    separator = "" if unit in {"ml", "g", "l", "kg"} else " "
    return f"{number}{separator}{unit}"

def should_exclude(name: str) -> bool:
    """Tell whether the product *name* contains any exclusion keyword.

    Keywords come from ``EXCLUDE_KEYWORDS`` plus a fixed list of bundle/gift
    terms. Matching is a substring test on the ``_norm``-normalized forms.
    """
    haystack = _norm(name)
    bundle_terms = ["kit", "refil", "refill", "combo", "duo", "trio", "necessaire", "presente", "gift"]
    return any(_norm(kw) in haystack for kw in list(EXCLUDE_KEYWORDS) + bundle_terms)

def get_soup(url: str, referer: Optional[str] = None) -> Optional[BeautifulSoup]:
    """Fetch *url* through the shared session and parse it with lxml.

    Up to three attempts are made. An attempt fails on any
    ``requests.RequestException`` or on a status other than 200. The pause
    between attempts grows by one second each time (1s, then 2s); there is
    no pause after the last attempt, since nothing follows it.

    Args:
        url: Page to download.
        referer: Optional value for the ``Referer`` header.

    Returns:
        The parsed document, or ``None`` when every attempt failed.
    """
    attempts = 3
    # Headers do not change between attempts, so build them once.
    headers = SESSION.headers.copy()
    if referer:
        headers["Referer"] = referer
    for attempt in range(attempts):
        try:
            response = SESSION.get(url, timeout=40, headers=headers)
            if response.status_code == 200:
                return BeautifulSoup(response.text, "lxml")
        except requests.RequestException:
            # Best effort: a network error just counts as a failed attempt.
            pass
        if attempt < attempts - 1:
            time.sleep(1 + attempt)
    return None

# ================== Paginação (Carregar mais) ==================
def discover_ajax_url(soup: BeautifulSoup) -> Optional[str]:
    """Read the ``data-ajax-url`` attribute of the "load more" button.

    Returns ``None`` when the button is absent or the attribute is blank.
    A value starting with ``/`` is resolved against ``BASE_URL``; any other
    value is returned as found.
    """
    button = soup.select_one("button.nx-btn--load-more[data-ajax-url]")
    if button is None:
        return None
    target = button.get("data-ajax-url", "").strip()
    if not target:
        return None
    return urljoin(BASE_URL, target) if target.startswith("/") else target

def update_skip_param(url: str, skip: int) -> str:
    """Return *url* with its ``skip`` query parameter set to *skip*.

    All other query parameters are preserved, including parameters with a
    blank value (``flag=``) and parameters that repeat (``tag=a&tag=b``).
    An existing ``skip`` keeps its position; otherwise it is appended.

    Args:
        url: URL whose query string should be updated.
        skip: New offset value.

    Returns:
        The rebuilt URL.
    """
    pr = urlparse(url)
    # keep_blank_values: parse_qs drops "key=" pairs by default.
    qs = parse_qs(pr.query, keep_blank_values=True)
    qs["skip"] = [str(skip)]
    # doseq: emit one "key=value" pair per list item instead of only the first.
    new_q = urlencode(qs, doseq=True)
    return urlunparse((pr.scheme, pr.netloc, pr.path, pr.params, new_q, pr.fragment))

def fetch_listing_html(url: str, referer: Optional[str] = None) -> Optional[str]:
    """Download *url* once and return the response body as text.

    Returns ``None`` on any ``requests.RequestException`` or when the status
    code is not 200. No retry is performed.
    """
    request_headers = SESSION.headers.copy()
    if referer:
        request_headers["Referer"] = referer
    try:
        response = SESSION.get(url, timeout=40, headers=request_headers)
        return response.text if response.status_code == 200 else None
    except requests.RequestException:
        return None

def collect_product_urls() -> List[str]:
    """Gather every product link from the listing and its "load more" pages.

    The first page is fetched with ``get_soup``; further pages are requested
    from the AJAX URL advertised by the page, advancing the ``skip`` offset by
    the number of new links each call produced. Paging stops when a call
    returns nothing usable, adds no new link, or after 20 calls.

    Returns:
        The unique product URLs, sorted; empty when the listing cannot be
        loaded.
    """
    first_page = get_soup(LISTING_URL)
    if not first_page:
        return []

    found = set()
    product_link = re.compile(r"/produtos/.*\.html(\?|$)")

    def harvest(page: BeautifulSoup) -> None:
        # Add every product-teaser link of *page* to `found`.
        for anchor in page.select("a.nx-product-teaser__link-wrapper[href]"):
            link = anchor["href"].strip()
            if link.startswith("/"):
                link = urljoin(BASE_URL, link)
            if product_link.search(link):
                found.add(link)

    harvest(first_page)

    ajax_base = discover_ajax_url(first_page)
    if not ajax_base:
        # No "load more" button: everything may already be on the first page.
        return sorted(found)

    # The initial offset is the number of cards already seen (21 if none were found).
    skip = len(found) or 21
    for _ in range(20):  # hard cap as a safety net
        fragment_html = fetch_listing_html(update_skip_param(ajax_base, skip), referer=LISTING_URL)
        if not fragment_html or "<" not in fragment_html:
            break
        known = len(found)
        harvest(BeautifulSoup(fragment_html, "lxml"))
        gained = len(found) - known
        if gained <= 0:
            break
        # Advance by what this call actually contributed.
        skip += gained

    return sorted(found)

# ================== PDP: campos ==================
def extract_name(soup: BeautifulSoup) -> Optional[str]:
    """Return the whitespace-collapsed product headline (``h1``), or ``None`` if the page has none."""
    headline = soup.select_one("h1.nx-product-stage__headline")
    return _strip(headline.get_text(" ", strip=True)) if headline else None

def extract_subtitle(soup: BeautifulSoup) -> Optional[str]:
    """Return the whitespace-collapsed product description paragraph, or ``None`` if absent."""
    paragraph = soup.select_one("p.nx-product-information__description")
    if not paragraph:
        return None
    return _strip(paragraph.get_text(" ", strip=True))

def extract_quantity_from_variation(soup: BeautifulSoup) -> Optional[str]:
    """Find the product quantity, preferring the size-variation labels.

    The first ``.product-variation__size`` element whose text contains a
    quantity wins; if none does, the product name is searched instead.
    """
    for size_el in soup.select(".product-variation__size"):
        quantity = extract_quantity(_strip(size_el.get_text(" ", strip=True)))
        if quantity:
            return quantity
    # Fallback: the quantity is sometimes part of the product name.
    return extract_quantity(extract_name(soup) or "")

def extract_price(soup: BeautifulSoup) -> Optional[str]:
    """Return the first ``R$`` price found in price-like elements, else ``None``.

    Generic fallback: elements are probed selector by selector, in document
    order, and the first text that parses as a price is returned.
    """
    selectors = ("[class*='price']", ".nx-price", ".price")
    parsed = (
        br_money_to_float_str(el.get_text(" ", strip=True))
        for selector in selectors
        for el in soup.select(selector)
    )
    return next((value for value in parsed if value), None)

def extract_skin_text(soup: BeautifulSoup) -> str:
    """Collect the text that may describe the skin type.

    Joins with ``" | "`` the text of the element following a "Tipo de pele"
    heading (h3-h5), when such a heading exists, and the product subtitle,
    when present. Returns ``""`` if neither is available.
    """
    def _is_skin_heading(tag) -> bool:
        return tag.name in ["h5", "h4", "h3"] and _norm(tag.get_text()).startswith("tipo de pele")

    pieces = []
    heading = soup.find(_is_skin_heading)
    # find_next() yields the next element in document order after the heading.
    following = heading.find_next() if heading else None
    if following:
        pieces.append(_strip(following.get_text(" ", strip=True)))

    description = extract_subtitle(soup)
    if description:
        pieces.append(description)
    return " | ".join(filter(None, pieces))

def map_skin_types_from_text(text: str) -> List[str]:
    """Map free text to canonical skin types.

    Text mentioning "todos os tipos de pele" maps straight to
    ``["todos os tipos"]``. Otherwise a canonical type is selected when its
    own name or any of its synonyms occurs (normalized substring) in the
    text, and the result follows ``SKIN_TYPE_CANONICAL_ORDER``. With no hit,
    ``["todos os tipos"]`` is returned if the text mentions "pele", else ``[]``.
    """
    haystack = _norm(text)
    if "todos os tipos de pele" in haystack:
        return ["todos os tipos"]

    matched = {
        canonical
        for canonical, synonyms in SKIN_TYPE_SYNONYMS_PT.items()
        if any(_norm(alias) in haystack for alias in synonyms + [canonical])
    }
    ordered = [skin for skin in SKIN_TYPE_CANONICAL_ORDER if skin in matched]
    if ordered:
        return ordered
    return ["todos os tipos"] if "pele" in haystack else []

def extract_benefits(soup: BeautifulSoup) -> List[str]:
    """Return the canonical benefits mentioned on the page.

    Candidate texts come from ``h5.nx-benefit__label``,
    ``.nx-benefits-list__item`` and any ``li``; only those containing one of
    the trigger terms are kept. A canonical benefit is reported when its name
    or one of its ``BENEFIT_SYNONYMS_PT`` entries appears as a whole word in
    the normalized candidates. The result follows ``BENEFIT_CANONICAL_ORDER``.
    """
    trigger_terms = ["hidrata", "prote", "uva", "uvb", "absor", "matte", "oleos", "antissinais", "uniformiza", "acalma", "limpa", "renova", "esfolia"]

    def _has_trigger(text: str) -> bool:
        normalized = _norm(text)
        return any(term in normalized for term in trigger_terms)

    texts = (
        _strip(el.get_text(" ", strip=True))
        for el in soup.select("h5.nx-benefit__label, .nx-benefits-list__item, li")
    )
    # Generic bullets are accepted as long as they carry a useful term.
    kept = [text for text in texts if text and _has_trigger(text)]

    haystack = " " + " ; ".join(_norm(text) for text in kept) + " "

    def _mentioned(canonical: str) -> bool:
        aliases = BENEFIT_SYNONYMS_PT.get(canonical, []) + [canonical]
        return any(re.search(rf"\b{re.escape(_norm(alias))}\b", haystack) for alias in aliases)

    return [canonical for canonical in BENEFIT_CANONICAL_ORDER if _mentioned(canonical)]

def extract_ingredients_text(soup: BeautifulSoup) -> str:
    """Return the longest text block that looks like an ingredient list.

    First choice: inside the section headed "Lista de ingredientes" (h2-h4),
    texts of ``.nx-ingredients__section`` / ``p`` elements with at least three
    commas or a slash. Fallback when nothing is found there: the longest
    ``p``/``div``/``span`` text on the page with at least six commas or a
    slash. Returns ``""`` when no block qualifies.
    """
    def _is_ingredients_heading(tag) -> bool:
        return tag.name in ["h2", "h3", "h4"] and "lista de ingredientes" in _norm(tag.get_text())

    candidates = []
    heading = soup.find(_is_ingredients_heading)
    if heading:
        # Prefer the expandable-section wrapper; otherwise use the direct parent.
        section = heading.find_parent(class_="nx-expand-section__container") or heading.find_parent()
        if section:
            for node in section.select(".nx-ingredients__section, p"):
                text = _strip(node.get_text(" ", strip=True))
                if text and (text.count(",") >= 3 or "/" in text):
                    candidates.append(text)

    if not candidates:
        # Fallback: any long, list-like block anywhere on the page.
        page_texts = (_strip(el.get_text(" ", strip=True)) for el in soup.select("p, div, span"))
        list_like = [text for text in page_texts if text.count(",") >= 6 or "/" in text]
        if list_like:
            candidates.append(max(list_like, key=len))

    return max(candidates, key=len) if candidates else ""

def tokenize_and_filter_ingredients(raw_text: str) -> List[str]:
    """Split an ingredient list and keep only entries from ``INGREDIENTES_VALIDOS``.

    The text is split on ``,`` and ``/``. Each token is compared, in
    ``_norm``-normalized form, against the valid ingredients: an exact match
    wins; otherwise the first valid name that contains the token, or is
    contained in it, is used. Containment requires both sides to be at least
    four characters long, so a token that normalizes to nothing (e.g. ``-``)
    or to a couple of characters cannot match an arbitrary ingredient.

    Args:
        raw_text: Ingredient list as found on the page.

    Returns:
        Canonical ingredient names in order of first appearance, without
        duplicates; empty when *raw_text* is empty.
    """
    if not raw_text:
        return []

    min_fuzzy_len = 4
    valid_norm_map = {_norm(v): v for v in INGREDIENTES_VALIDOS}
    out, seen = [], set()
    for piece in re.split(r"[,/]", raw_text):
        tok = _strip(piece).strip(".:;")
        if not tok:
            continue
        key = _norm(tok)
        if not key:
            # Punctuation-only token: "" is a substring of every name.
            continue
        matched = valid_norm_map.get(key)
        if matched is None:
            # Approximate match by containment, in either direction.
            for k_valid, v_canon in valid_norm_map.items():
                if len(k_valid) < min_fuzzy_len:
                    continue
                if k_valid in key or (len(key) >= min_fuzzy_len and key in k_valid):
                    matched = v_canon
                    break
        if matched:
            dedup_key = _norm(matched)
            if dedup_key not in seen:
                seen.add(dedup_key)
                out.append(matched)
    return out

def classify_category(name: Optional[str], subtitle: Optional[str]) -> Optional[str]:
    """Pick a category for the product from its name and subtitle.

    A category is a candidate when any of its ``CATEGORY_HINTS`` occurs
    (normalized substring) in the combined text. Among candidates, the first
    one listed in ``CATEGORY_CANONICAL_ORDER`` is returned; if none is listed
    there, the first candidate found. Returns ``None`` without candidates.
    """
    haystack = _norm(f"{name or ''} {subtitle or ''}")
    candidates = [
        category
        for category, hints in CATEGORY_HINTS.items()
        if any(_norm(hint) in haystack for hint in hints)
    ]
    if not candidates:
        return None
    return next(
        (category for category in CATEGORY_CANONICAL_ORDER if category in candidates),
        candidates[0],
    )

# ================== Imagens ==================
def _parse_srcset(srcset: str) -> List[Tuple[str, int]]:
    """Parse a ``srcset`` attribute into ``(url, width)`` pairs.

    Candidates are separated by commas. A candidate matching ``SRCSET_RE``
    ("<url> <n>w") yields its declared width; any other non-empty candidate
    yields its first whitespace-separated token with width ``0``. Empty
    candidates, as produced by a trailing or doubled comma, are skipped
    instead of raising ``IndexError``.

    Args:
        srcset: Raw attribute value; may be empty or ``None``.

    Returns:
        The pairs in source order; an empty list for empty input.
    """
    out: List[Tuple[str, int]] = []
    if not srcset:
        return out
    for part in srcset.split(","):
        part = part.strip()
        if not part:
            continue
        m = SRCSET_RE.match(part)
        if m:
            out.append((m.group(1), int(m.group(2))))
        else:
            # No width descriptor (e.g. "2x" or a bare URL): keep the URL only.
            out.append((part.split()[0], 0))
    return out

def _canonicalize_url(u: str) -> str:
    pr = urlparse(u)
    return urlunparse((pr.scheme or "https", pr.netloc, pr.path, "", pr.query, ""))

def extract_all_images(soup: BeautifulSoup) -> List[str]:
    """Collect product image URLs hosted on img.nivea.com, widest first.

    URLs are taken from ``picture > source[srcset]`` and from every ``img``
    (``src`` and ``srcset``). URLs containing a blacklisted term (logos,
    icons, sprites) are dropped. Each URL is canonicalized and remembered
    with the largest width declared for it; widths are looked up under the
    same canonical key they are stored under, so a smaller candidate cannot
    overwrite a larger one.

    Returns:
        Canonical URLs sorted by width, then by path length, both descending.
        Ties keep the order in which the URLs were first seen.
    """
    blacklist = ("logo", "icon", "icone", "sprite", "favicon", "/icons/")
    candidates: Dict[str, int] = {}

    def _register(raw_url: str, width: int) -> None:
        # Record raw_url under its canonical form, keeping the largest width seen.
        if not raw_url or "img.nivea.com" not in raw_url:
            return
        lowered = raw_url.lower()
        if any(term in lowered for term in blacklist):
            return
        key = _canonicalize_url(raw_url)
        candidates[key] = max(candidates.get(key, 0), width)

    def _register_img(img) -> None:
        # Register both the src and the srcset candidates of an <img>.
        _register(img.get("src") or "", 0)
        for url, w in _parse_srcset(img.get("srcset", "")):
            _register(url, w)

    # picture > source + img
    for pic in soup.select("picture"):
        for source in pic.select("source[srcset]"):
            for url, w in _parse_srcset(source.get("srcset", "")):
                _register(url, w)
        img = pic.find("img")
        if img:
            _register_img(img)

    # Stand-alone images as well.
    for img in soup.select("img"):
        _register_img(img)

    def score(u: str) -> Tuple[int, int]:
        return (candidates[u], len(urlparse(u).path))

    # Sorting the dict (insertion-ordered) keeps ties deterministic.
    return sorted(candidates, key=score, reverse=True)

def _ext_from_url(u: str) -> str:
    p = urlparse(u).path.lower()
    for ext in (".jpg", ".jpeg", ".png", ".webp"):
        if p.endswith(ext):
            return ext
    return ".jpg"

def download_images(img_urls: List[str], product_name: str, referer: str) -> List[str]:
    """Download each image into ``IMG_DIR`` and return the saved file names.

    Files are named ``<slug>-<position><ext>``, where the slug comes from the
    product name and the position is the 1-based index in *img_urls*. A URL is
    skipped when the request fails, the status is not 200, the content type is
    not an image, or the payload is under 6000 bytes.
    """
    stem = sanitize_filename(product_name or "produto")
    saved = []
    for position, link in enumerate(img_urls, start=1):
        if not link:
            continue
        # Make protocol-relative and root-relative links absolute.
        if link.startswith("//"):
            link = "https:" + link
        elif link.startswith("/"):
            link = urljoin(BASE_URL, link)

        file_name = f"{stem}-{position}{_ext_from_url(link)}"
        target = IMG_DIR / file_name
        try:
            request_headers = SESSION.headers.copy()
            request_headers["Referer"] = referer
            response = SESSION.get(link, timeout=40, headers=request_headers)
            if response.status_code != 200:
                continue
            content_type = (response.headers.get("Content-Type") or "").lower()
            # Very small payloads are treated as placeholders.
            if not content_type.startswith("image") or len(response.content) < 6000:
                continue
            target.write_bytes(response.content)
            saved.append(file_name)
        except requests.RequestException:
            continue
    return saved

# ================== Parse completo de um PDP ==================
def parse_nivea_product(url: str) -> Optional[Dict]:
    """Scrape one product page into a flat record.

    Returns ``None`` when the page cannot be fetched, has no product name, or
    the name matches an exclusion keyword. Otherwise all images are
    downloaded and the record is returned; list fields are joined with
    ``"; "`` and empty fields are ``None``.
    """
    page = get_soup(url, referer=LISTING_URL)
    if not page:
        return None

    name = extract_name(page)
    if not name or should_exclude(name):
        return None

    subtitle = extract_subtitle(page)
    quantity = extract_quantity_from_variation(page)
    price = extract_price(page)  # may be None: the site may show no price
    skin_types = map_skin_types_from_text(extract_skin_text(page))
    benefits = extract_benefits(page)
    ingredients = tokenize_and_filter_ingredients(extract_ingredients_text(page))
    category = classify_category(name, subtitle)

    saved_files = download_images(extract_all_images(page), name, referer=url)

    def _joined(values: List[str]) -> Optional[str]:
        return "; ".join(values) if values else None

    return {
        "marca": "nivea",
        "nome": name,
        "subtitulo": subtitle or None,
        "categoria": category,
        "quantidade": quantity,
        "preco": price,
        "beneficios": _joined(benefits),
        "ingredientes": _joined(ingredients),
        "tipo_pele": _joined(skin_types),
        # First saved file; the others stay in the image folder.
        "imagem": saved_files[0] if saved_files else None,
        "url": url,
    }

# ================== Main ==================
def main():
    """Collect the product links, scrape each page and write the JSON and CSV outputs.

    Progress is printed per URL. Any exception raised while scraping a page
    is reported by class name and the run continues with the next URL.
    """
    print("[NIVEA] Coletando links da listagem…")
    links = sorted(set(collect_product_urls()))
    total = len(links)
    print(f"[NIVEA] Total de links coletados: {total}")

    records = []
    visited = set()
    for position, link in enumerate(links, start=1):
        if link in visited:
            print(f"[{position}/{total}] pulado (URL repetida) - {link}")
            continue
        visited.add(link)
        try:
            record = parse_nivea_product(link)
        except Exception as exc:
            outcome = f"erro:{type(exc).__name__}"
        else:
            if record:
                records.append(record)
                outcome = "ok"
            else:
                outcome = "descartado"
        print(f"[{position}/{total}] {outcome} - {link}")
        # Short pause between product pages.
        time.sleep(0.15)

    with JSON_PATH.open("w", encoding="utf-8") as fh:
        json.dump(records, fh, ensure_ascii=False, indent=2)
    pd.DataFrame(records).to_csv(CSV_PATH, index=False, encoding="utf-8-sig")

    print(f"Salvo JSON: {JSON_PATH}")
    print(f"Salvo CSV : {CSV_PATH}")
    print(f"Itens válidos: {len(records)}")

if __name__ == "__main__":
    main()


  from pandas.core import (


[NIVEA] Coletando links da listagem…
[NIVEA] Total de links coletados: 76
[1/76] ok - https://www.nivea.com.br/produtos/nivea-agua-micelar-solu%c3%a7%c3%a3o-de-limpeza-7-em-1-efeito-matte-40059006620190033.html
[2/76] ok - https://www.nivea.com.br/produtos/nivea-cellular-luminous-630-antispot-antiolheiras-40059009293580033.html
[3/76] ok - https://www.nivea.com.br/produtos/nivea-complexo-de-repara%c3%a7%c3%a3o-noturna-luminous-630-40059009865590033.html
[4/76] ok - https://www.nivea.com.br/produtos/nivea-creme-antissinais-contorno-dos-olhos-q10-40059009159000033.html
[5/76] ok - https://www.nivea.com.br/produtos/nivea-creme-facial-antissinais-100g-423604140033.html
[6/76] ok - https://www.nivea.com.br/produtos/nivea-creme-facial-antissinais-dia-cellular-40059001398490033.html
[7/76] ok - https://www.nivea.com.br/produtos/nivea-creme-facial-antissinais-noite-cellular-40059001398560033.html
[8/76] ok - https://www.nivea.com.br/produtos/nivea-creme-facial-antissinais-q10-energy-noite-4005

KeyboardInterrupt: 