In [1]:
# -*- coding: utf-8 -*-
import sys, os, re, json, time, unicodedata
from pathlib import Path
from typing import List, Dict, Optional
from urllib.parse import urljoin, urlparse

import requests
import pandas as pd
from bs4 import BeautifulSoup

# ===== Adjust sys.path to point at your "models" folder =====
# The directory is appended so the bare-name imports that follow (skin,
# exclude, ingredient, benefits, category) can be resolved -- presumably those
# modules live in this folder; confirm on your machine.
# NOTE(review): this is an absolute, machine-specific path.
sys.path.append(os.path.abspath("/home/usuario/Área de trabalho/Dados/models"))

from skin import SKIN_TYPE_CANONICAL_ORDER, SKIN_TYPE_SYNONYMS_PT
from exclude import EXCLUDE_KEYWORDS
from ingredient import INGREDIENTES_VALIDOS
from benefits import BENEFIT_SYNONYMS_PT, BENEFIT_CANONICAL_ORDER
from category import CATEGORY_CANONICAL_ORDER, CATEGORY_HINTS

# -------------------- Config --------------------
# Site root; relative paths (robots.txt, image URLs) are joined against it.
BASE_URL = "https://www.natura.com.br"

# Save directly in the project root (the paths are relative, so in practice
# this is the current working directory).
JSON_PATH = Path("natura_products.json")
CSV_PATH  = Path("natura_products.csv")

# Image folder (outside outputs/); created at import time if it is missing.
IMG_DIR = Path("./images/natura")
IMG_DIR.mkdir(parents=True, exist_ok=True)

# One shared HTTP session, so every request sends the same browser-like
# User-Agent and a Portuguese-first Accept-Language header.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
    "Accept-Language": "pt-BR,pt;q=0.9,en;q=0.8",
})

NBSP = "\xa0"  # non-breaking space; _norm() replaces it with a plain space
# Captures the numeric part of a Brazilian price, e.g. "1.234,56" in "R$ 1.234,56".
CURRENCY_RE = re.compile(r"R\$\s*([\d\.]+,\d{2})")
# Captures (number, unit) for ml / g / l / kg / un / unid / unidades, case-insensitively.
QTY_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(ml|g|l|kg|un|unid|unidades)\b", re.I)

# -------------------- Helpers básicos --------------------
def _strip(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()

def _norm(s: str) -> str:
    """Return an accent-free, lowercase copy of ``s`` with normalised separators.

    Non-breaking spaces become plain spaces, combining marks are removed after
    NFKD decomposition, and runs of whitespace, ``_``, ``/``, ``,`` and ``-``
    collapse into a single space. A falsy input yields ``""``.
    """
    text = s if s else ""
    text = text.replace(NBSP, " ")
    decomposed = unicodedata.normalize("NFKD", text)
    # Dropping the combining marks is what strips the accents.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    text = "".join(base_chars)
    text = re.sub(r"[\s_/,-]+", " ", text)  # normalise separators
    return text.strip().lower()

def sanitize_filename(s: str) -> str:
    """Turn ``s`` into a filename stem made of ``a-z``, ``0-9``, ``.``, ``_`` and ``-``.

    Every other run of characters becomes a single hyphen, and leading or
    trailing hyphens are removed. Returns ``"img"`` when nothing is left.
    """
    slug = re.sub(r"[^a-z0-9\._-]+", "-", _norm(s))
    slug = re.sub(r"-{2,}", "-", slug)
    slug = slug.strip("-")
    if not slug:
        return "img"
    return slug

def br_money_to_float_str(text: str) -> Optional[str]:
    """Extract the first ``R$`` price in ``text`` as a two-decimal string.

    For example ``"R$ 1.234,56"`` gives ``"1234.56"``. Returns ``None`` when
    ``text`` is empty, holds no price, or the number cannot be parsed.
    """
    match = CURRENCY_RE.search(text) if text else None
    if match is None:
        return None
    # Brazilian format: "." groups thousands and "," marks the decimals.
    digits = match.group(1).replace(".", "")
    digits = digits.replace(",", ".")
    try:
        value = float(digits)
    except ValueError:
        return None
    return f"{value:.2f}"

def extract_quantity(text: str) -> Optional[str]:
    """Return the first quantity found in ``text`` in a compact form.

    Measured units are glued to the number (``"150ml"``, ``"15g"``), while
    counted units are normalised to ``un`` and separated by a space
    (``"2 un"``). Returns ``None`` when ``text`` is empty or has no quantity.
    """
    if not text:
        return None
    found = QTY_RE.search(text)
    if found is None:
        return None

    number = found.group(1).replace(",", ".")
    unit = found.group(2).lower()
    if unit in ("unid", "unidades"):
        unit = "un"

    if unit in {"ml", "g", "l", "kg"}:
        return f"{number}{unit}"
    return f"{number} {unit}"

def should_exclude(name: str) -> bool:
    """Tell whether ``name`` contains any exclusion keyword.

    Matching is substring-based on the normalised name. It uses the
    project-wide ``EXCLUDE_KEYWORDS`` plus a fixed list of bundle and gift terms.
    """
    bundle_terms = ["kit", "refil", "refill", "combo", "duo", "trio", "necessaire", "presente", "gift"]
    normalized_name = _norm(name)
    keywords = list(EXCLUDE_KEYWORDS) + bundle_terms
    return any(_norm(keyword) in normalized_name for keyword in keywords)

# -------------------- Coleta de URLs via sitemap --------------------
def get_sitemap_product_urls() -> List[str]:
    """Collect candidate skincare product URLs from the site's sitemap.

    The sitemap address is read from ``robots.txt``. Only ``<loc>`` entries
    containing ``/p/`` are kept, and they are then filtered by skincare slugs
    to cut down the number of product pages fetched later.

    Returns:
        A sorted, de-duplicated list of URLs, or ``[]`` when ``robots.txt``
        does not advertise the expected sitemap.

    Raises:
        requests.HTTPError: if either HTTP response has an error status.
    """
    robots = SESSION.get(urljoin(BASE_URL, "/robots.txt"), timeout=20)
    robots.raise_for_status()
    sitemap_match = re.search(r"Sitemap:\s*(\S+Feed-SiteMap)", robots.text)
    if sitemap_match is None:
        return []

    sitemap = SESSION.get(sitemap_match.group(1), timeout=40)
    sitemap.raise_for_status()
    tree = BeautifulSoup(sitemap.text, "xml")

    product_urls = set()
    for loc in tree.find_all("loc"):
        if "/p/" in loc.get_text():
            product_urls.add(loc.get_text(strip=True))

    # Filter skincare candidates by slug to reduce the number of requests.
    skin_slugs = ("chronos", "derma", "rosto", "protetor", "serum", "sérum",
                  "hidratante-facial", "creme-antissinais", "sabonete-facial",
                  "agua-micelar", "máscara", "mascara", "clareador", "antissinais", "antioxidante")
    selected = []
    for candidate in sorted(product_urls):
        lowered = candidate.lower()
        if any(slug in lowered for slug in skin_slugs):
            selected.append(candidate)
    return selected

def get_soup(url: str) -> Optional[BeautifulSoup]:
    """Fetch ``url`` and parse it with lxml, retrying transient failures.

    Up to three attempts are made, pausing 1 s and then 2 s between them.
    There is no pause after the last attempt, because nothing follows it.
    A 404 or 410 response ends the retries at once, since asking again
    cannot make a missing page appear.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The parsed document, or ``None`` when no attempt returned HTTP 200.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            resp = SESSION.get(url, timeout=30)
        except requests.RequestException:
            # Network-level failure: treat it as transient and retry.
            resp = None

        if resp is not None:
            if resp.status_code == 200:
                return BeautifulSoup(resp.text, "lxml")
            if resp.status_code in (404, 410):
                return None

        if attempt < max_attempts - 1:
            time.sleep(1 + attempt)
    return None

# -------------------- Extração por produto --------------------
def extract_name_subtitle_qty(soup: BeautifulSoup) -> "tuple[str, Optional[str], Optional[str]]":
    """Read the product name, its quantity-bearing subtitle and the quantity.

    The name is the text of the first ``<h1>``. The subtitle is the first
    ``<p>`` of at least six characters that contains a number followed by
    ``ml`` or ``g``. The previous check only looked for ``" g"`` or ``" ml"``,
    so any paragraph with a word starting with "g" (e.g. "creme gel") was
    accepted as the subtitle.

    Args:
        soup: Parsed product page.

    Returns:
        ``(name, subtitle, qty)``. ``name`` is ``""`` when there is no
        ``<h1>``; ``subtitle`` and ``qty`` are ``None`` when not found.
    """
    heading = soup.find("h1")
    name = _strip(heading.get_text()) if heading else ""

    # A number, optional spacing, then a whole-word "ml" or "g".
    qty_hint = r"\d+(?:[.,]\d+)?\s*(?:ml|g)\b"

    subtitle = None
    for p in soup.select("p.text-xs, p.text-low-emphasis, p"):
        txt = _strip(p.get_text())
        if len(txt) >= 6 and re.search(qty_hint, txt, re.I):
            subtitle = txt
            break

    qty = extract_quantity(subtitle or "")
    return name, subtitle, qty

def extract_price_str(soup: BeautifulSoup) -> Optional[str]:
    """Return the lowest ``R$`` price found on the page as a ``"99.90"`` string.

    The text of every ``span``, ``div`` and ``p`` is scanned. Returns ``None``
    when none of them contains a price.
    """
    node_texts = (node.get_text(" ", strip=True) for node in soup.select("span, div, p"))
    found = {price for price in map(br_money_to_float_str, node_texts) if price}
    if len(found) == 0:
        return None
    cheapest = min(found, key=float)
    return f"{cheapest}"

# ---------- BENEFÍCIOS ----------
def map_benefits(raw_texts: List[str]) -> List[str]:
    """Map free-text snippets onto canonical benefits, in canonical order.

    A benefit counts as present when its canonical label, or any of its
    synonyms, appears as a whole word in the normalised texts.
    """
    haystack = " " + " ; ".join(_norm(t) for t in raw_texts) + " "

    matched = []
    for canonical in BENEFIT_CANONICAL_ORDER:
        # Match against the synonyms as well as the canonical label itself.
        terms = BENEFIT_SYNONYMS_PT.get(canonical, []) + [canonical]
        for term in terms:
            if re.search(rf"\b{re.escape(_norm(term))}\b", haystack):
                matched.append(canonical)
                break
    return matched

def extract_product_tags_and_benefits(soup: BeautifulSoup) -> (List[str], List[str]):
    """Collect the product tags and derive canonical benefits from the page.

    Tags come from the ``data-testid="product-tags"`` container: first its
    pipe-separated ``data-gtm-product-tags`` attribute, then its button
    captions. Benefit candidates are the bullet paragraphs that follow the
    "resultados comprovados" button. The tags are added to those candidates,
    because a tag such as "antissinais" can itself name a benefit.

    Returns:
        ``(tags, benefits)``: the tags de-duplicated in discovery order, and
        the benefits as mapped by ``map_benefits``.
    """
    bullet_prefix = re.compile(r"^[•\-\u2022]\s*")
    collected_tags = []
    proof_lines = []

    container = soup.find(attrs={"data-testid": "product-tags"})
    if container:
        # 1) data-gtm-product-tags, e.g. "chronos derma|creme antissinais|antissinais"
        gtm_value = container.get("data-gtm-product-tags")
        if gtm_value:
            for piece in gtm_value.split("|"):
                cleaned = _strip(piece)
                if cleaned:
                    collected_tags.append(cleaned)

        # 2) Button captions: prefer the last aria-hidden span, else the whole button.
        for button in container.find_all("button"):
            hidden_spans = button.select("span[aria-hidden='true']")
            source = hidden_spans[-1] if hidden_spans else button
            caption = _strip(source.get_text())
            if caption:
                collected_tags.append(caption)

    # 3) "Resultados comprovados": bullet paragraphs are treated as benefits.
    for button in soup.find_all("button"):
        if _norm(_strip(button.get_text())) != "resultados comprovados":
            continue
        # Inspect at most 12 following siblings for bulleted <p> elements.
        sibling = button.find_next_sibling()
        for _ in range(12):
            if not sibling:
                break
            for paragraph in sibling.select("p"):
                line = _strip(paragraph.get_text(" ", strip=True))
                if not bullet_prefix.match(line):
                    continue
                line = bullet_prefix.sub("", line).strip()
                if 3 <= len(line) <= 240:
                    proof_lines.append(line)
            sibling = sibling.find_next_sibling()
        break  # only the first matching button is used

    # De-duplicate while preserving order.
    unique_tags = list(dict.fromkeys(collected_tags))
    return unique_tags, map_benefits(proof_lines + unique_tags)

# ---------- TIPOS DE PELE (somente via tags) ----------
def map_skin_types_from_tags(tags: List[str]) -> List[str]:
    """Map product tags onto canonical skin types, in canonical order.

    A skin type is selected when its canonical label, or any of its synonyms,
    appears as a whole word in the normalised tags. Falls back to
    ``["todos os tipos"]`` when nothing matches.
    """
    haystack = " " + " ; ".join(_norm(t) for t in tags) + " "

    def _mentioned(term):
        return re.search(rf"\b{re.escape(_norm(term))}\b", haystack)

    detected = {
        canonical
        for canonical, synonyms in SKIN_TYPE_SYNONYMS_PT.items()
        if any(_mentioned(term) for term in synonyms + [canonical])
    }
    ordered = [skin for skin in SKIN_TYPE_CANONICAL_ORDER if skin in detected]
    if ordered:
        return ordered
    return ["todos os tipos"]

# ---------- INGREDIENTES (filtrando contra INGREDIENTES_VALIDOS) ----------
def extract_ingredients_text(soup: BeautifulSoup) -> str:
    """Return the raw ingredient-list text of a product page.

    The function looks for a button labelled "ingredientes" and inspects up to
    12 following siblings. ``span.text-sm`` texts longer than 10 characters are
    preferred; otherwise a sibling's whole text is taken when it looks like an
    ingredient list (six or more commas, or a slash). The longest collected
    block wins. If no button yields anything, the longest list-like text among
    all ``span``, ``p`` and ``div`` elements is returned, or ``""`` when there
    is none.
    """
    def _resembles_list(text):
        return text.count(",") >= 6 or "/" in text

    for button in soup.find_all("button"):
        if _norm(_strip(button.get_text())) != "ingredientes":
            continue
        # On the page the opened block is expected right after the button.
        blocks = []
        node = button.find_next_sibling()
        for _ in range(12):
            if not node:
                break
            small_spans = node.select("span.text-sm")
            if small_spans:
                texts = (_strip(span.get_text(" ", strip=True)) for span in small_spans)
                blocks.extend(t for t in texts if len(t) > 10)
            else:
                # Fallback: take the sibling's whole text when it looks like a list.
                whole = _strip(node.get_text(" ", strip=True))
                if _resembles_list(whole):
                    blocks.append(whole)
            node = node.find_next_sibling()
        if blocks:
            # Pick the longest block (the first one on ties).
            return max(blocks, key=len)

    # Page-wide fallback: the longest list-like text anywhere.
    best = ""
    for element in soup.select("span, p, div"):
        text = _strip(element.get_text(" ", strip=True))
        if _resembles_list(text) and len(text) > len(best):
            best = text
    return best

def tokenize_and_filter_ingredients(raw_text: str) -> List[str]:
    """Split an ingredient list and keep only entries in ``INGREDIENTES_VALIDOS``.

    The text is split on commas and slashes (e.g. ``"AQUA/ WATER/ EAU"``).
    Each token is matched against the normalised valid names, first by exact
    equality and then by containment in either direction.

    Containment is only attempted when both the token and the valid name have
    at least four normalised characters. Before, only the valid name was
    length-checked, so a short or empty token (such as ``"-"``, which
    normalises to ``""``) was "contained" in the first long valid name and
    produced a bogus ingredient.

    Args:
        raw_text: Raw ingredient text as scraped from the page.

    Returns:
        Canonical ingredient names in order of first appearance, without
        duplicates. Returns ``[]`` for empty input.
    """
    if not raw_text:
        return []

    min_fuzzy_len = 4
    # Normalise the valid list once per call.
    valid_norm_map = {_norm(v): v for v in INGREDIENTES_VALIDOS}

    out = []
    seen = set()
    for piece in re.split(r"[,/]", raw_text):
        tok = _strip(piece).strip(".")
        if not tok:
            continue
        key = _norm(tok)
        if not key:
            # Token was only separators or punctuation; nothing to match.
            continue

        matched = valid_norm_map.get(key)
        if matched is None and len(key) >= min_fuzzy_len:
            # Valid name inside the token (suffixes) or token inside the
            # valid name (compact spellings).
            for k_valid, v_canon in valid_norm_map.items():
                if len(k_valid) >= min_fuzzy_len and (k_valid in key or key in k_valid):
                    matched = v_canon
                    break

        if matched:
            k2 = _norm(matched)
            if k2 not in seen:
                seen.add(k2)
                out.append(matched)
    return out

# ---------- Categoria ----------
def classify_category(name: str, subtitle: Optional[str]) -> Optional[str]:
    """Pick a category for the product from hints found in its name and subtitle.

    A category is a hit when any of its hints is a substring of the normalised
    text. Among the hits, the one that comes first in
    ``CATEGORY_CANONICAL_ORDER`` wins. A hit missing from that order is still
    returned as a last resort. Returns ``None`` when there are no hits.
    """
    text = _norm(f"{name} {subtitle or ''}")
    hits = [
        cat
        for cat, hints in CATEGORY_HINTS.items()
        if any(_norm(hint) in text for hint in hints)
    ]
    if not hits:
        return None
    return next((cat for cat in CATEGORY_CANONICAL_ORDER if cat in hits), hits[0])

# ---------- Imagem (prioriza '/Produtos/') ----------
def extract_product_image_url(soup: BeautifulSoup) -> Optional[str]:
    """Choose the best product-image URL on the page.

    Only URLs containing ``/Produtos/`` and none of the blacklisted fragments
    (logos, icons, sprites) are considered. They are gathered from
    ``<picture><source srcset>`` entries and from ``<img>`` elements, using
    ``src``, ``data-src`` or ``data-zoom-src``, or ``srcset`` when none of
    those is set.

    Candidates are ranked by: "natbra-" in the path, "/produtos/" in the path,
    then path length. Two fixes over the earlier version:

    * the "/produtos/" test now uses the lowercase form, because the path is
      lowercased first and the mixed-case literal could never match;
    * ties go to the candidate seen first in the document, not to whichever
      a set happened to yield, so the result is the same on every run.

    Returns:
        The chosen URL as found in the markup, or ``None`` if no candidate
        qualifies.
    """
    blacklist = ("logo", "icone", "icon", "sprite", "favicon", "/icons/")

    def _split_srcset(s: str) -> List[str]:
        # "url 1x, url2 2x" -> ["url", "url2"]
        return [p.strip().split(" ")[0] for p in (s or "").split(",") if p.strip()]

    def _wanted(u: str) -> bool:
        u_low = u.lower()
        return "/Produtos/" in u and not any(b in u_low for b in blacklist)

    candidates: List[str] = []

    # 1) picture/source whose srcset contains '/Produtos/'
    for src in soup.select("picture source[srcset]"):
        candidates.extend(u for u in _split_srcset(src.get("srcset", "")) if _wanted(u))

    # 2) images from the gallery / product-image containers and the like
    for img in soup.select(
        "[data-testid='product-tags'] ~ * img, "
        "[data-testid*='product'] img, "
        "[class*='gallery'] img, "
        "img[pictureclassname], img.object-cover, img.object-contain, img"
    ):
        url = img.get("src") or img.get("data-src") or img.get("data-zoom-src") or ""
        if url:
            if _wanted(url):
                candidates.append(url)
        else:
            candidates.extend(u for u in _split_srcset(img.get("srcset", "")) if _wanted(u))

    if not candidates:
        return None

    def score(u: str) -> tuple:
        path = urlparse(u).path.lower()
        return (1 if "natbra-" in path else 0, 1 if "/produtos/" in path else 0, len(path))

    # dict.fromkeys de-duplicates in discovery order; max() keeps the first
    # of any equally scored candidates.
    unique = list(dict.fromkeys(candidates))
    return max(unique, key=score)

def download_image(img_url: str, product_name: str) -> Optional[str]:
    """Download the product image into ``IMG_DIR`` and return its filename.

    Protocol-relative and site-relative URLs are made absolute first. The
    file is named after the sanitised product name. Its extension comes from
    the URL path, or ``.jpg`` when that extension is not jpg, jpeg, png or webp.

    Returns:
        The bare filename (no directory) on success. ``None`` when the URL is
        empty, the request fails, the status is not 200, the content type is
        not an image, or the payload is under 15000 bytes.
    """
    if not img_url:
        return None

    if img_url.startswith("//"):
        img_url = "https:" + img_url
    elif img_url.startswith("/"):
        img_url = urljoin(BASE_URL, img_url)

    allowed_exts = (".jpg", ".jpeg", ".png", ".webp")
    _, ext = os.path.splitext(urlparse(img_url).path)
    ext = ext.lower()
    if ext not in allowed_exts:
        ext = ".jpg"

    fname = sanitize_filename(product_name) + ext
    fpath = IMG_DIR / fname
    try:
        response = SESSION.get(img_url, timeout=30)
        if response.status_code != 200:
            return None
        content_type = (response.headers.get("Content-Type") or "").lower()
        if not content_type.startswith("image"):
            return None
        if len(response.content) < 15000:  # skip small logos
            return None
        with open(fpath, "wb") as handle:
            handle.write(response.content)
        return fname  # filename only
    except requests.RequestException:
        return None

# ---------- Parse do produto completo ----------
def parse_product(url: str) -> Optional[Dict]:
    """Scrape one product page into the flat record used for the exports.

    Returns ``None`` when the page cannot be fetched, has no name, or the name
    hits an exclusion keyword. List-valued fields are joined with ``"; "``,
    and empty ones become ``None``.
    """
    soup = get_soup(url)
    if not soup:
        return None

    name, subtitle, qty = extract_name_subtitle_qty(soup)
    if not name:
        return None
    if should_exclude(name):
        return None

    def _joined(values):
        return "; ".join(values) if values else None

    price = extract_price_str(soup)  # string such as "99.90"

    # Tags feed both the skin types and, with the proof bullets, the benefits.
    tags, benefits = extract_product_tags_and_benefits(soup)
    # Skin types come from the tags ONLY.
    skin_types = map_skin_types_from_tags(tags)

    # Ingredients: read next to the "ingredientes" button, then filtered.
    ingredients = tokenize_and_filter_ingredients(extract_ingredients_text(soup))

    category = classify_category(name, subtitle)

    # Image: choose the URL, then download it.
    img_filename = download_image(extract_product_image_url(soup), name)

    # === EXACT requested format ===
    return {
        "marca": "natura",
        "nome": name,
        "subtitulo": subtitle or None,
        "categoria": category,
        "quantidade": qty,
        "preco": price,  # string "99.90"
        "beneficios": _joined(benefits),
        "ingredientes": _joined(ingredients),
        "tipo_pele": _joined(skin_types),
        "imagem": img_filename,  # filename only
    }

# -------------------- Main --------------------
def main():
    """Scrape every candidate product and write the JSON and CSV exports.

    Products are de-duplicated by normalised name. An exception raised while
    parsing a page is reported in that URL's status line and does not stop
    the run. There is a 0.25 s pause between pages.
    """
    print("[Natura/Rosto] Coletando URLs pelo sitemap…")
    urls = get_sitemap_product_urls()
    print(f"URLs candidatadas: {len(urls)}")

    results = []
    seen = set()

    def _handle(url):
        # Returns the status label for one URL, recording the item when new.
        item = parse_product(url)
        if not item:
            return "descartado"
        key = _norm(item["nome"])
        if key in seen:
            return "duplicado"
        seen.add(key)
        results.append(item)
        return "ok"

    for position, url in enumerate(urls, start=1):
        try:
            status = _handle(url)
        except Exception as e:
            status = f"erro:{e.__class__.__name__}"
        print(f"[{position}/{len(urls)}] {status} - {url}")
        time.sleep(0.25)

    # Save JSON and CSV in the project root, under the requested names.
    with open(JSON_PATH, "w", encoding="utf-8") as handle:
        json.dump(results, handle, ensure_ascii=False, indent=2)

    pd.DataFrame(results).to_csv(CSV_PATH, index=False, encoding="utf-8-sig")

    print(f"Salvo JSON: {JSON_PATH}")
    print(f"Salvo CSV : {CSV_PATH}")
    print(f"Itens válidos: {len(results)}")

if __name__ == "__main__":
    main()


  from pandas.core import (


[Natura/Rosto] Coletando URLs pelo sitemap…
URLs candidatadas: 276
[1/276] ok - https://www.natura.com.br/p/agua-micelar-demaquilante-suave-chronos-derma-150-ml/NATBRA-133503
[2/276] ok - https://www.natura.com.br/p/balm-redutor-de-rugas-chronos-derma-15-g/NATBRA-169217
[3/276] ok - https://www.natura.com.br/p/balm-redutor-de-rugas-para-olhos-chronos-15-g/NATBRA-111332
[4/276] ok - https://www.natura.com.br/p/base-serum-nude-me-una-30ml/NATBRA-PAI110201
[5/276] duplicado - https://www.natura.com.br/p/base-serum-nude-me-una/NATBRA-110181
[6/276] duplicado - https://www.natura.com.br/p/base-serum-nude-me-una/NATBRA-110182
[7/276] duplicado - https://www.natura.com.br/p/base-serum-nude-me-una/NATBRA-110183
[8/276] duplicado - https://www.natura.com.br/p/base-serum-nude-me-una/NATBRA-110184
[9/276] duplicado - https://www.natura.com.br/p/base-serum-nude-me-una/NATBRA-110185
[10/276] duplicado - https://www.natura.com.br/p/base-serum-nude-me-una/NATBRA-110186
[11/276] duplicado - https://ww