# Beyoung

In [12]:
import re
import csv
import time, sys, os, unicodedata, json
from typing import List, Dict, Optional
from urllib.parse import urljoin, urlparse

import requests
from selectolax.parser import HTMLParser

sys.path.append(os.path.abspath("/home/usuario/Área de trabalho/Dados/models"))

from skin import (
    SKIN_TYPE_CANONICAL_ORDER,
    SKIN_TYPE_SYNONYMS_PT,
)

from exclude import EXCLUDE_KEYWORDS
from ingredient import INGREDIENTES_VALIDOS
from benefits import BENEFIT_SYNONYMS_PT, BENEFIT_CANONICAL_ORDER

from category import CATEGORY_HINTS, CATEGORY_CANONICAL_ORDER

## Informações Iniciais

In [13]:
# Site root; relative links are resolved against it with urljoin().
BASE = "https://www.beyoung.com.br"
# Collection listing page that paginate_collection() crawls.
COLLECTION_URL = "https://www.beyoung.com.br/collections/skincare"
# Value written to the "marca" field of every scraped record.
SITE_LABEL = "beyoung"
# Output files, relative to the notebook's working directory.
OUT_CSV = "beyoung_skincare.csv"
OUT_JSON = "beyoung_skincare.json"
# Directory that receives downloaded product images.
IMAGES_DIR = "images"   
# Request headers sent with every requests.get() call in this notebook.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept-Language": "pt-BR,pt;q=0.9,en;q=0.8"
}

# Create the image directory up front; exist_ok makes re-running this cell safe.
os.makedirs(IMAGES_DIR, exist_ok=True)

## Utilitários

In [14]:

def get_html(url: str) -> Optional[HTMLParser]:
    """Fetch ``url`` and return its parsed HTML, or ``None`` on failure.

    This stays best-effort: any exception and any non-OK HTTP status yields
    ``None`` so a single bad page does not abort the crawl. Unlike a silent
    swallow, every failure is reported on stderr so dropped pages are visible.

    Args:
        url: Absolute URL to download.

    Returns:
        An ``HTMLParser`` built from the response body, or ``None``.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        if resp.ok:
            return HTMLParser(resp.text)
        print(f"[aviso] HTTP {resp.status_code} ao acessar {url}", file=sys.stderr)
    except Exception as exc:
        # Deliberately broad: callers rely on None instead of an exception.
        print(f"[aviso] falha ao acessar {url}: {exc}", file=sys.stderr)
    return None

def text(node) -> str:
    """Return the stripped text content of ``node``, or ``""`` for a falsy node."""
    if not node:
        return ""
    return node.text().strip()

def norm_spaces(s: str) -> str:
    """Collapse every whitespace run in ``s`` to one space and trim the ends.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not s:
        return ""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def strip_accents(s: str) -> str:
    """Remove combining marks (accents) from ``s``.

    The string is NFD-decomposed and every character in Unicode category
    ``Mn`` is dropped. Falsy input yields ``""``.
    """
    if not s:
        return ""
    decomposed = unicodedata.normalize("NFD", s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)

def norm_for_match(s: str) -> str:
    """Normalize ``s`` for substring matching.

    Whitespace is collapsed, the text is lower-cased, then accents are removed.
    """
    lowered = norm_spaces(s).lower()
    return strip_accents(lowered)

def price_to_float(s: str) -> Optional[float]:
    """Parse the first Brazilian-formatted price found in ``s``.

    Accepts an integer part with or without ``.`` thousands separators and an
    optional two-digit ``,`` decimal part, e.g. ``"R$ 1.234,56"``,
    ``"R$ 1234,56"``, ``"R$ 59"``.

    Args:
        s: Raw price text; may contain a currency symbol, NBSPs or newlines.

    Returns:
        The price as a float, or ``None`` when ``s`` is empty or has no digits.
    """
    if not s:
        return None
    cleaned = s.replace("\xa0", " ").replace("\n", " ")
    # The grouped branch requires at least one ".ddd" group. With "*" it
    # matched the first three digits of an ungrouped number ("1234,56" -> "123")
    # and the plain "\d+" branch was never tried.
    m = re.search(r"(\d{1,3}(?:\.\d{3})+|\d+)(?:,(\d{2}))?", cleaned)
    if not m:
        return None
    inteiro = m.group(1).replace(".", "")
    centavos = m.group(2) or "00"
    # Both parts are digit-only by construction, so float() cannot fail here.
    return float(f"{inteiro}.{centavos}")

def has_excluded_keyword(name: str) -> bool:
    """Return True when ``name`` contains any entry of ``EXCLUDE_KEYWORDS``.

    Both sides are compared after ``norm_for_match``. Falsy keywords are skipped.
    """
    haystack = norm_for_match(name)
    return any(
        norm_for_match(keyword) in haystack
        for keyword in EXCLUDE_KEYWORDS
        if keyword
    )

def guess_quantity(name: str, fallback: str = "") -> str:
    """Extract a size such as ``"30ml"`` or ``"1,5L"`` from a product name.

    The number may carry a ``,`` or ``.`` decimal part and is followed by one
    of the units ml, g, kg or l (case-insensitive). The unit is returned in the
    casing found in ``name``, with no space between number and unit.

    Args:
        name: Product name to search.
        fallback: Value returned (stripped) when no quantity is found.

    Returns:
        The first quantity found in ``name``, otherwise ``fallback.strip()``.
    """
    # One pattern with an optional decimal part. Trying a bare-integer pattern
    # first would cut "1,5 L" down to "5L".
    m = re.search(
        r"(\d+(?:[.,]\d+)?)\s*(ml|g|kg|l)\b",
        name or "",
        flags=re.IGNORECASE,
    )
    if m:
        return f"{m.group(1)}{m.group(2)}"
    return fallback.strip()

## Imagem

In [15]:

def pick_best_from_srcset(srcset: str) -> Optional[str]:
    """Pick the preferred image URL from an HTML ``srcset`` attribute value.

    Candidates with a width descriptor (``"url 800w"``) compete on width and
    the widest one wins. If none has a width descriptor, the first candidate's
    URL is returned. Only the URL is returned, never the descriptor.

    Args:
        srcset: Comma-separated candidate list; ``None`` or ``""`` is allowed.

    Returns:
        The chosen URL, or ``None`` when ``srcset`` has no candidates.
    """
    best_url, best_w = None, -1
    fallback_url = None
    for candidate in (srcset or "").split(","):
        tokens = candidate.split()
        if not tokens:
            continue
        url = tokens[0]
        descriptor = tokens[1] if len(tokens) > 1 else ""
        width = re.fullmatch(r"(\d+)w", descriptor)
        if width:
            w = int(width.group(1))
            if w > best_w:
                best_url, best_w = url, w
        elif fallback_url is None:
            # Density ("2x") or bare candidate: keep just the URL token so the
            # descriptor does not end up inside the returned URL.
            fallback_url = url
    return best_url if best_url is not None else fallback_url

def abs_url(href: str) -> str:
    """Turn ``href`` into an absolute URL.

    Empty input yields ``""``. Protocol-relative links (``//host/...``) get an
    ``https:`` scheme, links starting with ``http`` are returned unchanged, and
    anything else is joined onto ``BASE``.
    """
    if not href:
        return ""
    if href.startswith("//"):
        return f"https:{href}"
    return href if href.startswith("http") else urljoin(BASE, href)

def infer_ext_from_url(u: str) -> str:
    """Return the image file extension implied by the path of URL ``u``.

    ``.jpeg`` is folded into ``.jpg``. Recognized extensions (jpg, png, webp,
    gif) are returned lower-cased. Anything else falls back to ``.jpg``.
    """
    _, suffix = os.path.splitext(urlparse(u).path)
    suffix = suffix.lower()
    if suffix == ".jpeg":
        return ".jpg"
    if suffix in {".jpg", ".png", ".webp", ".gif"}:
        return suffix
    return ".jpg"

def sanitize_filename(name: str) -> str:
    """Build a filesystem-friendly slug from ``name``.

    The name goes through ``norm_for_match``, then every run of characters
    outside ``a-z0-9`` becomes one hyphen and edge hyphens are trimmed.
    Returns ``"produto"`` when nothing is left.
    """
    # Runs are replaced as a whole, so no consecutive hyphens can remain and a
    # separate "collapse repeated hyphens" pass would change nothing.
    slug = re.sub(r"[^a-z0-9]+", "-", norm_for_match(name)).strip("-")
    if not slug:
        return "produto"
    return slug

def download_image(image_url: str, product_name: str) -> Optional[str]:
    """Download ``image_url`` into ``IMAGES_DIR`` under a name derived from the product.

    The filename is ``<slug><ext>``. If that file already exists, ``-1``,
    ``-2``, ... is appended to the slug until an unused name is found.

    Args:
        image_url: Absolute URL of the image; falsy values are skipped.
        product_name: Product name used to build the filename slug.

    Returns:
        The filename (not the full path) of the saved image, or ``None`` when
        ``image_url`` is falsy or any step raises.
    """
    if not image_url:
        return None
    try:
        response = requests.get(image_url, headers=HEADERS, timeout=40)
        response.raise_for_status()

        suffix = infer_ext_from_url(image_url)
        stem = sanitize_filename(product_name)

        # Probe "<stem><ext>", then "<stem>-1<ext>", "<stem>-2<ext>", ...
        filename = f"{stem}{suffix}"
        attempt = 0
        while os.path.exists(os.path.join(IMAGES_DIR, filename)):
            attempt += 1
            filename = f"{stem}-{attempt}{suffix}"

        with open(os.path.join(IMAGES_DIR, filename), "wb") as out:
            out.write(response.content)
        return filename
    except Exception:
        # Best-effort: a failed image must not discard the product record.
        return None

def extract_main_image_url(doc: HTMLParser) -> Optional[str]:
    """Find the main product image URL of a product page.

    The ``og:image`` meta tag is preferred. Otherwise ``<img>`` elements are
    scanned, product-media selectors first and any ``img`` last. For each
    element the best ``srcset`` candidate is considered before ``src``.

    Returns:
        An absolute image URL, or ``None`` if the page offers none.
    """
    og_tag = doc.css_first('meta[property="og:image"]')
    if og_tag:
        og_content = og_tag.attributes.get("content", "")
        if og_content:
            return abs_url(og_content)

    for selector in ("img.product__media", "img.product__image", "img"):
        for img in doc.css(selector):
            attrs = img.attributes
            srcset = attrs.get("srcset", "")
            best = pick_best_from_srcset(srcset) if srcset else None
            # Same precedence as collecting everything first: srcset, then src.
            for candidate in (best, attrs.get("src", "")):
                if not candidate:
                    continue
                resolved = abs_url(candidate)
                if resolved:
                    return resolved
    return None

## Coleção e Benefícios

In [16]:
def extract_product_urls_from_collection(doc: HTMLParser) -> List[str]:
    """Return the sorted, de-duplicated product URLs linked from a collection page.

    Every ``<a>`` whose ``href`` contains ``/products/`` is kept and resolved
    against ``BASE``. One pass over all anchors is enough: class-specific
    selectors such as ``a.full-unstyled-link`` or ``a.card__heading`` can only
    match a subset of what ``a`` matches, so scanning them separately adds
    nothing to the resulting set.

    Args:
        doc: Parsed collection page.

    Returns:
        Absolute product URLs in ascending order.
    """
    hrefs = (a.attributes.get("href", "") for a in doc.css("a"))
    urls = {
        urljoin(BASE, href)
        for href in hrefs
        # href can be None for a valueless attribute; the truthiness test covers it.
        if href and "/products/" in href
    }
    return sorted(urls)

def collect_benefits_text(doc: HTMLParser) -> str:
    """Gather the descriptive text of a product page for benefit matching.

    Text is taken from description/accordion containers and from every
    ``section``/``article``. Only snippets longer than 40 characters (after
    whitespace normalization) are kept. The selectors overlap, since one node
    can match ``.product__description``, ``.product__description.rte`` and
    ``.rte``, so identical snippets are added only once instead of inflating
    the text that is scanned later.

    Returns:
        The snippets joined by spaces, or the normalized text of ``<body>``
        when no snippet qualifies (``""`` if there is no body).
    """
    selectors = (
        ".product__description", ".product__description.rte", ".rte",
        ".product__accordion", ".accordion__content", ".product__text",
        "section, article",
    )
    seen = set()
    parts = []
    for sel in selectors:
        for n in doc.css(sel):
            t = norm_spaces(n.text())
            if len(t) > 40 and t not in seen:
                seen.add(t)
                parts.append(t)
    joined = " ".join(parts)
    if not joined:
        joined = norm_spaces(doc.body.text() if doc.body else "")
    return joined

def classify_benefits(doc: HTMLParser) -> str:
    """Return the canonical benefits mentioned on a product page.

    A benefit is selected when any of its non-empty synonyms in
    ``BENEFIT_SYNONYMS_PT`` appears (after ``norm_for_match``) in the page text.

    Returns:
        Benefits joined by ``", "`` in ``BENEFIT_CANONICAL_ORDER`` order, or
        ``""`` when none match.
    """
    haystack = norm_for_match(collect_benefits_text(doc))
    matched = {
        canonical
        for canonical, synonyms in BENEFIT_SYNONYMS_PT.items()
        if any(syn and norm_for_match(syn) in haystack for syn in synonyms)
    }
    return ", ".join(b for b in BENEFIT_CANONICAL_ORDER if b in matched)

## Ingredientes

In [17]:

def collect_ingredients_text(doc: HTMLParser) -> str:
    """Collect the text that follows ingredient/composition headings.

    For every ``strong``/``b``/``h1``-``h3`` whose upper-cased text contains
    one of the anchor words, the text of up to 12 nodes reached through
    ``.next`` is gathered. If nothing is gathered, the whole ``<body>`` text is
    used instead.

    Returns:
        The gathered texts joined by spaces (whitespace-normalized).
    """
    anchors = ("COMPOSIÇÃO", "COMPOSICAO", "INGREDIENTES", "ATIVOS", "PRINCIPAIS ATIVOS")
    max_hops = 12
    collected = []
    for heading in doc.css("strong, b, h1, h2, h3"):
        title = norm_spaces(heading.text()).upper()
        if not any(anchor in title for anchor in anchors):
            continue
        sibling, steps = heading, 0
        while sibling and steps < max_hops:
            sibling = sibling.next
            if not sibling:
                break
            try:
                # Only nodes exposing a ``text`` attribute are read.
                if getattr(sibling, "text", None):
                    content = norm_spaces(sibling.text())
                    if content:
                        collected.append(content)
            except Exception:
                # A node that cannot be read is skipped; the walk continues.
                pass
            steps += 1
    if collected:
        return " ".join(collected)
    # Nothing found near a heading: fall back to the whole page body.
    return norm_spaces(doc.body.text() if doc.body else "")

def filter_ingredients(doc: HTMLParser) -> str:
    """Return the known ingredients mentioned on a product page.

    Each entry of ``INGREDIENTES_VALIDOS`` is looked up, after
    ``norm_for_match``, as a substring of the page's ingredient text.

    Returns:
        Matching ingredients joined by ``", "``, sorted case- and
        accent-insensitively, or ``""`` when none match.
    """
    haystack = norm_for_match(collect_ingredients_text(doc))
    matched = {
        ingredient
        for ingredient in INGREDIENTES_VALIDOS
        if norm_for_match(ingredient) in haystack
    }
    ordered = sorted(matched, key=lambda s: strip_accents(s).lower())
    return ", ".join(ordered)


## Tipos de pele

In [18]:

def collect_skin_text(doc: HTMLParser) -> str:
    """Collect the text that follows skin-type headings.

    For every ``strong``/``b``/``h1``-``h3`` whose upper-cased text contains
    one of the anchor phrases, the text of up to 10 nodes reached through
    ``.next`` is gathered. If nothing is gathered, the whole ``<body>`` text is
    used instead.

    Returns:
        The gathered texts joined by spaces (whitespace-normalized).
    """
    anchors = (
        "PARA QUAIS TIPOS DE PELE","TIPO DE PELE","TIPOS DE PELE",
        "PELE OLEOSA","PELE MISTA","PELE SECA","PELE SENSÍVEL","PELE SENSIVEL",
        "TODOS OS TIPOS"
    )
    max_hops = 10
    collected = []
    for heading in doc.css("strong, b, h1, h2, h3"):
        title = norm_spaces(heading.text()).upper()
        if not any(anchor in title for anchor in anchors):
            continue
        sibling, steps = heading, 0
        while sibling and steps < max_hops:
            sibling = sibling.next
            if not sibling:
                break
            try:
                # Only nodes exposing a ``text`` attribute are read.
                if getattr(sibling, "text", None):
                    content = norm_spaces(sibling.text())
                    if content:
                        collected.append(content)
            except Exception:
                # A node that cannot be read is skipped; the walk continues.
                pass
            steps += 1
    if collected:
        return " ".join(collected)
    # Nothing found near a heading: fall back to the whole page body.
    return norm_spaces(doc.body.text() if doc.body else "")

def classify_skin_types(doc: HTMLParser) -> str:
    """Return the canonical skin types mentioned on a product page.

    A skin type is selected when any of its non-empty synonyms in
    ``SKIN_TYPE_SYNONYMS_PT`` appears (after ``norm_for_match``) in the page's
    skin-type text.

    Returns:
        Skin types joined by ``", "`` in ``SKIN_TYPE_CANONICAL_ORDER`` order,
        or ``""`` when none match.
    """
    haystack = norm_for_match(collect_skin_text(doc))
    matched = {
        canonical
        for canonical, synonyms in SKIN_TYPE_SYNONYMS_PT.items()
        if any(syn and norm_for_match(syn) in haystack for syn in synonyms)
    }
    return ", ".join(k for k in SKIN_TYPE_CANONICAL_ORDER if k in matched)

## Categoria

In [19]:

# Rank of each category in CATEGORY_CANONICAL_ORDER; a lower rank wins a tie-break.
_CATEGORY_ORDER_MAP = {name: i for i, name in enumerate(CATEGORY_CANONICAL_ORDER)}

def classify_category_from_name(name: str, subtitle: Optional[str] = None) -> str:
    """Infer the product category from its name and optional subtitle.

    A category is a candidate when any of its non-empty hints in
    ``CATEGORY_HINTS`` appears (after ``norm_for_match``) in
    ``"<name> <subtitle>"``. With several candidates, the one ranked first in
    ``CATEGORY_CANONICAL_ORDER`` is chosen. Categories missing from that order
    rank last, and ties keep ``CATEGORY_HINTS`` iteration order.

    Args:
        name: Product name.
        subtitle: Optional product subtitle. Annotated with ``Optional[str]``
            like the rest of the file, because ``str | None`` is evaluated at
            definition time and fails on Python versions before 3.10.

    Returns:
        The chosen category, or ``""`` when there are no hints or no match.
    """
    if not CATEGORY_HINTS:
        return ""
    txt = norm_for_match(f"{name or ''} {subtitle or ''}")
    hits = [
        cat
        for cat, needles in CATEGORY_HINTS.items()
        if any(n and norm_for_match(n) in txt for n in needles)
    ]
    if not hits:
        return ""
    # min() returns the first minimal item, the same winner a stable sort puts first.
    return min(hits, key=lambda c: _CATEGORY_ORDER_MAP.get(c, 10_000))


## Extração de dados do produto

In [20]:
def extract_product_data(url: str) -> Optional[Dict]:
    """Scrape one product page into a flat record.

    Args:
        url: Absolute URL of the product page.

    Returns:
        A dict with the keys ``marca``, ``nome``, ``subtitulo``, ``categoria``,
        ``quantidade``, ``preco``, ``beneficios``, ``ingredientes``,
        ``tipo_pele`` and ``imagem``. ``None`` is returned when the page cannot
        be fetched, has no name, or the name contains an excluded keyword.
    """
    doc = get_html(url)
    if not doc:
        return None

    # Name: a page without one, or with an excluded one, is dropped right away.
    # The lookups below have no side effects, so exiting here changes nothing else.
    name = text(
        doc.css_first("h1.product__title") or doc.css_first("h1.product__title.hd3")
    )
    if not name or has_excluded_keyword(name):
        return None

    # Subtitle
    subtitle = text(
        doc.css_first("p.product__text.inline-richtext")
        or doc.css_first(".product__subtitle, .product__text")
    )

    # Price: the first selector that yields non-blank text wins.
    raw_price = ""
    for selector in (
        "span.f-price-item.f-price-item--sale",
        "span.price-item.price-item--sale",
        "span.price-item.price-item--regular",
        "span.money",
        "[data-product-price] .price-item--sale",
        ".price__container .price-item--sale",
        ".price__regular .price-item--regular",
    ):
        price_node = doc.css_first(selector)
        candidate = price_node.text().strip() if price_node else ""
        if candidate:
            raw_price = candidate
            break
    price = price_to_float(raw_price)

    # Quantity: size swatch / variant label first, then a guess from the name.
    qty_node = (
        doc.css_first('[data-selected-swatch-value="Tamanho"]')
        or doc.css_first("label[for*='template'][for*='main'][for*='-0']")
    )
    quantity = text(qty_node) or guess_quantity(name, "")

    # Main image (downloaded only when a URL was found).
    image_url = extract_main_image_url(doc)
    image_filename = download_image(image_url, name) if image_url else None

    record = {
        "marca": SITE_LABEL,
        "nome": name,
        "subtitulo": subtitle,
        "categoria": classify_category_from_name(name, subtitle),
        "quantidade": quantity,
        "preco": "" if price is None else f"{price:.2f}",
        "beneficios": classify_benefits(doc),
        "ingredientes": filter_ingredients(doc),
        "tipo_pele": classify_skin_types(doc),
        "imagem": image_filename or "",
    }
    return record

## Paginação e exportação

In [21]:
def paginate_collection(base_url: str, sleep_s: float = 0.8, max_pages: int = 50) -> List[str]:
    """Walk the ``?page=N`` pages of a collection and gather product URLs.

    Crawling stops at the first page that cannot be fetched, that has no
    product links, or that contributes no URL not already seen. The last rule
    matters because product links are taken from every ``<a>`` on the page, so
    a page past the end of the collection can still carry links from
    navigation or recommendations and would otherwise keep the loop going up
    to ``max_pages``.

    Args:
        base_url: Collection URL; may already contain a query string.
        sleep_s: Pause, in seconds, between consecutive page requests.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        All product URLs found, sorted and without duplicates.
    """
    all_urls = set()
    # Append to an existing query string instead of adding a second "?".
    separator = "&" if "?" in base_url else "?"
    for page in range(1, max_pages + 1):
        doc = get_html(f"{base_url}{separator}page={page}")
        if not doc:
            break
        urls = extract_product_urls_from_collection(doc)
        if not urls:
            if page == 1:
                # The paginated URL gave nothing: try the bare collection URL once.
                doc0 = get_html(base_url)
                if doc0:
                    all_urls.update(extract_product_urls_from_collection(doc0))
            break
        new_urls = set(urls) - all_urls
        if not new_urls:
            # Nothing new on this page: treat it as the end of the collection.
            break
        all_urls |= new_urls
        time.sleep(sleep_s)
    return sorted(all_urls)

def write_csv(rows: List[Dict], path: str):
    """Write ``rows`` to ``path`` as UTF-8 CSV with a fixed column order.

    Keys missing from a row are written as empty strings; keys outside the
    fixed column list are ignored.
    """
    fieldnames = [
        "marca","nome","subtitulo", "categoria", "quantidade", "preco",
        "beneficios","ingredientes","tipo_pele","imagem"
    ]
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {column: record.get(column, "") for column in fieldnames}
            for record in rows
        )

def write_json(rows: List[Dict], path: str):
    """Write ``rows`` to ``path`` as UTF-8 JSON with 2-space indentation.

    Non-ASCII characters are written as-is rather than escaped.
    """
    payload = json.dumps(rows, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(payload)

## Execução

In [22]:

# 1) Discover every product URL listed in the collection.
all_product_urls = paginate_collection(COLLECTION_URL)
print(f"URLs de produtos encontrados na coleção: {len(all_product_urls)}")

# 2) Scrape each product page; pages that return None are left out.
rows = []
for i, purl in enumerate(all_product_urls, 1):
    data = extract_product_data(purl)
    if data:
        rows.append(data)
    # Throttle requests: 1.0 s after every third product, 0.6 s otherwise.
    time.sleep(0.6 if i % 3 else 1.0)

# 3) Persist the records in both output formats.
write_csv(rows, OUT_CSV)
write_json(rows, OUT_JSON)

# 4) Summary of the run.
print(f"Registros válidos coletados: {len(rows)}")
print(f"CSV salvo em:  {OUT_CSV}")
print(f"JSON salvo em: {OUT_JSON}")
print(f"Imagens salvas em: {os.path.abspath(IMAGES_DIR)}")


URLs de produtos encontrados na coleção: 25
Registros válidos coletados: 23
CSV salvo em:  beyoung_skincare.csv
JSON salvo em: beyoung_skincare.json
Imagens salvas em: /home/usuario/Área de trabalho/Dados/Beyoung/images
