# Beyoung

In [7]:
import os, re, json, time, unicodedata, sys
from pathlib import Path
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import requests
from bs4 import BeautifulSoup

sys.path.append(os.path.abspath("/home/usuario/Área de trabalho/Dados/models"))

from skin import (SKIN_TYPE_CANONICAL_ORDER, SKIN_TYPE_SYNONYMS_PT)
from exclude import (EXCLUDE_KEYWORDS,)
from ingredient import (INGREDIENTES_VALIDOS,)
from benefits import (BENEFIT_SYNONYMS_PT, BENEFIT_CANONICAL_ORDER)
from category import (CATEGORY_CANONICAL_ORDER, CATEGORY_HINTS)


## Configurações Iniciais

In [8]:

BASE_URL = "https://www.beyoung.com.br"
BASE_SKINCARE_URL = "https://www.beyoung.com.br/collections/skincare"

CATEGORY_URLS = {
    "Limpeza":        "https://www.beyoung.com.br/collections/skincare-limpeza",
    "Tratamento":     "https://www.beyoung.com.br/collections/skincare-tratamento",
    "Hidratação":     "https://www.beyoung.com.br/collections/skincare-hidratacao",
    "Proteção Solar": "https://www.beyoung.com.br/collections/protecao-solar",
}

OUTPUT_JSON_PATH = "/home/usuario/Área de trabalho/Dados/Beyoung/beyoung_scraper.json"
IMAGES_DIR = Path("images")
IMAGES_DIR.mkdir(parents=True, exist_ok=True)

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
}

def build_session() -> requests.Session:
    s = requests.Session()
    s.headers.update(HEADERS)
    retry = Retry(
        total=5, backoff_factor=0.7,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD", "OPTIONS"]
    )
    s.mount("https://", HTTPAdapter(max_retries=retry))
    s.mount("http://", HTTPAdapter(max_retries=retry))
    return s

SESSION = build_session()

## Utilitários

In [9]:
def _strip_accents_lower(s: str) -> str:
    if not s:
        return ""
    s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
    s = s.lower()
    s = s.replace("-", " ")
    s = re.sub(r"[^\w\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def sentence_case(s: str) -> str:
    """Primeira letra maiúscula, restante minúsculo (normaliza caixa alta)."""
    if not s:
        return s
    s_low = s.lower()
    return s_low[0].upper() + s_low[1:] if len(s_low) > 1 else s_low.upper()

def slugify(name: str) -> str:
    base = _strip_accents_lower(name)
    base = re.sub(r"[^a-z0-9]+", "-", base).strip("-")
    base = re.sub(r"-{2,}", "-", base)
    return base or "produto"

def http_get(url: str, timeout: int = 30) -> Optional[bytes]:
    try:
        r = SESSION.get(url, timeout=timeout)
        if r.status_code != 200:
            return None
        low = r.text.lower()
        if any(x in low for x in ["please enable cookies", "attention required", "access denied"]):
            return None
        return r.content
    except Exception:
        return None

def soup_from_url(url: str) -> Optional[BeautifulSoup]:
    content = http_get(url)
    if not content:
        return None
    return BeautifulSoup(content, "html.parser")

def should_exclude(name: str) -> bool:
    n = _strip_accents_lower(name)
    for kw in EXCLUDE_KEYWORDS:
        if not kw:
            continue
        if _strip_accents_lower(kw) in n:
            return True
    return False

def normalize_price_text(txt: str) -> Optional[str]:
    if not txt: return None
    t = re.sub(r"\s+", " ", txt).replace("\xa0", " ").strip()
    m = re.search(r"(?:R\$)?\s*(\d{1,3}(?:\.\d{3})*,\d{2})", t)
    if not m:
        m = re.search(r"(?:R\$)?\s*(\d{1,3}(?:\.\d{3})*)\b(?!,)", t)
        if m:
            num = m.group(1).replace(".", "")
            return f"{float(num):.2f}"
    if m:
        num = m.group(1).replace(".", "").replace(",", ".")
        try:
            return f"{float(num):.2f}"
        except ValueError:
            return None
    return None

def safe_join_url(url: str) -> str:
    return "https:" + url if url and url.startswith("//") else url

def infer_img_ext(url: str) -> str:
    if not url:
        return ".jpg"
    path = urlparse(url).path
    ext = os.path.splitext(path)[1].lower()
    if ext in [".jpg", ".jpeg", ".png", ".webp"]:
        return ".jpg" if ext == ".jpeg" else ext
    return ".jpg"


## Características

In [10]:
def download_image(img_url: str, product_name: str) -> Optional[str]:
    if not img_url:
        return None
    try:
        r = SESSION.get(
            img_url,
            headers={
                **HEADERS,
                "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
                "Referer": BASE_URL
            },
            timeout=40
        )
        r.raise_for_status()
        ext = infer_img_ext(img_url)
        base = slugify(product_name)
        dest = IMAGES_DIR / f"{base}{ext}"
        i = 1
        while dest.exists():
            dest = IMAGES_DIR / f"{base}-{i}{ext}"
            i += 1
        dest.write_bytes(r.content)
        return dest.name
    except Exception as e:
        print(f"[download_image] Falha: {img_url} -> {e}")
        return None

#  Benefícios
_norm_benefit_map = {
    canonico: [_strip_accents_lower(x) for x in syns if x]
    for canonico, syns in (BENEFIT_SYNONYMS_PT or {}).items()
}
_benefit_order = {b: i for i, b in enumerate(BENEFIT_CANONICAL_ORDER or [])}

def standardize_benefits(raw_benefits: List[str]) -> str:
    if not raw_benefits:
        return ""
    found = set()
    for txt in raw_benefits:
        n = _strip_accents_lower(txt)
        for canon, pats in _norm_benefit_map.items():
            if any(p and p in n for p in pats):
                found.add(canon)
    ordered = sorted(found, key=lambda x: _benefit_order.get(x, 999))
    return "; ".join(ordered)

#  Ingredientes 
_valid_norm_map = {_strip_accents_lower(v): v for v in INGREDIENTES_VALIDOS}
_valid_norm_list = list(_valid_norm_map.keys())
_valid_order = {v: i for i, v in enumerate(INGREDIENTES_VALIDOS)}  # ordena como na sua lista

def _collect_ativos_block_text(soup: BeautifulSoup) -> str:

    for h in soup.find_all(re.compile(r"h\d", re.I)):
        txt = h.get_text(" ", strip=True)
        if txt and _strip_accents_lower(txt) == "principais ativos":

            parts = []
            sib = h.find_next_sibling()
            # percorre até próximo heading
            while sib and not re.match(r"h\d", sib.name or "", re.I):
                t = sib.get_text(" ", strip=True)
                if t:
                    parts.append(t)
                sib = sib.find_next_sibling()
            if parts:
                return " \n ".join(parts)

    blocks = [el.get_text(" \n ", strip=True) for el in soup.select(".metafield-multi_line_text_field")]
    if blocks:
        return " \n ".join(blocks)

    return soup.get_text(" ", strip=True)

def _ingredients_from_text(block_text: str) -> List[str]:

    if not block_text:
        return []
    nbig = _strip_accents_lower(block_text)
    found_norm = set()
    nbig = re.sub(r"\b\d+[.,]?\d*\s*%\b", " ", nbig)

    for vnorm in _valid_norm_list:
        if vnorm and vnorm in nbig:
            found_norm.add(vnorm)

    ordered = sorted(found_norm, key=lambda x: _valid_order.get(_valid_norm_map[x], 9999))
    return [_valid_norm_map[vn] for vn in ordered]

def extract_active_ingredients(soup: BeautifulSoup) -> str:
    block = _collect_ativos_block_text(soup)
    itens = _ingredients_from_text(block)
    return "; ".join(itens)

# Categorias
def extract_category_product_links(cat_url: str, categoria: str) -> List[Dict]:

    soup = soup_from_url(cat_url)
    if not soup:
        return []

    products = []
    for a in soup.select('a[href*="/products/"]'):
        href = a.get("href", "")
        if "/products/" not in href:
            continue
        full = urljoin(BASE_URL, href)
        name = a.get_text(" ", strip=True) or ""
        if not name:
            img = a.find("img", alt=True)
            if img and img.get("alt"):
                name = img["alt"].strip()
        if not name:
            name = href.rstrip("/").split("/")[-1].replace("-", " ").title()

        if should_exclude(name):
            continue

        products.append({"url": full, "categoria": categoria, "nome_card": name})

    seen = set()
    dedup = []
    for p in products:
        if p["url"] not in seen:
            seen.add(p["url"])
            dedup.append(p)
    return dedup

# Extrações na página de produto
def extract_name(soup: BeautifulSoup) -> Optional[str]:
    for sel in ["h1.product__title", "h1", "meta[property='og:title']", "title"]:
        node = soup.select_one(sel)
        if node:
            txt = node.get("content", "") if node.name == "meta" else node.get_text(" ", strip=True)
            if txt:
                return txt.strip()
    return None

def extract_price(soup: BeautifulSoup) -> Optional[str]:
    candidates = [
        ".price__container .price__current, .price__regular .price-item--regular",
        ".price__container .price-item--regular",
        ".product__price .price",
        ".f-price-item--current",
        ".price, .Price",
    ]
    for sel in candidates:
        node = soup.select_one(sel)
        if node:
            p = normalize_price_text(node.get_text(" ", strip=True))
            if p:
                return p
    p = normalize_price_text(soup.get_text(" ", strip=True))
    return p

def extract_quantity(soup: BeautifulSoup) -> Optional[str]:
    node = soup.select_one('[data-selected-swatch-value="Tamanho"]')
    if node and node.get_text(strip=True):
        return node.get_text(strip=True)
    text = soup.get_text(" ", strip=True)
    m = re.search(r"\b\d+\,?\d*\s*(?:ml|g|mg|kg|l|oz)\b", text, re.I)
    if m:
        return m.group(0)
    return None

def extract_benefits(soup: BeautifulSoup) -> List[str]:
    out = []
    for el in soup.select(".section__content .feature-list__items .feature-item__text"):
        txt = el.get_text(" ", strip=True)
        if txt and len(txt) <= 200:
            out.append(txt)
    if not out:
        for li in soup.select("li"):
            t = li.get_text(" ", strip=True)
            if t and 3 <= len(t) <= 160:
                out.append(t)
    return out

def extract_image_url(soup: BeautifulSoup) -> Optional[str]:
    for img in soup.find_all("img"):
        src = img.get("src") or ""
        if "cdn/shop/files" in src or "cdn.shopify.com" in src:
            return safe_join_url(src.strip())
    meta = soup.find("meta", property="og:image")
    if meta and meta.get("content"):
        return safe_join_url(meta["content"].strip())
    return None


## Execução

In [11]:

def scrape_beyoung() -> List[Dict]:
    print("Coletando links por categoria…")
    links = []
    for categoria, url in CATEGORY_URLS.items():
        items = extract_category_product_links(url, categoria)
        print(f" - {categoria}: {len(items)} links após filtro")
        links.extend(items)
        time.sleep(0.6)

    seen = set()
    final_links = []
    for it in links:
        if it["url"] not in seen:
            seen.add(it["url"])
            final_links.append(it)

    print(f"Total de URLs únicas: {len(final_links)}")
    products = []

    for i, item in enumerate(final_links, 1):
        url = item["url"]; categoria = item["categoria"]
        print(f"[{i}/{len(final_links)}] {url}")
        soup = soup_from_url(url)
        if not soup:
            print("  ! Falha ao carregar página.")
            continue

        nome_raw = extract_name(soup) or item.get("nome_card") or ""
        if not nome_raw:
            print("  ! Nome não encontrado.")
            continue

        nome = sentence_case(nome_raw)

        if should_exclude(nome):
            print("  - Excluído por keywords.")
            continue

        preco = extract_price(soup) or ""
        quantidade = extract_quantity(soup) or ""
        raw_benefits = extract_benefits(soup)
        beneficios = standardize_benefits(raw_benefits)

        ingredientes = extract_active_ingredients(soup)  

        img_url = extract_image_url(soup)
        img_file = download_image(img_url, nome) if img_url else None

        prod = {
            "marca": "beyoung",
            "nome": nome,
            "subtitulo": None,
            "categoria": categoria,
            "quantidade": quantidade or "",
            "preco": preco or "",
            "beneficios": beneficios or "",
            "ingredientes": ingredientes or "",  
            "tipo_pele": "todos os tipos",        
            "imagem": img_file or "",
        }
        products.append(prod)
        print(f"\n Concluído: {nome}")
        time.sleep(0.6)

    return products




## Saída

In [12]:
def save_json(data: List[Dict], path: str):
    Path(os.path.dirname(path)).mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"\nJSON salvo em: {path}  ({len(data)} produtos)")

if __name__ == "__main__":
    data = scrape_beyoung()
    save_json(data, OUTPUT_JSON_PATH)

Coletando links por categoria…
 - Limpeza: 13 links após filtro
 - Tratamento: 16 links após filtro
 - Hidratação: 12 links após filtro
 - Proteção Solar: 15 links após filtro
Total de URLs únicas: 27
[1/27] https://www.beyoung.com.br/products/agua-micelar-hidratante

 Concluído: Água micelar hidratante
[2/27] https://www.beyoung.com.br/products/gentle-cleanser

 Concluído: Gel de limpeza facial suave
[3/27] https://www.beyoung.com.br/products/exfoliant-cleanser

 Concluído: Esfoliante facial smart peeling
[4/27] https://www.beyoung.com.br/products/gel-de-limpeza-facial-com-acido-glicolico-controle-de-oleosidade

 Concluído: Gel de limpeza facial com ácido glicólico (controle de oleosidade)
[5/27] https://www.beyoung.com.br/products/stick-multifuncional-com-cor-fps-80#judgeme_product_reviews

 Concluído: Stick bastão multifuncional com cor fps 80
[6/27] https://www.beyoung.com.br/products/booster#judgeme_product_reviews

 Concluído: Sérum facial booster multifuncional 3 em 1
[7/27] htt