# Creamy


In [83]:
import sys, subprocess, os, re, csv, json, time
from urllib.parse import urljoin, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup
import unicodedata
sys.path.append(os.path.abspath("/home/usuario/√Årea de trabalho/Dados/models"))

from skin import (
    SKIN_TYPE_CANONICAL_ORDER,
    SKIN_TYPE_SYNONYMS_PT,
)

from exclude import (
    EXCLUDE_KEYWORDS,
)

from ingredient import (
    INGREDIENTES_VALIDOS,
)

from benefits import (
    BENEFIT_SYNONYMS_PT,
    BENEFIT_CANONICAL_ORDER,
)

from category import (CATEGORY_CANONICAL_ORDER, CATEGORY_HINTS)

## Informa√ß√µes Iniciais

In [84]:
BASE_URL = "https://www.creamy.com.br/"
LISTING_URL_TEMPLATE = "https://www.creamy.com.br/produtos?page={page}"
MAX_PAGES = 9

OUT_JSON = "creamy_products.json"
OUT_CSV  = "creamy_products.csv"
IMG_DIR  = "images"
os.makedirs(IMG_DIR, exist_ok=True)

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
})

## Utilit√°rios

In [85]:
def strip_accents(s: str) -> str:
    if not isinstance(s, str):
        s = str(s)
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def _strip_accents_lower(s: str) -> str:
    return strip_accents(s or "").lower().strip()

def normalize_space(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def slugify(text: str) -> str:
    text = strip_accents(text.lower())
    text = re.sub(r"[^a-z0-9]+", "-", text)
    text = re.sub(r"-+", "-", text).strip("-")
    return text or "produto"

def get_soup(url, max_retries=3, timeout=25):
    for attempt in range(max_retries):
        try:
            r = SESSION.get(url, timeout=timeout)
            if r.status_code == 200:
                return BeautifulSoup(r.text, "lxml")
            print(f"[WARN] {url} -> status {r.status_code}")
        except Exception as e:
            print(f"[ERROR] {url} -> {e}")
        time.sleep(1.1 * (attempt + 1))
    return None

def looks_excluded(text: str) -> bool:
    """
    Verifica se o texto cont√©m alguma palavra da lista EXCLUDE_KEYWORDS.
    Retorna True se encontrar qualquer palavra de exclus√£o.
    """
    if not text:
        return False
    
    # Lista completa e atualizada de palavras de exclus√£o
    exclude_list = [
        'kit', 'kits', 'combo', 'duo', 'dupla', 'trio', 'rotina', 'corporal', 'corpo', 'hair',
        'cabelo', 'shampoo', 'condicionador', 'body', 'capilar',
        'manguito', 'meia', 'mistery', 'caixa', 'refil', 'caneta', 'geladeira', 'massageador', 
        'pincel', 'pincel', 'adesivo', 'adesivos', 'faixa', 'mini-geladeira', 
        'maquiagem', 'fragrancia', 'fragancia', 'perfume', 'deodorante', 'desodorante'
    ]
    
    # Normaliza o texto para compara√ß√£o (remove acentos e converte para min√∫sculas)
    text_norm = strip_accents(text.lower())
    
    # Verifica cada palavra de exclus√£o
    for keyword in exclude_list:
        if keyword and strip_accents(keyword.lower()) in text_norm:
            return True
    
    # Tamb√©m verifica as palavras do EXCLUDE_KEYWORDS importado (caso esteja atualizado)
    for keyword in EXCLUDE_KEYWORDS:
        if keyword and strip_accents(keyword.lower()) in text_norm:
            return True
    
    return False

def parse_price_to_str(price_text: str) -> str:
    if not price_text:
        return ""
    t = price_text.replace("R$", "").replace("r$", "").strip()
    t = t.replace(" ", "").replace(".", "").replace(",", ".")
    m = re.findall(r"[0-9]+(?:\.[0-9]{1,2})?", t)
    if not m:
        return ""
    try:
        return f"{float(m[0]):.2f}"
    except:
        return ""

def split_list_candidates(text: str):
    if not text:
        return []
    t = text.replace("<br>", ";").replace("<br/>", ";").replace("<br />", ";")
    parts = re.split(r"[;‚Ä¢|/\n,]", t)
    return [normalize_space(p) for p in parts if normalize_space(p)]

def filter_ingredients_creamy(ings_raw):
    allowed_norm = [a.lower() for a in INGREDIENTES_VALIDOS]
    allowed_noacc = [strip_accents(a.lower()) for a in INGREDIENTES_VALIDOS]
    out = []
    for ing in ings_raw:
        ing_l = ing.lower()
        ing_noacc = strip_accents(ing_l)
        match = None
        for a_norm, a_noacc in zip(allowed_norm, allowed_noacc):
            if a_norm in ing_l or a_noacc in ing_noacc:
                match = a_norm
                break
        if match and match not in out:
            out.append(match)
    return "; ".join(out)

def find_all_text(soup, selectors):
    for sel in selectors:
        node = soup.select_one(sel)
        if node and node.get_text(strip=True):
            return node.get_text(" ", strip=True)
    return ""

## Benef√≠cios, Ingredientes

In [86]:
def padroniza_beneficios(textos_beneficios):
    if not textos_beneficios:
        return []
    encontrados = set()
    norm_syn = {
        canonico: [_strip_accents_lower(s) for s in patt_list if s]
        for canonico, patt_list in BENEFIT_SYNONYMS_PT.items()
    }
    for txt in textos_beneficios:
        n = _strip_accents_lower(txt)
        for canonico, padds in norm_syn.items():
            if any(patt in n for patt in padds):
                encontrados.add(canonico)
    if BENEFIT_CANONICAL_ORDER:
        order_map = {name: i for i, name in enumerate(BENEFIT_CANONICAL_ORDER)}
        return sorted(list(encontrados), key=lambda x: order_map.get(x, 999))
    return sorted(list(encontrados))

def extract_benefits(soup):
    items = []
    for ul in soup.select("ul, ol"):
        lis = [normalize_space(li.get_text(" ", strip=True)) for li in ul.select("li")]
        for li in lis:
            if 0 < len(li) <= 120:
                items.append(li)
    for th in soup.select("th, td"):
        txt = normalize_space(th.get_text(" ", strip=True))
        if 0 < len(txt) <= 120:
            items.append(txt)
    uniq, seen = [], set()
    for it in items:
        if it and it not in seen:
            uniq.append(it); seen.add(it)
    pad = padroniza_beneficios(uniq)
    return "; ".join(pad)

def extract_ingredients(soup):
    possible_labels = [
        "ingredientes", "composi√ß√£o", "composicao", "f√≥rmula", "formula", "ingredients", "active ingredients"
    ]
    text_blocks = []
    for el in soup.select("div, section, table, article, ul, ol, p"):
        txt = el.get_text(" ", strip=True)
        low = txt.lower()
        if any(lbl in low for lbl in possible_labels):
            text_blocks.append(txt)
    text_blocks = sorted(set(text_blocks), key=len)
    raw = []
    for block in text_blocks:
        raw.extend(split_list_candidates(block))
    raw = [r for r in raw if len(r) <= 100]
    return filter_ingredients_creamy(raw)

## Pre√ßo e Tipo de pele

In [87]:
def extract_size_from_text(text: str) -> str:
    if not text:
        return ""
    m = re.search(r"(\d+[\.,]?\d*)\s*(ml|g|l)\b", text.lower())
    if m:
        val = m.group(1).replace(",", ".")
        unit = m.group(2).upper()
        if unit == 'L' and not val.endswith('L'):
            return f"{val}L"
        return f"{val}{unit}"
    return ""

def extract_tipos_pele(soup):
    def _strip_accents_lower(s: str) -> str:
        if not s:
            return ""
        s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
        s = s.lower()
        s = s.replace("-", " ")
        s = re.sub(r"[^\w\s]", " ", s)   
        s = re.sub(r"\s+", " ", s).strip()
        return s

    page_text = soup.get_text(" ", strip=True)
    txt_norm = _strip_accents_lower(page_text)

    norm_syn = {
        canonical: [_strip_accents_lower(s) for s in syns if s]
        for canonical, syns in SKIN_TYPE_SYNONYMS_PT.items()
    }

    encontrados = set()
    
    # Primeiro, verifica se tem "todos os tipos" - se tiver, retorna APENAS isso
    todos_synonyms = norm_syn.get("todos os tipos", [])
    for pattern in todos_synonyms:
        if pattern and pattern in txt_norm:
            return "todos os tipos"
    
    # Se n√£o tem "todos os tipos", processa os outros tipos normalmente
    for canonical, patterns in norm_syn.items():
        if canonical == "todos os tipos":
            continue
        if any(p and p in txt_norm for p in patterns):
            encontrados.add(canonical)

    if not encontrados:
        return ""  

    order_map = {name: i for i, name in enumerate(SKIN_TYPE_CANONICAL_ORDER or [])}
    ordered = sorted(encontrados, key=lambda x: order_map.get(x, 10_000))

    return "; ".join(ordered)

## Imagem

In [88]:
def download_image(soup, product_name):
    selectors = [
        "img.vtex-store-components-3-x-productImageTag",
        "img.product-image",
        "img[src*='/arquivos/']",
        "img[src*='cdn']",
    ]
    src = None
    for sel in selectors:
        node = soup.select_one(sel)
        if node and node.get("src"):
            src = node.get("src"); break
        if node and node.get("data-src"):
            src = node.get("data-src"); break
    if not src:
        return ""
    img_url = src if src.startswith("http") else urljoin(BASE_URL, src)
    from urllib.parse import urlparse
    ext = os.path.splitext(urlparse(img_url).path)[1] or ".jpg"
    fname = f"{slugify(product_name)}{ext}"
    fpath = os.path.join(IMG_DIR, fname)
    try:
        r = SESSION.get(img_url, timeout=25)
        if r.status_code == 200 and r.content:
            with open(fpath, "wb") as f:
                f.write(r.content)
            return fname
    except Exception as e:
        print(f"[IMG] Falha ao baixar {img_url}: {e}")
    return ""


## Categoria

In [89]:
_CAT_ORDER_MAP = {c: i for i, c in enumerate(CATEGORY_CANONICAL_ORDER)}

def _norm(s: str) -> str:
    return _strip_accents_lower(s or "")

def classify_category_from_name(name: str, subtitle: str | None = None, desc: str | None = None) -> str:
    """
    Usa CATEGORY_HINTS para mapear nome/subt√≠tulo/descri√ß√£o em uma categoria can√¥nica.
    """
    txt = _norm(f"{name or ''} {subtitle or ''} {desc or ''}")
    hits = []
    for cat, needles in CATEGORY_HINTS.items():
        for needle in needles:
            if _norm(needle) in txt:
                hits.append(cat)
                break
    if not hits:
        return ""   # ou "outros"
    hits.sort(key=lambda c: _CAT_ORDER_MAP.get(c, 10_000))
    return hits[0]

def guess_category(url: str, name: str) -> str:
    """
    Usa apenas CATEGORY_HINTS para classificar categoria, removendo mapeamento manual.
    """
    # Usa a mesma l√≥gica de classify_category_from_name
    return classify_category_from_name(name)

## Produtos

In [90]:
def parse_product_page(url, fallback_category=""):
    soup = get_soup(url)
    if soup is None:
        return None
    
    name = find_all_text(soup, [
        "h1.vtex-store-components-3-x-productName",
        "h1.productName",
        "h1",
        "div.product-name h1",
    ])
    if not name:
        if soup.title and soup.title.string:
            name = soup.title.string.split("|")[0].strip()
    if not name:
        return None
    
    if looks_excluded(name) or looks_excluded(url):
        print(f"[SKIP] Produto exclu√≠do: {name}")
        return None

    subtitle = find_all_text(soup, [
        "span.vtex-product-summary-2-x-description-short div",
        "span.vtex-product-summary-2-x-description-short",
        "div.vtex-rich-text-0-x-container p",
        "div.productDescription",
        "div.product-brief",
    ])
    subtitle = subtitle if (subtitle and len(subtitle) <= 220) else ""
    
    # Corrigida a extra√ß√£o de pre√ßo para capturar os novos seletores
    price_text = find_all_text(soup, [
        "p.priceCustom__sellingPrice span",  # Novo seletor espec√≠fico
        "span.vtex-product-price-1-x-sellingPriceValue",
        "span.selling-price",
        "span.price",
    ])
    price = parse_price_to_str(price_text)
    
    beneficios = extract_benefits(soup)
    ingredientes = extract_ingredients(soup)
    tipos_pele = extract_tipos_pele(soup)
    
    size = extract_size_from_text(name)
    if not size:
        size = extract_size_from_text(subtitle)
    if not size:
        details_txt = find_all_text(soup, [
            "div.vtex-store-components-3-x-productDescriptionText",
            "div.productDescription",
            "section#descricao",
        ])
        size = extract_size_from_text(details_txt)
    
    cat_by_hints = classify_category_from_name(name, subtitle)
    categoria = cat_by_hints or fallback_category or guess_category(url, name)
    
    img_name = download_image(soup, name)
    
    return {
        "marca": "creamy",
        "nome": name.strip(),
        "subtitulo": subtitle if subtitle else "",
        "categoria": categoria,
        "preco": price if price else "",
        "quantidade": size if size else "",
        "beneficios": beneficios.lower() if beneficios else "",
        "ingredientes": ingredientes.lower() if ingredientes else "",
        "tipos_pele": tipos_pele,
        "imagem": img_name,
    }

def listing_get_product_links(page_url: str):
    soup = get_soup(page_url)
    if soup is None:
        return []
    links = set()
    for a in soup.select("a[href]"):
        href = a.get("href")
        if not href:
            continue
        full = href if href.startswith("http") else urljoin(BASE_URL, href)
        if re.search(r"/p($|\?)", full):
            links.add(full)
    return sorted(links)

In [91]:
def run_scraper():
    visited = set()
    items = []
    for page in range(1, MAX_PAGES+1):
        url = LISTING_URL_TEMPLATE.format(page=page)
        print(f"[LIST] {url}")
        prod_links = listing_get_product_links(url)
        print(f"  - {len(prod_links)} links")
        for purl in prod_links:
            if purl in visited or looks_excluded(purl):
                continue
            item = parse_product_page(purl)
            if item:
                visited.add(purl)
                items.append(item)
                print(f"  [+] {item['nome']} :: {item['preco']}")
            time.sleep(0.6)
    return items

def save_data(items):
    """Salva apenas dados em JSON - CSV removido conforme solicitado"""
    cols = ["marca", "nome", "subtitulo", "categoria", "quantidade", "preco", "beneficios", "ingredientes", "tipo_pele", "imagem"]
    clean = [{k: it.get(k, "") for k in cols} for it in items]
    
    with open("creamy_products.json", "w", encoding="utf-8") as f:
        json.dump(clean, f, ensure_ascii=False, indent=2)
    print(f"[OK] JSON salvo em creamy_products.json ({len(clean)} itens)")

# Executar o scraper
if __name__ == "__main__":
    items = run_scraper()
    print(f"Total coletado: {len(items)}")
    save_data(items)
    if items:
        print("Pr√©via do primeiro item:")
        print(json.dumps(items[0], ensure_ascii=False, indent=2))

[LIST] https://www.creamy.com.br/produtos?page=1
  - 10 links
  - 10 links
  [+] Creme Retexturizador - √Åcido Glic√≥lico :: 84.20
  [+] Creme Retexturizador - √Åcido Glic√≥lico :: 84.20
  [+] S√©rum Renovador Suave - √Åcido L√°tico :: 73.67
  [+] S√©rum Renovador Suave - √Åcido L√°tico :: 73.67
  [+] Gel Clareador Antiacne - √Åcido Mand√©lico :: 84.20
  [+] Gel Clareador Antiacne - √Åcido Mand√©lico :: 84.20
  [+] Gel-creme Hidratante Calmante - Calming Cream :: 52.62
  [+] Gel-creme Hidratante Calmante - Calming Cream :: 52.62
  [+] Creme Clareador para Olhos - Eye Cream :: 136.83
  [+] Creme Clareador para Olhos - Eye Cream :: 136.83
  [+] Gel de Limpeza :: 63.15
  [+] Gel de Limpeza :: 63.15
  [+] Protetor  Solar FPS 60 :: 63.15
  [+] Protetor  Solar FPS 60 :: 63.15
  [+] Retinol - Creme Corretivo Anti sinais :: 105.25
  [+] Retinol - Creme Corretivo Anti sinais :: 105.25
  [+] S√©rum Antioxidante Clareador - Vitamina C Gold :: 126.31
  [+] S√©rum Antioxidante Clareador - Vitamina 

## Fun√ß√£o Opcional: Converter JSON para CSV

Use a fun√ß√£o abaixo apenas quando precisar gerar um arquivo CSV a partir do JSON j√° salvo.

In [3]:
import csv

def json_to_csv(json_file="alterado.json", csv_file="creamy_products.csv"):
    """
    Converte o arquivo JSON salvo para CSV.
    Uso: 
    - json_to_csv()  # usa o arquivo padr√£o
    - json_to_csv("meu_arquivo.json", "meu_arquivo.csv")  # usa arquivo personalizado
    """
    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        
        if not data:
            print(f"Nenhum dado encontrado no arquivo {json_file}")
            return
        
        cols = ["marca", "nome", "subtitulo", "categoria", "quantidade", "preco", "beneficios", "ingredientes", "tipo_pele", "imagem"]
        
        with open(csv_file, "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=cols)
            writer.writeheader()
            for row in data:
                # Converte None para string vazia no CSV
                csv_row = {k: (row.get(k) or "") for k in cols}
                writer.writerow(csv_row)
        
        print(f"‚úÖ CSV gerado: {csv_file} ({len(data)} linhas)")
        print(f"üìÅ A partir do JSON: {json_file}")
        
    except FileNotFoundError:
        print(f"‚ùå Arquivo {json_file} n√£o encontrado!")
        print("üìÇ Arquivos JSON dispon√≠veis na pasta:")
        import glob
        json_files = glob.glob("*.json")
        if json_files:
            for f in json_files:
                print(f"   - {f}")
        else:
            print("   Nenhum arquivo .json encontrado")
    except Exception as e:
        print(f"‚ùå Erro ao converter JSON para CSV: {e}")


json_to_csv("alterado.json")  # seu arquivo personalizado

‚ùå Erro ao converter JSON para CSV: name 'json' is not defined
