# Creamy


In [3]:
import sys, subprocess, os, re, csv, json, time
from urllib.parse import urljoin, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup
import unicodedata
sys.path.append(os.path.abspath("./../models"))

from skin import (
    SKIN_TYPE_CANONICAL_ORDER,
    SKIN_TYPE_SYNONYMS_PT,
)

from exclude import (
    EXCLUDE_KEYWORDS,
)

from ingredient import (
    INGREDIENTES_VALIDOS,
)

from benefits import (
    BENEFIT_SYNONYMS_PT,
    BENEFIT_CANONICAL_ORDER,
)

from category import (CATEGORY_CANONICAL_ORDER, CATEGORY_HINTS)

### Configurações Iniciais

In [20]:
# Site root; relative links found while scraping are joined against it.
BASE_URL = "https://www.creamy.com.br/"
# Paginated catalogue URL; "{page}" is filled in with the page number.
LISTING_URL_TEMPLATE = "https://www.creamy.com.br/produtos?page={page}"
# Highest listing page number to request.
MAX_PAGES = 9

# Output artefact names and the folder that receives downloaded images.
OUT_JSON, OUT_CSV = "creamy_products.json", "creamy_products.csv"
IMG_DIR = "images"
os.makedirs(IMG_DIR, exist_ok=True)  # create the image folder up front

# Browser-like User-Agent sent with every request made through SESSION.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# One shared HTTP session reused by all page and image requests.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": _USER_AGENT})

## Utilitários

Funções auxiliares para requisições HTTP, normalização de texto, remoção de acentos, divisão de listas, filtragem de ingredientes, geração de slugs para nomes de arquivos e formatação de preços.

In [None]:
def strip_accents(s: str) -> str:
    """Return ``s`` with combining accent marks removed.

    The text is decomposed with Unicode NFD and every character whose
    category is ``Mn`` (non-spacing mark) is dropped. Non-string input is
    converted with ``str()`` first.
    """
    text = s if isinstance(s, str) else str(s)
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalize_space(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to one space and trim the ends."""
    # Splitting on whitespace runs leaves empty pieces at the edges; drop them.
    words = [piece for piece in re.split(r"\s+", s) if piece]
    return " ".join(words)

def slugify(text: str) -> str:
    """Build a lowercase, hyphen-separated ASCII slug from ``text``.

    Accents are stripped, every run of characters outside ``a-z0-9`` becomes a
    single hyphen, and leading/trailing hyphens are removed. Returns
    ``"produto"`` when nothing alphanumeric is left.
    """
    ascii_lower = strip_accents(text.lower())
    # Joining the alphanumeric runs with "-" is the same as replacing each
    # separator run by one hyphen and trimming hyphens from both ends.
    words = re.findall(r"[a-z0-9]+", ascii_lower)
    slug = "-".join(words)
    return slug if slug else "produto"

def get_soup(url, max_retries=3, timeout=25):
    """Fetch ``url`` through the shared session and parse it with BeautifulSoup.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of attempts before giving up.
        timeout: Per-request timeout passed to ``SESSION.get``.

    Returns:
        A ``BeautifulSoup`` document (``lxml`` parser) for the first response
        with status 200, or ``None`` if every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            r = SESSION.get(url, timeout=timeout)
            if r.status_code == 200:
                return BeautifulSoup(r.text, "lxml")
            print(f"[WARN] {url} -> status {r.status_code}")
        except Exception as e:
            # Best effort: log the failure and fall through to the retry.
            print(f"[ERROR] {url} -> {e}")
        # Linear back-off between attempts; no point waiting after the last one.
        if attempt < max_retries - 1:
            time.sleep(1.1 * (attempt + 1))
    return None

def should_exclude_product(text: str) -> bool:
    """Tell whether ``text`` contains any of the ``EXCLUDE_KEYWORDS``.

    Both sides are lowercased and stripped of accents, and the check is a
    plain substring test. Empty or missing text is never excluded.
    """
    if not text:
        return False

    haystack = strip_accents(text.lower())
    return any(
        strip_accents(keyword.lower()) in haystack
        for keyword in EXCLUDE_KEYWORDS
    )

def parse_price(price_text: str) -> str:
    """Convert a displayed price into a ``"1234.56"``-style string.

    Handles the Brazilian format (``R$ 1.234,56``) as well as values that
    already use a decimal point (``84.20``).

    Args:
        price_text: Raw text taken from the price element.

    Returns:
        The first number found, formatted with two decimals, or ``""`` when
        the text is empty or holds no digits.
    """
    if not price_text:
        return ""

    # Drop the currency marker and spaces (including non-breaking ones) so
    # prices rendered as separate pieces ("84 , 20") are glued back together.
    cleaned = price_text.replace("R$", "").replace("r$", "").strip()
    cleaned = cleaned.replace(" ", "").replace("\xa0", "")

    if "," in cleaned:
        # Brazilian notation: "." groups thousands, "," marks the decimals.
        cleaned = cleaned.replace(".", "").replace(",", ".")
    else:
        # Without a decimal comma, a dot followed by exactly three digits is a
        # thousands separator; any other dot is kept as the decimal point.
        cleaned = re.sub(r"\.(?=\d{3}(?!\d))", "", cleaned)

    match = re.search(r"[0-9]+(?:\.[0-9]{1,2})?", cleaned)
    if match is None:
        return ""

    return f"{float(match.group(0)):.2f}"

def split_text_list(text: str):
    """Split free text into a list of trimmed, non-empty items.

    Literal ``<br>`` variants are treated as separators, then the text is cut
    on ``;``, ``•``, ``|``, ``/``, newlines and commas. Each piece has its
    whitespace normalised; empty pieces are dropped.
    """
    if not text:
        return []

    # Turn line-break tags into a separator the split below understands.
    for br_tag in ("<br>", "<br/>", "<br />"):
        text = text.replace(br_tag, ";")

    items = []
    for chunk in re.split(r"[;•|/\n,]", text):
        cleaned = normalize_space(chunk)
        if cleaned:
            items.append(cleaned)
    return items

def filter_ingredients(raw_ingredients):
    """Keep only the ingredients recognised in ``INGREDIENTES_VALIDOS``.

    Each raw entry is lowercased and accent-stripped, then matched by
    substring against the normalised valid names. Only the first valid name
    that matches an entry is kept.

    Returns:
        The matched names (normalised form, first-seen order, no repeats)
        joined by ``"; "``, or ``""`` when there is no input.
    """
    if not raw_ingredients:
        return ""

    matched = []
    for raw in raw_ingredients:
        haystack = strip_accents(raw.lower())
        # Lazily normalise the valid names and stop at the first one present.
        candidates = (strip_accents(valid.lower()) for valid in INGREDIENTES_VALIDOS)
        first_hit = next((name for name in candidates if name in haystack), None)
        if first_hit is not None and first_hit not in matched:
            matched.append(first_hit)

    return "; ".join(matched)

def find_text_by_selectors(soup, selectors):
    """Return the text of the first selector that matches a non-empty element.

    Selectors are tried in order with ``soup.select_one``. Matches whose
    stripped text is empty are skipped. The returned text joins the element's
    pieces with single spaces; ``""`` is returned when nothing qualifies.
    """
    for css in selectors:
        node = soup.select_one(css)
        if not node:
            continue
        if not node.get_text(strip=True):
            continue
        return node.get_text(" ", strip=True)
    return ""

## Benefícios, Ingredientes

In [22]:
def standardize_product_benefits(benefit_text_list):
    """Map free-text benefit snippets onto canonical benefit names.

    Every snippet and every synonym in ``BENEFIT_SYNONYMS_PT`` is
    accent-stripped and lowercased; a canonical benefit is selected when any
    of its synonyms occurs as a substring of a snippet.

    Returns:
        The canonical names found, ordered by their position in
        ``BENEFIT_CANONICAL_ORDER`` when that sequence is non-empty (names not
        listed there go last), otherwise sorted alphabetically. Empty input
        yields ``[]``.
    """
    if not benefit_text_list:
        return []

    # Normalise the synonym table once per call.
    synonym_index = {}
    for canonical, synonyms in BENEFIT_SYNONYMS_PT.items():
        synonym_index[canonical] = [strip_accents(term).lower() for term in synonyms if term]

    found = set()
    for raw_text in benefit_text_list:
        haystack = strip_accents(raw_text).lower()
        for canonical, needles in synonym_index.items():
            for needle in needles:
                if needle in haystack:
                    found.add(canonical)
                    break

    if BENEFIT_CANONICAL_ORDER:
        rank = {name: position for position, name in enumerate(BENEFIT_CANONICAL_ORDER)}
        return sorted(list(found), key=lambda name: rank.get(name, 999))

    return sorted(list(found))

def extract_product_benefits(html_soup):
    """Collect short list/table texts from the page and standardise them.

    Candidate snippets are the ``<li>`` items of every ``<ul>``/``<ol>`` and
    then every ``<th>``/``<td>`` cell, keeping only texts of 1 to 120
    characters after whitespace normalisation. Duplicates are removed while
    preserving order before the snippets go to
    ``standardize_product_benefits``.

    Returns:
        The standardised benefits joined by ``"; "`` (possibly ``""``).
    """
    def _short_text(node):
        # Normalised text of the node, or None when it is empty or too long.
        text = normalize_space(node.get_text(" ", strip=True))
        return text if 0 < len(text) <= 120 else None

    list_items = [
        item
        for container in html_soup.select("ul, ol")
        for item in container.select("li")
    ]
    table_cells = html_soup.select("th, td")

    snippets = []
    for node in [*list_items, *table_cells]:
        text = _short_text(node)
        if text is not None:
            snippets.append(text)

    # dict.fromkeys removes repeats and keeps the first-seen order.
    unique_snippets = list(dict.fromkeys(snippets))
    return "; ".join(standardize_product_benefits(unique_snippets))

def extract_product_ingredients(html_soup):
    """Extract the recognised ingredients mentioned on a product page.

    Every ``div``, ``section``, ``table``, ``article``, ``ul``, ``ol`` and
    ``p`` whose lowercased text contains one of the ingredient-section labels
    is kept. The distinct texts are processed from shortest to longest, split
    into items with ``split_text_list``, and items longer than 100 characters
    are dropped before ``filter_ingredients`` runs.

    Returns:
        The string produced by ``filter_ingredients`` (``"; "``-joined names,
        possibly empty).
    """
    ingredient_section_labels = [
        "ingredientes", "composição", "composicao", "fórmula", "formula", "ingredients", "active ingredients"
    ]

    relevant_text_blocks = []
    for content_element in html_soup.select("div, section, table, article, ul, ol, p"):
        element_text = content_element.get_text(" ", strip=True)
        lowercase_text = element_text.lower()
        if any(label in lowercase_text for label in ingredient_section_labels):
            relevant_text_blocks.append(element_text)

    # Deduplicate with dict.fromkeys rather than set(): it keeps document
    # order, so the stable sort below breaks length ties the same way on every
    # run (set iteration order for strings changes with hash randomisation).
    unique_text_blocks = list(dict.fromkeys(relevant_text_blocks))
    sorted_text_blocks = sorted(unique_text_blocks, key=len)

    raw_ingredient_list = []
    for text_block in sorted_text_blocks:
        raw_ingredient_list.extend(split_text_list(text_block))

    filtered_ingredients = [ingredient for ingredient in raw_ingredient_list if len(ingredient) <= 100]
    return filter_ingredients(filtered_ingredients)

## Quantidade e Tipo de pele

In [23]:
def extract_product_quantity_from_text(product_text: str) -> str:
    """Find the first quantity such as ``30ml``, ``1,5 L`` or ``250 g`` in a text.

    The search is case-insensitive (the text is lowercased first) and the unit
    must end at a word boundary.

    Returns:
        The number, with a decimal comma turned into a dot, followed by the
        upper-cased unit (for example ``"30ML"`` or ``"1.5L"``), or ``""``
        when the text is empty or holds no quantity.
    """
    if not product_text:
        return ""

    found = re.search(r"(\d+[\.,]?\d*)\s*(ml|g|l)\b", product_text.lower())
    if found is None:
        return ""

    amount, unit = found.groups()
    return amount.replace(",", ".") + unit.upper()

def extract_compatible_skin_types(html_soup):
    """Detect which canonical skin types are mentioned anywhere on the page.

    The full page text and every synonym in ``SKIN_TYPE_SYNONYMS_PT`` are
    normalised the same way (accents removed, lowercased, hyphens and
    punctuation turned into spaces, whitespace collapsed) and compared by
    substring.

    Returns:
        ``"todos os tipos"`` as soon as one of that entry's synonyms is found;
        otherwise the matched canonical types joined by ``"; "`` and ordered
        by ``SKIN_TYPE_CANONICAL_ORDER`` (unlisted types last); ``""`` when
        nothing matches.
    """
    def _canonical_form(text: str) -> str:
        # Shared normalisation applied to both the page text and the synonyms.
        if not text:
            return ""
        decomposed = unicodedata.normalize("NFD", text)
        without_marks = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
        spaced = without_marks.lower().replace("-", " ")
        spaced = re.sub(r"[^\w\s]", " ", spaced)
        return re.sub(r"\s+", " ", spaced).strip()

    page_text = _canonical_form(html_soup.get_text(" ", strip=True))

    patterns_by_type = {}
    for canonical, synonyms in SKIN_TYPE_SYNONYMS_PT.items():
        patterns_by_type[canonical] = [_canonical_form(term) for term in synonyms if term]

    def _mentioned(patterns) -> bool:
        # True when any non-empty pattern occurs in the normalised page text.
        return any(pattern and pattern in page_text for pattern in patterns)

    # The universal label wins over any specific type.
    if _mentioned(patterns_by_type.get("todos os tipos", [])):
        return "todos os tipos"

    matched = {
        canonical
        for canonical, patterns in patterns_by_type.items()
        if canonical != "todos os tipos" and _mentioned(patterns)
    }
    if not matched:
        return ""

    rank = {skin_type: position for position, skin_type in enumerate(SKIN_TYPE_CANONICAL_ORDER or [])}
    return "; ".join(sorted(matched, key=lambda skin_type: rank.get(skin_type, 10_000)))

## Imagem

In [24]:
def download_image(soup, product_name):
    """Download the main product image and save it under ``IMG_DIR``.

    The selectors are tried in order; for the first matching ``<img>`` with a
    usable address, ``src`` is preferred and ``data-src`` is the fallback.
    Inline ``data:`` URIs are ignored so that a lazy-loading placeholder in
    ``src`` does not hide the real address in ``data-src``.

    Args:
        soup: Parsed product page.
        product_name: Product name; its slug becomes the file name.

    Returns:
        The saved file name (relative to ``IMG_DIR``), or ``""`` when no image
        address was found or the download failed.
    """
    selectors = [
        "img.vtex-store-components-3-x-productImageTag",
        "img.product-image",
        "img[src*='/arquivos/']",
        "img[src*='cdn']",
    ]
    src = None
    for sel in selectors:
        node = soup.select_one(sel)
        if not node:
            continue
        for attribute in ("src", "data-src"):
            candidate = node.get(attribute)
            if candidate and not candidate.startswith("data:"):
                src = candidate
                break
        if src:
            break
    if not src:
        return ""

    img_url = src if src.startswith("http") else urljoin(BASE_URL, src)
    # Keep the extension from the URL path; default to .jpg when it has none.
    ext = os.path.splitext(urlparse(img_url).path)[1] or ".jpg"
    fname = f"{slugify(product_name)}{ext}"
    fpath = os.path.join(IMG_DIR, fname)
    try:
        r = SESSION.get(img_url, timeout=25)
        if r.status_code == 200 and r.content:
            with open(fpath, "wb") as f:
                f.write(r.content)
            return fname
        # Report unsuccessful responses instead of failing silently.
        print(f"[IMG] {img_url} -> status {r.status_code}")
    except Exception as e:
        print(f"[IMG] Falha ao baixar {img_url}: {e}")
    return ""


## Categoria

In [25]:
# Position of each category in CATEGORY_CANONICAL_ORDER, used as a sort rank.
_CAT_ORDER_MAP = {
    category: position
    for position, category in enumerate(CATEGORY_CANONICAL_ORDER)
}

def normalize_category_text(s: str) -> str:
    """Return ``s`` accent-stripped and lowercased; falsy input becomes ``""``."""
    raw = s if s else ""
    return strip_accents(raw).lower()

def classify_category_from_name(name: str, subtitle: str | None = None, desc: str | None = None) -> str:
    """Pick a category from keyword hints found in the product texts.

    ``name``, ``subtitle`` and ``desc`` are concatenated and normalised; a
    category is a candidate when any of its ``CATEGORY_HINTS`` needles occurs
    in that text.

    Returns:
        The candidate ranked first in ``CATEGORY_CANONICAL_ORDER`` (ties keep
        ``CATEGORY_HINTS`` order), or ``""`` when no hint matches.
    """
    haystack = normalize_category_text(f"{name or ''} {subtitle or ''} {desc or ''}")

    candidates = [
        category
        for category, needles in CATEGORY_HINTS.items()
        if any(normalize_category_text(needle) in haystack for needle in needles)
    ]
    if not candidates:
        return ""

    # min() returns the first lowest-ranked item, like a stable sort would.
    return min(candidates, key=lambda category: _CAT_ORDER_MAP.get(category, 10_000))

def guess_category(url: str, name: str) -> str:
    """Guess a category from the product name alone.

    ``url`` is accepted but not used; the result comes from
    ``classify_category_from_name(name)``.
    """
    category = classify_category_from_name(name)
    return category

## Produtos

In [26]:
def parse_product_page(product_url, fallback_category=""):
    """Scrape one product page into a flat dictionary.

    Args:
        product_url: Address of the product page.
        fallback_category: Category used when the name/subtitle hints find none.

    Returns:
        A dict with the keys ``marca``, ``nome``, ``subtitulo``, ``categoria``,
        ``preco``, ``quantidade``, ``beneficios``, ``ingredientes``,
        ``tipos_pele`` and ``imagem``; or ``None`` when the page cannot be
        fetched, has no name, or is excluded by ``should_exclude_product``.

    NOTE(review): the skin-type field is emitted as ``tipos_pele`` (plural);
    confirm that consumers expecting ``tipo_pele`` handle this key.
    """
    page = get_soup(product_url)
    if page is None:
        return None

    name = find_text_by_selectors(page, [
        "h1.vtex-store-components-3-x-productName",
        "h1.productName",
        "h1",
        "div.product-name h1",
    ])
    # No heading matched: use the <title> text before the first "|".
    if not name and page.title and page.title.string:
        name = page.title.string.split("|")[0].strip()
    if not name:
        return None

    if should_exclude_product(name) or should_exclude_product(product_url):
        print(f"[SKIP] Produto excluído: {name}")
        return None

    subtitle = find_text_by_selectors(page, [
        "span.vtex-product-summary-2-x-description-short div",
        "span.vtex-product-summary-2-x-description-short",
        "div.vtex-rich-text-0-x-container p",
        "div.productDescription",
        "div.product-brief",
    ])
    # Matches longer than 220 characters are not treated as a subtitle.
    if not subtitle or len(subtitle) > 220:
        subtitle = ""

    price = parse_price(find_text_by_selectors(page, [
        "p.priceCustom__sellingPrice span",
        "span.vtex-product-price-1-x-sellingPriceValue",
        "span.selling-price",
        "span.price",
    ]))

    benefits = extract_product_benefits(page)
    ingredients = extract_product_ingredients(page)
    skin_types = extract_compatible_skin_types(page)

    # Quantity: look in the name, then the subtitle, then the description block.
    quantity = (
        extract_product_quantity_from_text(name)
        or extract_product_quantity_from_text(subtitle)
    )
    if not quantity:
        details = find_text_by_selectors(page, [
            "div.vtex-store-components-3-x-productDescriptionText",
            "div.productDescription",
            "section#descricao",
        ])
        quantity = extract_product_quantity_from_text(details)

    category = (
        classify_category_from_name(name, subtitle)
        or fallback_category
        or guess_category(product_url, name)
    )

    image_name = download_image(page, name)

    return {
        "marca": "creamy",
        "nome": name.strip(),
        "subtitulo": subtitle or "",
        "categoria": category,
        "preco": price or "",
        "quantidade": quantity or "",
        "beneficios": benefits.lower() if benefits else "",
        "ingredientes": ingredients.lower() if ingredients else "",
        "tipos_pele": skin_types,
        "imagem": image_name,
    }

def extract_product_links_from_listing(listing_page_url: str):
    """Return the sorted, de-duplicated product URLs linked from a listing page.

    An anchor counts as a product link when its absolute URL contains ``/p``
    at the very end or directly before a ``?``. Relative hrefs are resolved
    against ``BASE_URL``. Returns ``[]`` when the page cannot be fetched.
    """
    soup = get_soup(listing_page_url)
    if soup is None:
        return []

    product_path = re.compile(r"/p($|\?)")
    links = set()
    for anchor in soup.select("a[href]"):
        href = anchor.get("href")
        if not href:
            continue
        url = href if href.startswith("http") else urljoin(BASE_URL, href)
        if product_path.search(url):
            links.add(url)

    return sorted(links)

### Saída

In [None]:
def execute_creamy_scraper():
    """Walk the listing pages and scrape every product found.

    Pages ``1..MAX_PAGES`` of ``LISTING_URL_TEMPLATE`` are visited in order.
    Each product is fetched at most once: URLs are compared without their
    query string and fragment, so ``/p`` and ``/p?...`` links to the same page
    do not produce duplicate rows, and a URL counts as visited as soon as it
    is attempted, so pages that failed or were skipped are not downloaded
    again when they reappear on a later listing page.

    Returns:
        The list of product dicts returned by ``parse_product_page``.
    """
    visited_product_urls = set()
    scraped_products_list = []

    for current_page in range(1, MAX_PAGES + 1):
        page_listing_url = LISTING_URL_TEMPLATE.format(page=current_page)
        print(f"Processando página {current_page}: {page_listing_url}")

        discovered_product_links = extract_product_links_from_listing(page_listing_url)
        print(f"Encontrados {len(discovered_product_links)} produtos na página")

        for product_url in discovered_product_links:
            # Deduplication key: the URL without query string or fragment.
            canonical_url = product_url.split("?", 1)[0].split("#", 1)[0]
            if canonical_url in visited_product_urls or should_exclude_product(product_url):
                continue

            # Mark before fetching so failures are not retried on later pages.
            visited_product_urls.add(canonical_url)

            extracted_product_data = parse_product_page(product_url)
            if extracted_product_data:
                scraped_products_list.append(extracted_product_data)
                print(f"Produto extraído: {extracted_product_data['nome']} - R$ {extracted_product_data['preco']}")

            # Pause between product requests.
            time.sleep(0.6)

    return scraped_products_list

def save_data(products_data, json_path="creamy_products.json"):
    """Write the scraped products to a JSON file with a fixed set of columns.

    Args:
        products_data: Iterable of product dicts.
        json_path: Destination file; defaults to ``creamy_products.json``.

    Each output row contains exactly the columns listed below; missing values
    become ``""``. The skin-type column is ``tipo_pele``; when a product only
    carries the plural key ``tipos_pele`` its value is used, so the field is
    no longer written empty.
    """
    output_columns = ["marca", "nome", "subtitulo", "categoria", "quantidade", "preco", "beneficios", "ingredientes", "tipo_pele", "imagem"]

    cleaned_products_data = []
    for product in products_data:
        row = {column: product.get(column, "") for column in output_columns}
        if not row["tipo_pele"]:
            # Accept the plural key produced by the page parser.
            row["tipo_pele"] = product.get("tipos_pele") or ""
        cleaned_products_data.append(row)

    with open(json_path, "w", encoding="utf-8") as output_file:
        json.dump(cleaned_products_data, output_file, ensure_ascii=False, indent=2)

    print(f"JSON gerado: {json_path} ({len(cleaned_products_data)} produtos)")

# Run the full scrape and write the JSON output.
if __name__ == "__main__":
    banner_rule = "_" * 50

    print(banner_rule)
    print("WEB SCRAPING Creamy")
    print(banner_rule)

    extracted_products = execute_creamy_scraper()

    print(banner_rule)
    total_collected = len(extracted_products)
    print(f"Fim da execução! Total coletado: {total_collected} produtos")

    save_data(extracted_products)


__________________________________________________
WEB SCRAPING Creamy
__________________________________________________
Processando página 1: https://www.creamy.com.br/produtos?page=1
Encontrados 10 produtos na página
Encontrados 10 produtos na página
Produto extraído: Creme Retexturizador - Ácido Glicólico - R$ 84.20
Produto extraído: Creme Retexturizador - Ácido Glicólico - R$ 84.20
Produto extraído: Sérum Renovador Suave - Ácido Lático - R$ 73.67
Produto extraído: Sérum Renovador Suave - Ácido Lático - R$ 73.67
Produto extraído: Gel Clareador Antiacne - Ácido Mandélico - R$ 84.20
Produto extraído: Gel Clareador Antiacne - Ácido Mandélico - R$ 84.20
Produto extraído: Tônico Antioleosidade - Ácido Salicílico - R$ 85.25
Produto extraído: Tônico Antioleosidade - Ácido Salicílico - R$ 85.25
Produto extraído: Gel-creme Hidratante Calmante - Calming Cream - R$ 52.62
Produto extraído: Gel-creme Hidratante Calmante - Calming Cream - R$ 52.62
Produto extraído: Creme Clareador para Olhos - E

###  CSV


In [4]:
def json_to_csv(json_file="creamy_products.json", csv_file="creamy_products.csv"):
    """Convert the scraped JSON file into a CSV with a fixed column order.

    Args:
        json_file: Path of the JSON file holding a list of product dicts.
        csv_file: Path of the CSV file to write.

    Missing or falsy values are written as empty strings. Nothing is raised:
    an empty JSON, a missing file (in which case the ``*.json`` files in the
    current directory are listed) and any other error are reported with
    ``print``.
    """
    cols = ["marca", "nome", "subtitulo", "categoria", "quantidade", "preco", "beneficios", "ingredientes", "tipo_pele", "imagem"]

    try:
        with open(json_file, "r", encoding="utf-8") as source:
            data = json.load(source)

        if not data:
            print(f"Nenhum dado encontrado no arquivo {json_file}")
            return

        with open(csv_file, "w", encoding="utf-8", newline="") as target:
            writer = csv.DictWriter(target, fieldnames=cols)
            writer.writeheader()
            # Rows are built lazily, one at a time, as they are written.
            writer.writerows({k: (row.get(k) or "") for k in cols} for row in data)

        print(f"CSV gerado: {csv_file} ({len(data)} linhas)")
        print(f"A partir do JSON: {json_file}")

    except FileNotFoundError:
        print(f" Arquivo {json_file} não encontrado!")

        # Show which JSON files are available in the working directory.
        import glob
        available = glob.glob("*.json")
        if not available:
            print("   Nenhum arquivo .json encontrado")
        else:
            for candidate in available:
                print(f"   - {candidate}")
    except Exception as e:
        print(f"Erro ao converter JSON para CSV: {e}")


# Convert the scraped JSON into a CSV (default CSV name: creamy_products.csv).
json_to_csv(json_file="creamy_products.json")

CSV gerado: creamy_products.csv (20 linhas)
A partir do JSON: creamy_products.json
