# Sallve

In [3]:
import sys
import os
import re
import json
import time
import unicodedata
import csv
from urllib.parse import urljoin, urlparse, parse_qs, urlencode
from pathlib import Path
from typing import List, Dict, Optional

from bs4 import BeautifulSoup

sys.path.append(os.path.abspath("./../models"))

from skin import (
    SKIN_TYPE_CANONICAL_ORDER,
    SKIN_TYPE_SYNONYMS_PT,
)

from exclude import (
    EXCLUDE_KEYWORDS,
)

from ingredient import (
    INGREDIENTES_VALIDOS,
)

from benefits import (
    BENEFIT_SYNONYMS_PT,
    BENEFIT_CANONICAL_ORDER,
)

from category import (
    CATEGORY_CANONICAL_ORDER,
    CATEGORY_HINTS,
)

### Configurações Iniciais

In [26]:
BASE_URL = "https://www.sallve.com.br"
COLLECTION_URL = "https://www.sallve.com.br/collections/loja"


SKIN_FILTER_URLS = {
    "seca":      "https://www.sallve.com.br/collections/super-ativos?filter.p.m.filter.skin_type=pele+seca",
    "mista":     "https://www.sallve.com.br/collections/super-ativos?filter.p.m.filter.skin_type=pele+mista",
    "sensível":  "https://www.sallve.com.br/collections/super-ativos?filter.p.m.filter.skin_type=pele+sens%C3%ADvel",
    "normal":    "https://www.sallve.com.br/collections/super-ativos?filter.p.m.filter.skin_type=pele+normal",
    "oleosa":    "https://www.sallve.com.br/collections/super-ativos?filter.p.m.filter.skin_type=pele+oleosa",
    "todos os tipos": "https://www.sallve.com.br/collections/super-ativos?filter.p.m.filter.skin_type=todos+os+tipos+de+pele",
}

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

IMAGES_DIR = Path("images")
IMAGES_DIR.mkdir(parents=True, exist_ok=True)

## Utilitários

### Funções auxiliares para normalização de texto, remoção de acentos, tokenização dos ingredientes, nomes de arquivos e formatação de preços.

In [None]:
def strip_accents_lower(s: str) -> str:
    if not s:
        return ""
    s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
    s = s.lower()
    s = s.replace("-", " ")
    s = re.sub(r"[^\w\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def safe_join_url(url: str) -> str:
    return "https:" + url if url.startswith("//") else url

def tokenize_ingredientes(texto: str):

    if not texto:
        return []
    texto = texto.replace("\n", " ").replace("\r", " ")
    partes = re.split(r"[;,·•|\u2022]|(?:\s{2,})|,|\.", texto)
    return [p.strip() for p in partes if p and p.strip()]

def padroniza_por_lista(tokens, lista_validos):
    valid_norm_map = {_strip_accents_lower(v): v for v in lista_validos}
    padronizados, vistos = [], set()

    for tok in tokens:
        n = strip_accents_lower(tok)
        key = None
        
        for kn in valid_norm_map.keys():
            if n == kn or n.startswith(kn) or kn in n:
                key = kn
                break
                
        if key and key not in vistos:
            padronizados.append(valid_norm_map[key])
            vistos.add(key)
            
    return padronizados

def add_or_replace_page_param(url: str, page: int) -> str:

    parsed = urlparse(url)
    q = parse_qs(parsed.query, keep_blank_values=True)
    q["page"] = [str(page)]
    new_query = urlencode({k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in q.items()}, doseq=True)
    return parsed._replace(query=new_query).geturl()

def sanitize_filename(name: str) -> str:

    base = strip_accents_lower(name)
    base = re.sub(r"[^a-z0-9]+", "-", base).strip("-")
    base = re.sub(r"-{2,}", "-", base)
    return base or "produto"

def normalize_price_text(txt: str) -> str | None:
    if not txt:
        return None
    
    t = re.sub(r"\s+", " ", txt).strip().replace("\xa0", " ")
    
    m = re.search(r"(?:R\$)?\s*(\d{1,3}(?:\.\d{3})*,\d{2})", t)
    if m:
        num = m.group(1).replace(".", "").replace(",", ".")
        try:
            return f"{float(num):.2f}"
        except ValueError:
            return None
    
    return None

def should_exclude_product(product_name: str) -> bool:

    if not product_name:
        return True
    
    def strip_accents(s: str) -> str:
        if not isinstance(s, str):
            s = str(s)
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    
    product_norm = strip_accents(product_name.lower())
    
    for keyword in EXCLUDE_KEYWORDS:
        if keyword and strip_accents(keyword.lower()) in product_norm:
            return True
    
    return False

##  Coleta de Links de Produtos Disponíveis

In [28]:
def get_product_links_from_collection(max_pages: int = 8):
    links, vistos = [], set()
    for page in range(1, max_pages + 1):
        url = f"{COLLECTION_URL}?page={page}"
        try:
            r = requests.get(url, headers=HEADERS, timeout=30)
            if r.status_code != 200:
                break
            soup = BeautifulSoup(r.content, "html.parser")
            for a in soup.find_all("a", href=True):
                href = a["href"]
                if "/products/" in href:
                    full = urljoin(BASE_URL, href)
                    if full not in vistos:
                        vistos.add(full)
                        links.append(full)
            time.sleep(0.7)
        except Exception as e:
            print(f"[get_product_links_from_collection] Erro na página {page}: {e}")
            break
    return links

## Ingredientes

In [None]:
def extract_ingredients(soup: BeautifulSoup) -> str:

    collected_ingredients = []
    
    ingredients_section = soup.find("div", class_=re.compile(r"\btabcontent\b.*\bingredients\b", re.I))
    
    if ingredients_section:

        header_elements = ingredients_section.find_all("h2")
        for header in header_elements:
            header_text = header.get_text(" ", strip=True)
            if header_text:
                collected_ingredients.append(header_text)
        
        ingredients_summary = ingredients_section.find("div", class_=re.compile(r"\bingredients_resume\b", re.I))
        if ingredients_summary:
            full_content = ingredients_summary.get_text("\n", strip=True)
            normalized_content = strip_accents_lower(full_content)
            
            main_ingredients_start = normalized_content.find("ingredientes:")
            if main_ingredients_start != -1:
                main_ingredients_section = full_content[main_ingredients_start + len("ingredientes:"):]
                
                portuguese_section_markers = ["ingredientes em portugues:", "ingredientes em português:"]
                for section_marker in portuguese_section_markers:
                    marker_position = strip_accents_lower(main_ingredients_section).find(section_marker)
                    if marker_position != -1:
                        main_ingredients_section = main_ingredients_section[:marker_position]
                        break
                
                main_ingredients_tokens = tokenize_ingredientes(main_ingredients_section)
                collected_ingredients.extend(main_ingredients_tokens)
            
            portuguese_section_keys = ["ingredientes em portugues:", "ingredientes em português:"]
            for portuguese_key in portuguese_section_keys:
                portuguese_start = normalized_content.find(portuguese_key)
                if portuguese_start != -1:
                    portuguese_section = full_content[portuguese_start + len(portuguese_key):]
                    portuguese_tokens = tokenize_ingredientes(portuguese_section)
                    collected_ingredients.extend(portuguese_tokens)
                    break
    
    filtered_ingredients = []
    for ingredient_token in collected_ingredients:
        if not ingredient_token:
            continue
        
        cleaned_ingredient = ingredient_token.strip().strip(":").strip()
        if not cleaned_ingredient:
            continue
        
        word_count = len(cleaned_ingredient.split())
        has_separators = any(separator in cleaned_ingredient for separator in [",", ";"])
        
        if word_count > 8 and not has_separators:
            continue
        
        filtered_ingredients.append(cleaned_ingredient)
    standardized_ingredients = padroniza_por_lista(filtered_ingredients, INGREDIENTES_VALIDOS)
    
    return "; ".join(standardized_ingredients)

## Benefícios

In [None]:
def standardize_benefits(benefit_texts: List[str]) -> List[str]:
    if not benefit_texts:
        return []
    
    found_benefits = set()
    
    normalized_synonyms = {
        canonical: [_strip_accents_lower(synonym) for synonym in patterns if synonym]
        for canonical, patterns in BENEFIT_SYNONYMS_PT.items()
    }
    
    for text in benefit_texts:
        normalized_text = strip_accents_lower(text)
        
        for canonical, patterns in normalized_synonyms.items():
            if any(pattern in normalized_text for pattern in patterns):
                found_benefits.add(canonical)
    
    if BENEFIT_CANONICAL_ORDER:
        order_mapping = {name: index for index, name in enumerate(BENEFIT_CANONICAL_ORDER)}
        return sorted(list(found_benefits), key=lambda x: order_mapping.get(x, 999))
    
    return sorted(list(found_benefits))

def extract_beneficios(soup: BeautifulSoup) -> str:
 
    benefit_candidates = []
    for details in soup.find_all("details", class_=re.compile(r"\bDifferentials\b", re.I)):
        for list_item in details.find_all("li"):
            text = list_item.get_text(" ", strip=True)
            if not text:
                continue
            
            text_normalized = strip_accents_lower(text)
            
            if re.fullmatch(r"\d+(?:[.,]\d+)?", text_normalized):
                continue
            
            excluded_words = ["ponto", "pontos", "minha sallve"]
            if any(word in text_normalized for word in excluded_words):
                continue
            
            if len(text) <= 200:
                benefit_candidates.append(text)
    
    if not benefit_candidates:
        main_content = soup.find("article", class_=re.compile(r"\bRegularMain__content\b"))
        search_area = main_content or soup
        
        for list_item in search_area.find_all("li"):
            text = list_item.get_text(" ", strip=True)
            if text and len(text) <= 160:
                benefit_candidates.append(text)
    
    standardized_benefits = standardize_benefits(benefit_candidates)
    return "; ".join(standardized_benefits)

## Imagem

In [None]:
def select_best_image_from_srcset(srcset: str) -> str:

    best_url, best_width = None, -1
    
    for part in srcset.split(","):
        part = part.strip()
        if not part:
            continue
        
        width_match = re.match(r"(.+?)\s+(\d+)w", part)
        if width_match:
            url, width = width_match.group(1).strip(), int(width_match.group(2))
            if width > best_width:
                best_url, best_width = url, width
        else:
            if best_url is None:
                best_url = part
    
    return best_url

def extract_image_url(soup: BeautifulSoup) -> Optional[str]:

    image_candidates = []
    
    main_image = soup.find("img", class_=re.compile(r"\bview__image\b", re.I))
    if main_image:
        if main_image.get("srcset"):
            image_candidates.append(select_best_image_from_srcset(main_image["srcset"]))
        if main_image.get("src"):
            image_candidates.append(main_image["src"])
    
    if not image_candidates:
        for img in soup.find_all("img"):
            class_names = " ".join(img.get("class", []))
            relevant_keywords = ["product", "image", "gallery", "media", "hero", "view"]
            
            if re.search(r"(" + "|".join(relevant_keywords) + ")", class_names, re.I):
                if img.get("srcset"):
                    image_candidates.append(select_best_image_from_srcset(img["srcset"]))
                if img.get("src"):
                    image_candidates.append(img.get("src"))
    
    if not image_candidates:
        og_image = soup.find("meta", property="og:image")
        if og_image and og_image.get("content"):
            image_candidates.append(og_image["content"])
    
    for url in image_candidates:
        if not url:
            continue
        
        corrected_url = safe_join_url(url.strip())
        return urljoin(BASE_URL, corrected_url)
    
    return None

def detect_image_extension(url: str) -> str:

    if not url:
        return ".jpg"
    
    path = urlparse(url).path
    extension = os.path.splitext(path)[1].lower()
    
    supported_extensions = [".jpg", ".jpeg", ".png", ".webp", ".gif"]
    if extension in supported_extensions:
        return ".jpg" if extension == ".jpeg" else extension
    
    return ".jpg"  

def download_image(image_url: str, product_name: str) -> Optional[str]:

    if not image_url:
        return None
    
    try:

        response = requests.get(image_url, headers=HEADERS, timeout=40)
        response.raise_for_status()
        
        extension = detect_image_extension(image_url)
        base_filename = sanitize_filename(product_name)
        filename = f"{base_filename}{extension}"
        
        destination = IMAGES_DIR / filename
        counter = 1
        while destination.exists():
            filename = f"{base_filename}-{counter}{extension}"
            destination = IMAGES_DIR / filename
            counter += 1
        
        with open(destination, "wb") as file:
            file.write(response.content)
        
        return filename
    
    except Exception as error:
        print(f"[download_image] Falha ao baixar {image_url}: {error}")
        return None

# Preço e tamanho

In [None]:

def extract_price(soup: BeautifulSoup) -> Optional[str]:

    for cls_pat in [r"\bTotalPrice\b", r"\bTotalPrice__CTA\b"]:
        el = soup.find(["strong", "span"], class_=re.compile(cls_pat, re.I))
        if el and el.get_text(strip=True):
            p = normalize_price_text(el.get_text(" ", strip=True))
            if p:
                return p
    box = soup.find(class_=re.compile(r"\bProductPrice\b", re.I))
    if box:
        p = normalize_price_text(box.get_text(" ", strip=True))
        if p:
            return p
        strong = box.find("strong")
        if strong:
            p = normalize_price_text(strong.get_text(" ", strip=True))
            if p:
                return p
    generic = soup.find(class_=re.compile(r"price", re.I))
    if generic:
        p = normalize_price_text(generic.get_text(" ", strip=True))
        if p:
            return p
    p = normalize_price_text(soup.get_text(" ", strip=True))
    return p

def extract_size(soup: BeautifulSoup) -> Optional[str]:
    size_element = soup.find("span", class_=re.compile(r"\bProductWeight\b", re.I))
    
    if size_element:
        return size_element.get_text(strip=True)
    for alt in [
        soup.find(class_=re.compile(r"weight", re.I)),
        soup.find(class_=re.compile(r"size", re.I)),
        soup.find(class_=re.compile(r"volume", re.I)),
        soup.find(class_=re.compile(r"quantity", re.I)),
    ]:
        if alt and alt.get_text(strip=True):
            text = alt.get_text(strip=True)
            m = re.search(r"\d+\,?\d*\s*(?:ml|g|mg|kg|l|oz)", text, re.I)
            if m:
                return m.group()
    return None

## Nome e subtítulo

In [33]:

def extract_subtitle(soup: BeautifulSoup) -> Optional[str]:

    for sel in [
        "p.ProductSubtitle", "p.product-subtitle",
        ".product__subtitle", ".product__text", ".product__text.inline-richtext",
        ".ProductSummary", ".ProductDescription p",
    ]:
        node = soup.select_one(sel)
        if node and node.get_text(strip=True):
            st = node.get_text(" ", strip=True)
            return st if len(st) <= 220 else st[:220]
    return None

def extract_name(soup: BeautifulSoup) -> Optional[str]:
    name_element = soup.find("span", id="ProductNameTitle")
    if name_element and name_element.get_text(strip=True):
        return name_element.get_text(strip=True)
    for alt in [
        soup.find("h1"),
        soup.find("h2"),
        soup.find("title"),
        soup.find(class_=re.compile(r"product.*title", re.I)),
        soup.find(class_=re.compile(r"\bname\b", re.I)),
    ]:
        if alt and alt.get_text(strip=True):
            return alt.get_text(strip=True)
    return None

def fetch_soup(url: str) -> Optional[BeautifulSoup]:
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        if r.status_code != 200:
            return None
        return BeautifulSoup(r.content, "html.parser")
    except Exception as e:
        print(f"[fetch_soup] Erro em {url}: {e}")
        return None

## Categoria  e Tipos de Pele

In [None]:
_CAT_ORDER_MAP = {c: i for i, c in enumerate(CATEGORY_CANONICAL_ORDER)}

def norm_plain(s: str) -> str:
    return strip_accents_lower(s or "")

def classify_category(name: str, description: str | None = None) -> Optional[str]:
    txt = norm_plain(f"{name or ''} {description or ''}")
    hits = []
    for cat, needles in CATEGORY_HINTS.items():
        for needle in needles:
            if norm_plain(needle) and norm_plain(needle) in txt:
                hits.append(cat)
                break
    if not hits:
        return None
    hits.sort(key=lambda c: CAT_ORDER_MAP.get(c, 10_000))
    return hits[0]

_SKIN_SYNONYMS_NORM = {
    canonico: [_strip_accents_lower(x) for x in lst if x]
    for canonico, lst in SKIN_TYPE_SYNONYMS_PT.items()
}
_SKIN_ORDER_MAP = {name: i for i, name in enumerate(SKIN_TYPE_CANONICAL_ORDER or [])}

def classify_skin_types_from_strings(*strings: str) -> List[str]:
    big = strip_accents_lower(" ".join(s for s in strings if s))
    found = set()
    for canonico, pats in SKIN_SYNONYMS_NORM.items():
        if any(p and p in big for p in pats):
            found.add(canonico)
    if not found:
        return []
    return sorted(found, key=lambda x: SKIN_ORDER_MAP.get(x, 10_000))


## Extração

In [None]:
def extract_product_data_from_soup(soup: BeautifulSoup, product_url: str, product_name: str) -> Dict:

    product_subtitle = extract_subtitle(soup)
    product_benefits = extract_beneficios(soup) or ""
    product_ingredients = extract_ingredients(soup) or ""
    product_price = extract_price(soup) or ""
    product_size = extract_size(soup) or ""

    product_category = classify_category(product_name, product_subtitle) or ""

    detected_skin_types = classify_skin_types_from_strings(
        product_name, 
        product_subtitle or "", 
        product_benefits
    )

    product_data = {
        "marca": "sallve",
        "nome": product_name,
        "subtitulo": product_subtitle,
        "categoria": product_category,
        "quantidade": product_size,
        "preco": product_price,
        "ingredientes": product_ingredients,
        "beneficios": product_benefits,
        "tipo_pele": "; ".join(detected_skin_types) if detected_skin_types else "",
        "imagem": "",
    }

    image_url = extract_image_url(soup)
    if image_url:
        downloaded_image_filename = download_image(image_url, product_name)
        if downloaded_image_filename:
            product_data["imagem"] = downloaded_image_filename

    return product_data

def scrape_sallve_products() -> List[Dict]:

    print("____________________________________________________________________________________________________________")

    print("WEB SCRAPING Sallve...")
    
    product_links = get_product_links_from_collection(max_pages=8)
    if not product_links:
        print("Nenhum produto encontrado na coleção. (Checar HTML/seletores)")
        return []

    print(f"Encontrados {len(product_links)} links de produto (antes de filtro).")
    collected_products = []

    for current_index, product_url in enumerate(product_links, 1):
        print(f" [{current_index}/{len(product_links)}] {product_url}")
        
        page_soup = fetch_soup(product_url)
        if not page_soup:
            print("\n Falha ao abrir página.")
            continue

        product_name = extract_name(page_soup)
        if not product_name:
            print("\n Nome não encontrado.")
            continue

        if should_exclude_product(product_name):
            print(f"\nExcluído por keyword (models): {product_name}")
            continue

        product_data = extract_product_data_from_soup(page_soup, product_url, product_name)
        collected_products.append(product_data)
        print(f" \nOK: {product_name}")
        
        time.sleep(0.7)

    print(f"Total pós-filtro: {len(collected_products)}")
    return collected_products

def collect_collection_products_with_pagination(collection_url: str, max_pages: int = 20):

    discovered_product_names = set()
    
    for current_page in range(1, max_pages + 1):
        paginated_url = add_or_replace_page_param(collection_url, current_page)
        
        try:
            response = requests.get(paginated_url, headers=HEADERS, timeout=30)
            if response.status_code != 200:
                break
            
            page_soup = BeautifulSoup(response.content, "html.parser")
            product_links = page_soup.find_all("a", href=True)
            
            products_found_on_page = 0
            
            for link_element in product_links:
                href_attribute = link_element["href"]
                
                if "/products/" in href_attribute:
                    link_text = link_element.get_text(" ", strip=True) or ""
                    if not link_text:
                        image_element = link_element.find("img", alt=True)
                        if image_element and image_element.get("alt"):
                            link_text = image_element["alt"]
                    
                    if link_text:
                        products_found_on_page += 1
                        normalized_name = strip_accents_lower(link_text)
                        discovered_product_names.add(normalized_name)
            
            if products_found_on_page == 0:
                break
                
            time.sleep(0.5)
            
        except Exception as e:
            print(f"[skin-type-collect] Erro na página: {paginated_url} -> {e}")
            break
    
    return discovered_product_names

def enrich_with_skin_types(products: List[Dict]) -> List[Dict]:

    if not products:
        return products

    print("\nColetando tipos de pele via filtros da Sallve...")
    
    name_to_product_indices = {}
    for product_index, product_data in enumerate(products):
        normalized_name = strip_accents_lower(product_data.get("nome", ""))
        if normalized_name:
            name_to_product_indices.setdefault(normalized_name, set()).add(product_index)

    universal_skin_type_products = set()
    
    if "todos os tipos" in SKIN_FILTER_URLS:
        universal_skin_type = "todos os tipos"
        filter_url = SKIN_FILTER_URLS[universal_skin_type]
        print(f" - Filtro '{universal_skin_type}': {filter_url}")
        
        discovered_names = collect_collection_products_with_pagination(filter_url, max_pages=20)
        print(f"   · {len(discovered_names)} nome(s) coletado(s)")
        
        for normalized_name in discovered_names:
            if normalized_name in name_to_product_indices:
                for product_index in name_to_product_indices[normalized_name]:
                    universal_skin_type_products.add(product_index)
                    products[product_index]["tipo_pele"] = "todos os tipos"
        
        time.sleep(0.6)

    for skin_type_canonical, filter_url in SKIN_FILTER_URLS.items():
        if skin_type_canonical == "todos os tipos":
            continue
            
        discovered_names = collect_collection_products_with_pagination(filter_url, max_pages=20)
        
        for normalized_name in discovered_names:
            if normalized_name in name_to_product_indices:
                for product_index in name_to_product_indices[normalized_name]:
                    if product_index in universal_skin_type_products:
                        continue
                        
                    current_skin_types = products[product_index].get("tipo_pele", "") or ""
                    existing_types = [skin_type.strip() for skin_type in current_skin_types.split(";") if skin_type.strip()]
                    
                    if skin_type_canonical not in existing_types:
                        existing_types.append(skin_type_canonical)
                    
                    if SKIN_TYPE_CANONICAL_ORDER:
                        skin_type_order = {skin_type: index for index, skin_type in enumerate(SKIN_TYPE_CANONICAL_ORDER)}
                        existing_types = sorted(existing_types, key=lambda x: skin_type_order.get(x, 999))
                    
                    products[product_index]["tipo_pele"] = "; ".join(existing_types)
        
        time.sleep(0.6)

    for product_index, product_data in enumerate(products):
        current_skin_type = (product_data.get("tipo_pele") or "").strip()
        if not current_skin_type:
            product_data["tipo_pele"] = "todos os tipos"

    return products

## Arquivo e Main

In [None]:
def save_data(products_data: List[Dict]):
    if not products_data:
        print("\nNenhum dado para salvar.")
        return

    clean = []
    for p in products_data:
        clean.append({
            "marca": p.get("marca"),
            "nome": p.get("nome"),
            "subtitulo": p.get("subtitulo"),
            "categoria": p.get("categoria"),
            "quantidade": p.get("quantidade"),
            "preco": p.get("preco"),
            "beneficios": p.get("beneficios"),
            "ingredientes": p.get("ingredientes"),
            "tipo_pele": p.get("tipo_pele"),
            "imagem": p.get("imagem"),
        })

    with open("sallve_products.json", "w", encoding="utf-8") as f:
        json.dump(clean, f, ensure_ascii=False, indent=2)
    
    print(f"JSON: sallve_products.json ({len(clean)} produtos)")

if __name__ == "__main__":
    try:
        data = scrape_sallve_products()
        data = enrich_with_skin_types(data)
        save_data(data)
        print("____________________________________________________________________________________________________________")
        print(f"\nFim da execução! Produtos extraídos: {len(data)}")
        print(f"Imagens salvas em: {IMAGES_DIR.resolve()}")
        print("____________________________________________________________________________________________________________")

    except Exception as e:
        print(f"\nERRO: {e}")

____________________________________________________________________________________________________________
WEB SCRAPING Sallve...
Encontrados 146 links de produto (antes de filtro).
 [1/146] https://www.sallve.com.br/products/bastao-antioxidante-para-olhos
Encontrados 146 links de produto (antes de filtro).
 [1/146] https://www.sallve.com.br/products/bastao-antioxidante-para-olhos
 
OK: Bastão Antiolheiras Antioxidante
 
OK: Bastão Antiolheiras Antioxidante
 [2/146] https://www.sallve.com.br/products/kit-tudinho
 [2/146] https://www.sallve.com.br/products/kit-tudinho

Excluído por keyword (models): Kit Tudinho
 [3/146] https://www.sallve.com.br/collections/kits/products/kit-basicao

Excluído por keyword (models): Kit Tudinho
 [3/146] https://www.sallve.com.br/collections/kits/products/kit-basicao

Excluído por keyword (models): Kit Basicão
 [4/146] https://www.sallve.com.br/products/protetor-solar-bastao-com-cor-antimanchas-fps-90

Excluído por keyword (models): Kit Basicão
 [4/146] 

## Conversão JSON para CSV

In [6]:
def json_to_csv(json_file="sallve_products.json", csv_file="sallve_products.csv"):

    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        
        if not data:
            print(f"Nenhum dado encontrado no arquivo {json_file}")
            return
        
        cols = ["marca", "nome", "subtitulo", "categoria", "quantidade", "preco", "beneficios", "ingredientes", "tipo_pele", "imagem"]
        
        with open(csv_file, "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=cols)
            writer.writeheader()
            for row in data:

                csv_row = {k: (row.get(k) or "") for k in cols}
                writer.writerow(csv_row)
        
        print(f"CSV gerado: {csv_file} ({len(data)} linhas)")
        print(f"A partir do JSON: {json_file}")
        
    except FileNotFoundError:
        print(f" Arquivo {json_file} não encontrado!")
        
        import glob
        json_files = glob.glob("*.json")
        if json_files:
            for f in json_files:
                print(f"   - {f}")
        else:
            print("   Nenhum arquivo .json encontrado")
    except Exception as e:
        print(f"Erro ao converter JSON para CSV: {e}")


json_to_csv("sallve_products.json")  

CSV gerado: sallve_products.csv (39 linhas)
A partir do JSON: sallve_products.json
