# Oceane

In [3]:
import json, os, re, sys, time, random, unicodedata, csv
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin, urlparse, quote
import requests
from bs4 import BeautifulSoup

sys.path.append(os.path.abspath("./../models"))

from skin import SKIN_TYPE_CANONICAL_ORDER, SKIN_TYPE_SYNONYMS_PT
from exclude import EXCLUDE_KEYWORDS
from ingredient import INGREDIENTES_VALIDOS
from benefits import BENEFIT_SYNONYMS_PT, BENEFIT_CANONICAL_ORDER
from category import CATEGORY_CANONICAL_ORDER, CATEGORY_HINTS

### Configurações Iniciais

In [None]:
# Root URL of the store; every listing, product and API URL is built from it.
BASE = "https://www.oceane.com.br"
# Path of the listing page; also reused to build the VTEX search queries.
LISTING_PATH = "/skincare"
# File that receives the collected product records.
OUTPUT_JSON = "oceane_products.json"
# Directory where downloaded product images are written.
IMAGES_DIR = "images"
# API paging stops once the next window would start at or beyond this offset.
MAX_TO_FETCH = 300 
# Distance between `_from` and `_to` in each API search window.
RANGE_STEP = 48  
# Timeout passed to every HTTP request made by the notebook.
TIMEOUT = 25

# Default headers installed on every session created by `session()`.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,application/json,*/*;q=0.8",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8",
    "Connection": "keep-alive",
}

## Utilitários

### Funções auxiliares para normalização de texto, remoção de acentos, tokenização dos ingredientes, nomes de arquivos e formatação de preços.

In [None]:
def strip_acc(s):
    """Return `s` without combining marks (NFKD-decomposed); falsy input yields ""."""
    decomposed = unicodedata.normalize("NFKD", s or "")
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def norm(s):
    """Lower-case `s`, drop accents, collapse whitespace runs to one space and trim."""
    unaccented = strip_acc((s or "").lower())
    return re.sub(r"\s+", " ", unaccented).strip()

def any_excluded(text, extra=None):
    """Tell whether any entry of EXCLUDE_KEYWORDS occurs in the normalised text.

    `extra`, when truthy, is normalised and appended (space-separated) to the
    normalised `text` before the substring search.
    """
    haystack = norm(text or "")
    if extra:
        haystack = haystack + " " + norm(extra or "")
    return any(norm(keyword) in haystack for keyword in EXCLUDE_KEYWORDS)

def should_exclude(name, url=None):
    """Return True when the product name, or the path of `url`, hits an exclusion keyword."""
    name_matches = bool(name) and any_excluded(name)
    if name_matches:
        return True
    # Only the path component of the URL is inspected, never host or query.
    return bool(url) and any_excluded(urlparse(url).path)

def slugify(text):
    """Build a lowercase, hyphen-separated ASCII slug; returns "produto" if nothing is left."""
    hyphenated = re.sub(r"[^a-z0-9]+", "-", norm(text or ""))
    trimmed = re.sub(r"-+", "-", hyphenated).strip("-")
    if trimmed:
        return trimmed
    return "produto"

def extract_quantity(name):
    """Extract the first quantity-with-unit token from a product name.

    Abbreviated units (g, ml, mg, l, kg) are tried first, then the spelled-out
    Portuguese units (gramas, mililitros, litros). Matching is case-insensitive
    and the number may carry a decimal part written with "," or ".".

    Args:
        name: Product name, or a falsy value.

    Returns:
        The matched text with all inner whitespace removed (e.g. "50g",
        "1,5ml"), keeping the original letter case, or None when `name` is
        falsy or has no quantity.
    """
    if not name:
        return None
    # Accept an optional decimal part so "1,5ml" is not truncated to "5ml".
    number = r"\d+(?:[.,]\d+)?"
    patterns = (
        rf"({number}\s*(?:g|ml|mg|l|kg))\b",
        rf"({number}\s*(?:gramas|mililitros|litros))\b",
    )
    for pat in patterns:
        m = re.search(pat, name, re.I)
        if m:
            # `\s*` may have matched tabs or non-breaking spaces, not only " ".
            return re.sub(r"\s+", "", m.group(1))
    return None

def identify_category(name):
    """Return the first category (in CATEGORY_CANONICAL_ORDER) with a hint found in `name`.

    Hints come from CATEGORY_HINTS and are compared after normalisation.
    Returns None when `name` is falsy or no hint matches.
    """
    if not name:
        return None
    haystack = norm(name)
    matching = (
        category
        for category in CATEGORY_CANONICAL_ORDER
        if any(norm(hint) in haystack for hint in CATEGORY_HINTS.get(category, []))
    )
    return next(matching, None)

def identify_ingredients(text):
    """List the entries of INGREDIENTES_VALIDOS that occur in `text`.

    Matching is a normalised substring search. The result keeps the order of
    INGREDIENTES_VALIDOS, has no duplicates, and is joined with "; ".
    Returns None when `text` is falsy or nothing matches.
    """
    if not text:
        return None
    haystack = norm(text)
    # dict.fromkeys keeps first-seen order while dropping repeated entries.
    matched = list(dict.fromkeys(
        ingredient for ingredient in INGREDIENTES_VALIDOS if norm(ingredient) in haystack
    ))
    if not matched:
        return None
    return "; ".join(matched)

def identify_benefits(text):
    """Map `text` to canonical benefit labels through BENEFIT_SYNONYMS_PT.

    A label is detected when any of its synonyms occurs (normalised substring)
    in the text. Detected labels are emitted in BENEFIT_CANONICAL_ORDER and
    joined with "; ". Returns None when `text` is falsy or nothing is detected.
    """
    if not text:
        return None
    haystack = norm(text)
    detected = {
        label
        for label, synonyms in BENEFIT_SYNONYMS_PT.items()
        if any(norm(synonym) in haystack for synonym in synonyms)
    }
    labels = [label for label in BENEFIT_CANONICAL_ORDER if label in detected]
    if not labels:
        return None
    return "; ".join(labels)

def identify_skin_types(text):
    """Map `text` to canonical skin-type labels through SKIN_TYPE_SYNONYMS_PT.

    A label is detected when any of its synonyms occurs (normalised substring)
    in the text. Detected labels are emitted in SKIN_TYPE_CANONICAL_ORDER and
    joined with "; ". Returns None when `text` is falsy or nothing is detected.
    """
    if not text:
        return None
    haystack = norm(text)
    detected = {
        label
        for label, synonyms in SKIN_TYPE_SYNONYMS_PT.items()
        if any(norm(synonym) in haystack for synonym in synonyms)
    }
    labels = [label for label in SKIN_TYPE_CANONICAL_ORDER if label in detected]
    if not labels:
        return None
    return "; ".join(labels)

def session():
    """Create a requests.Session carrying HEADERS and a retrying adapter.

    The retry policy (2 attempts, 0.5 backoff factor) applies to GET, HEAD and
    OPTIONS requests answered with 429/500/502/503/504, for both schemes.
    """
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    retry_policy = Retry(
        total=2,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD", "OPTIONS"],
    )
    http = requests.Session()
    http.headers.update(HEADERS)
    for scheme in ("http://", "https://"):
        http.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return http


## Coleta de Links de Produtos Disponíveis - VTEX API

In [None]:
def vtex_search_ranges():
    """Return the (start, end) paging windows used for the API search.

    Windows begin at 0, span RANGE_STEP (end = start + RANGE_STEP), never
    overlap (next start = end + 1) and are produced while start < MAX_TO_FETCH.
    """
    stride = RANGE_STEP + 1
    return [(start, start + RANGE_STEP) for start in range(0, MAX_TO_FETCH, stride)]

def vtex_search_candidates():
    """Return the URL path templates tried for each paging window.

    Every template keeps literal `{start}` / `{end}` placeholders that the
    caller fills with `str.format`. Three query variants are offered:
    category filter (`fq`), full-text (`ft`) and path-based search.
    """
    section = LISTING_PATH.strip('/')
    search_api = "/api/catalog_system/pub/products/search"
    window = "_from={start}&_to={end}"
    category_filter = quote(f"C:/{section}/")
    return [
        f"{search_api}?fq={category_filter}&{window}",
        f"{search_api}?ft=skincare&{window}",
        f"{search_api}/{section}?{window}",
    ]

def fetch_vtex_products():
    """Collect raw product dicts from the store's VTEX catalog search API.

    For every paging window from `vtex_search_ranges()`, each URL template
    from `vtex_search_candidates()` is requested and the results are merged,
    de-duplicated by product id.

    Returns:
        List of product dicts exactly as returned by the API (possibly empty).
        Network errors and non-JSON bodies are skipped, never raised.
    """
    items = []
    seen_ids = set()
    with session() as s:
        for start, end in vtex_search_ranges():
            got_in_range = 0
            for pattern in vtex_search_candidates():
                url = BASE + pattern.format(start=start, end=end)
                try:
                    r = s.get(url, timeout=TIMEOUT)
                    # The search endpoint answers 206 (Partial Content) when the
                    # window holds only part of the result set, 200 otherwise.
                    if r.status_code not in (200, 206):
                        continue
                    data = r.json()
                except (requests.exceptions.RequestException, ValueError):
                    # Best effort: a failed or non-JSON variant is skipped and
                    # the next query variant is tried.
                    continue
                if not isinstance(data, list) or not data:
                    continue
                for prod in data:
                    # Skip malformed entries without discarding the rest of the page.
                    if not isinstance(prod, dict):
                        continue
                    pid = str(prod.get("productId") or prod.get("productID") or "")
                    if not pid or pid in seen_ids:
                        continue
                    seen_ids.add(pid)
                    items.append(prod)
                    got_in_range += 1
            # Heuristic: once a window after the first yields nothing new from
            # any variant, assume the catalog is exhausted and stop paging.
            if got_in_range == 0 and start > 0:
                break
    return items

def product_to_record_from_api(prod: dict) -> Optional[Dict]:
    """Convert one raw API product dict into the notebook's output record.

    The name, description, first image and first seller offer of the first
    item are read from `prod`; category and quantity are derived from the
    name, and benefits / ingredients / skin types from the description. The
    image, when present, is downloaded through `download_image`.

    Args:
        prod: Product dict as returned by the catalog search API.

    Returns:
        A dict with the keys marca, nome, subtitulo, categoria, quantidade,
        preco, beneficios, ingredientes, tipo_pele and imagem, or None when
        the product is excluded by name or carries no usable data at all.
    """
    name = prod.get("productName") or prod.get("productTitle") or prod.get("product_name")
    if not name:
        name = prod.get("productNameComplete")
    if should_exclude(name):
        return None

    description = prod.get("description") or prod.get("descriptionShort") or prod.get("metaTagDescription")

    img_url = None
    list_price = None
    selling_price = None

    items = prod.get("items") or []
    if items:
        it0 = items[0]
        images = it0.get("images") or []
        if images:
            im = images[0]
            img_url = im.get("imageUrl") or im.get("url") or im.get("imageUrlText")
        sellers = it0.get("sellers") or []
        if sellers:
            # "commertialOffer" (sic) is the key this code reads from the seller entry.
            off = sellers[0].get("commertialOffer") or {}
            # `or` chains also treat a price of 0 as missing.
            list_price = off.get("ListPrice") or off.get("listPrice")
            selling_price = off.get("Price") or off.get("price") or off.get("SellingPrice") or off.get("sellingPrice")

    def fmt(p):
        """Format a price with two decimals; None for missing or non-numeric values."""
        if p is None:
            return None
        try:
            return f"{float(p):.2f}"
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are swallowed, not KeyboardInterrupt etc.
            return None

    selling_price = fmt(selling_price)
    list_price = fmt(list_price)
    # Prefer the selling price; fall back to the list price.
    final_price = selling_price or list_price

    # A product with no name, price, image or description carries nothing useful.
    # Checked before downloading; img_url is falsy here, so no download is skipped.
    if not any([name, final_price, img_url, description]):
        return None

    quantidade = extract_quantity(name or "")
    categoria = identify_category(name or "")
    beneficios = identify_benefits(description)
    ingredientes = identify_ingredients(description)
    tipo_pele = identify_skin_types(description)

    slug = slugify(name or "produto")
    img_name = None
    if img_url:
        img_name = download_image(img_url, slug)

    rec = {
        "marca": "oceane",
        "nome": name,
        "subtitulo": None,
        "categoria": categoria,
        "quantidade": quantidade,
        "preco": final_price,
        "beneficios": beneficios,
        "ingredientes": ingredientes,
        "tipo_pele": tipo_pele,
        "imagem": img_name,
    }
    return rec


## Extração

In [None]:
def get_html(url):
    """GET `url` and return the body text, or None on any failure.

    None is returned for request errors, non-200 responses and empty bodies.
    """
    try:
        with session() as http:
            response = http.get(url, timeout=TIMEOUT)
            body = response.text if response.status_code == 200 else None
    except requests.exceptions.RequestException:
        return None
    return body or None

def _product_url_from_href(href):
    """Return the canonical absolute product URL for `href`, or None if it is not one.

    A product link is an href whose path ends with a "/<slug>/p" segment;
    query string and fragment are dropped so variants of the same product
    collapse to a single URL.
    """
    if not href or href.startswith("#"):
        return None
    if any(x in href for x in ["/account", "/cart", "/login", "/busca", "whatsapp"]):
        return None
    parts = urlparse(urljoin(BASE, href))
    path = parts.path.rstrip("/")
    # Require the whole last segment to be "p"; a bare `"/p" in href` test would
    # also accept paths such as "/perfumaria" or "/promo".
    if not re.search(r"/[^/]+/p$", path):
        return None
    return f"{parts.scheme}://{parts.netloc}{path}"


def list_urls_from_html(max_pages=50):
    """Collect product page URLs by walking the HTML listing pages.

    Pages are requested as LISTING_PATH, then LISTING_PATH?page=2, 3, ... The
    walk stops when a page cannot be fetched, when a page after the third adds
    fewer than 3 new URLs, or after `max_pages` pages.

    Args:
        max_pages: Upper bound on listing pages requested, so the loop always
            terminates even if the site keeps serving new links.

    Returns:
        List of unique absolute product URLs, each ending in "/p".
    """
    all_urls = set()
    page = 1
    while page <= max_pages:
        url = BASE + (LISTING_PATH if page == 1 else f"{LISTING_PATH}?page={page}")
        html = get_html(url)
        if not html:
            break
        soup = BeautifulSoup(html, "html.parser")

        before = len(all_urls)
        # Scanning every anchor already covers anchors nested inside product
        # cards, so no separate per-card pass is needed.
        for a in soup.find_all("a", href=True):
            product_url = _product_url_from_href(a["href"])
            if product_url:
                all_urls.add(product_url)
        added = len(all_urls) - before
        if page > 3 and added < 3:
            break
        page += 1
        time.sleep(0.4)
    return list(all_urls)

def parse_name_from_html(soup, fallback):
    """Extract the product name from a parsed product page.

    Sources are tried in order: the productBrand span, the first h1/span/div
    whose class matches productBrand|productName|name, the <title> (with any
    " | ..." suffix removed, accepted when longer than 3 characters), and the
    "name" of a JSON-LD Product entry. `fallback` is returned when all fail.
    """
    brand_span = soup.select_one("span.vtex-store-components-3-x-productBrand")
    if brand_span:
        return brand_span.get_text(strip=True)

    named = soup.find(["h1","span","div"], class_=re.compile(r"productBrand|productName|name", re.I))
    if named:
        return named.get_text(strip=True)

    if soup.title and soup.title.string:
        title = re.sub(r" \| .*$", "", soup.title.string.strip())
        if len(title) > 3:
            return title

    # JSON-LD: look for a schema.org Product entry carrying a name.
    for script in soup.find_all("script", type=re.compile(r"ld\+json", re.I)):
        try:
            payload = json.loads(script.string or script.get_text() or "{}")
            entries = payload if isinstance(payload, list) else [payload]
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                if entry.get("@type") == "Product" and entry.get("name"):
                    return entry["name"]
        except Exception:
            # Malformed script payloads are ignored; the next script is tried.
            continue
    return fallback

def parse_prices_from_html(soup):
    """Read list and selling prices from a parsed product page.

    Each price container is read first through its currencyInteger /
    currencyFraction spans and, failing that, through a regex over its text.

    Args:
        soup: Parsed page (BeautifulSoup-like object offering `find`).

    Returns:
        Tuple `(final, list_price, selling)` of dot-decimal strings (or None),
        where `final` is the selling price when available, else the list price.
    """
    def _digits(el):
        """Digits of an element's text, dropping separators and spacing."""
        return re.sub(r"\D", "", el.get_text(strip=True))

    def get_p(cont):
        if not cont:
            return None
        # The integer part may be rendered as several currencyInteger spans (one
        # per thousands group), so gather every one preceding the first
        # currencyFraction span instead of reading only the first.
        whole_parts = []
        for part in cont.find_all("span", class_=re.compile(r"currency(?:Integer|Fraction)")):
            css = part.get("class") or []
            css = css if isinstance(css, str) else " ".join(css)
            if "currencyFraction" in css:
                whole, cents = "".join(whole_parts), _digits(part)
                if whole and cents:
                    return f"{whole}.{cents}"
                break
            whole_parts.append(_digits(part))
        txt = cont.get_text(" ", strip=True)
        # `(?!\d)` keeps "1.299" from being read as the price "1.29".
        m = re.search(r"(\d{1,3}(?:\.\d{3})*,\d{2}(?!\d)|\d+[.,]\d{2}(?!\d))", txt)
        if not m:
            return None
        raw = m.group(1)
        if "," in raw:
            # Brazilian format: "." groups thousands, "," marks the decimals.
            return raw.replace(".", "").replace(",", ".")
        # Already dot-decimal ("49.90"); stripping the dot would give "4990".
        return raw

    list_c = soup.find("span", class_=re.compile(r"vtex-store-components-3-x-listPriceValue"))
    sell_c = soup.find("div", class_=re.compile(r"vtex-store-components-3-x-price_sellingPriceContainer"))
    sell_s = soup.find("span", class_=re.compile(r"vtex-store-components-3-x-sellingPriceValue"))
    list_price = get_p(list_c)
    selling = get_p(sell_c) or get_p(sell_s)
    final = selling or list_price
    return final, list_price, selling

def parse_specs_from_html(soup):
    """Return the specification text of a product page as one normalised string.

    Text is taken from the specifications-tabs container and then the
    specifications-mini container (when present and non-empty), joined with a
    space and with whitespace runs collapsed. Returns "" when neither exists.
    """
    container_patterns = (
        r"vtex-store-components-3-x-content--specifications-tabs",
        r"vtex-store-components-3-x-specificationsTab--specifications-mini",
    )
    chunks = []
    for pattern in container_patterns:
        box = soup.find("div", class_=re.compile(pattern))
        if not box:
            continue
        chunk = box.get_text(" ", strip=True)
        if chunk:
            chunks.append(chunk)
    return re.sub(r"\s+", " ", " ".join(chunks)).strip()

def parse_image_from_html(soup):
    """Find the main product image URL in a parsed product page.

    Sources, in order: `src` of the productImageTag <img>, the last URL listed
    in its `srcset`, the og:image <meta> content, and the `image` field of a
    JSON-LD Product entry (last item when it is a list). Returns None when
    none of them yields a value.
    """
    tag = soup.find("img", class_=re.compile(r"vtex-store-components-3-x-productImageTag"))
    if tag:
        if tag.get("src"):
            return tag["src"]
        if tag.get("srcset"):
            # srcset entries look like "<url> <width>w"; the last one is returned.
            candidates = re.findall(r"(https?://[^\s,]+)\s+\d+w", tag["srcset"])
            if candidates:
                return candidates[-1]

    og_image = soup.find("meta", property="og:image")
    if og_image and og_image.get("content"):
        return og_image["content"]

    for script in soup.find_all("script", type=re.compile(r"ld\+json", re.I)):
        try:
            payload = json.loads(script.string or script.get_text() or "{}")
            entries = payload if isinstance(payload, list) else [payload]
            for entry in entries:
                if not (isinstance(entry, dict) and entry.get("@type") == "Product"):
                    continue
                image = entry.get("image")
                if isinstance(image, list) and image:
                    return image[-1]
                if isinstance(image, str):
                    return image
        except Exception:
            # Malformed script payloads are ignored; the next script is tried.
            continue
    return None

def download_image(url_img, slug):
    """Download an image into IMAGES_DIR and return the saved file name.

    Args:
        url_img: Absolute image URL; falsy values are ignored.
        slug: Base file name (without extension) for the saved file.

    Returns:
        The file name "<slug>.<ext>" (no directory), or None when the URL is
        missing, the response is not a usable image, or the request / file
        write fails.
    """
    if not url_img:
        return None
    # Extension by declared MIME type; unknown types keep the previous "png" default.
    ext_by_mime = {
        "image/jpeg": "jpg",
        "image/jpg": "jpg",
        "image/pjpeg": "jpg",
        "image/png": "png",
        "image/webp": "webp",
        "image/gif": "gif",
        "image/avif": "avif",
        "image/svg+xml": "svg",
    }
    try:
        with session() as s:
            r = s.get(url_img, timeout=TIMEOUT, headers={"Referer": BASE, **HEADERS})
            if r.status_code != 200 or not r.content:
                return None
            mime = (r.headers.get("Content-Type") or "").split(";")[0].strip().lower()
            # A textual body (e.g. an HTML error page served with 200) is not an image.
            if mime.startswith("text/"):
                return None
            ext = ext_by_mime.get(mime, "png")
            os.makedirs(IMAGES_DIR, exist_ok=True)
            path = os.path.join(IMAGES_DIR, f"{slug}.{ext}")
            with open(path, "wb") as f:
                f.write(r.content)
            return os.path.basename(path)
    except (requests.exceptions.RequestException, OSError):
        # Best effort: a failed download or write leaves the record without an image.
        return None

def record_from_product_page(url):
    """Scrape one product page and build the notebook's output record.

    Returns None when the page cannot be fetched, when the product is excluded
    by name or URL, or when no name, price, image or specs could be read.
    Otherwise returns a dict with the keys marca, nome, subtitulo, categoria,
    quantidade, preco, beneficios, ingredientes, tipo_pele and imagem.
    """
    html = get_html(url)
    if not html:
        return None

    soup = BeautifulSoup(html, "html.parser")
    name = parse_name_from_html(soup, None)
    if should_exclude(name, url):
        return None

    final_price, _list_price, _selling_price = parse_prices_from_html(soup)
    specs = parse_specs_from_html(soup)
    image_url = parse_image_from_html(soup)

    product_name = name or ""
    quantidade = extract_quantity(product_name)
    categoria = identify_category(product_name)
    beneficios = identify_benefits(specs)
    ingredientes = identify_ingredients(specs)
    tipo_pele = identify_skin_types(specs)

    # Specs mentioning "olheiras" always add the "com olheiras" label.
    if "olheiras" in (specs or "").lower():
        if not tipo_pele:
            tipo_pele = "com olheiras"
        elif "com olheiras" not in tipo_pele:
            tipo_pele = (tipo_pele + "; com olheiras").strip("; ")

    if not any([name, final_price, image_url, specs]):
        return None

    if image_url:
        img_name = download_image(image_url, slugify(name or "produto"))
    else:
        img_name = None

    record = {
        "marca": "oceane",
        "nome": name,
        "subtitulo": None,
        "categoria": categoria,
        "quantidade": quantidade,
        "preco": final_price,
        "beneficios": beneficios,
        "ingredientes": ingredientes,
        "tipo_pele": tipo_pele,
        "imagem": img_name
    }
    return record


## Arquivo e Main

In [None]:
def main():
    """Run the scraping pipeline: API first, HTML fallback, de-duplicate, save JSON."""
    os.makedirs(IMAGES_DIR, exist_ok=True)

    # 1) Try the VTEX API.
    print("\nBuscando pela API VTEX")
    api_products = fetch_vtex_products()
    print(f"\n API retornou {len(api_products)} itens brutos")

    results: List[Dict] = []
    for raw_product in api_products:
        record = product_to_record_from_api(raw_product)
        if record and not should_exclude(record.get("nome")):
            results.append(record)

    # 2) If the API produced few records, scrape the HTML pages as well.
    if len(results) < 50:
        print(" HTML (fallback)…")
        urls = list_urls_from_html()
        print(f" Quantidade: {len(urls)} URLs")
        for product_url in urls:
            record = record_from_product_page(product_url)
            if record:
                results.append(record)
            # Random pause between product pages.
            time.sleep(random.uniform(0.3, 0.7))

    # 3) De-duplicate by normalised name (or image name); the first record wins.
    unique = {}
    for record in results:
        key = norm(record.get("nome")) or record.get("imagem") or str(record)
        unique.setdefault(key, record)
    results = list(unique.values())

    # 4) Save and print a summary.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=2)

    print("\n____________________________________________________________________________________________________________")
    print("\nFim da Execução!")
    print(f"Total de produtos salvos: {len(results)}")
    print(f"JSON: {OUTPUT_JSON}")
    print(f"Imagens em: {IMAGES_DIR}")
    print("____________________________________________________________________________________________________________")


# Entry point: print a banner, then run the whole scraping pipeline via main().
if __name__ == "__main__":


    print("____________________________________________________________________________________________________________")
    print("WEB SCRAPING Oceane")
    print("____________________________________________________________________________________________________________")

    main()


## Conversão JSON para CSV

In [4]:
def json_to_csv(json_file="oceane_products.json", csv_file="oceane_products.csv"):
    """Convert the scraped JSON records into a CSV file with a fixed column set.

    Missing or falsy fields are written as empty strings and keys outside the
    column list are ignored. Nothing is written when the JSON holds no data.
    Errors are reported with `print`, never raised; when the JSON file is
    missing, the `.json` files in the current directory are listed.
    """
    cols = ["marca", "nome", "subtitulo", "categoria", "quantidade", "preco", "beneficios", "ingredientes", "tipo_pele", "imagem"]

    try:
        with open(json_file, "r", encoding="utf-8") as source:
            data = json.load(source)

        if not data:
            print(f"Nenhum dado encontrado no arquivo {json_file}")
            return

        with open(csv_file, "w", encoding="utf-8", newline="") as target:
            writer = csv.DictWriter(target, fieldnames=cols)
            writer.writeheader()
            for record in data:
                writer.writerow({col: (record.get(col) or "") for col in cols})

        print(f"CSV gerado: {csv_file} ({len(data)} linhas)")
        print(f"A partir do JSON: {json_file}")

    except FileNotFoundError:
        print(f" Arquivo {json_file} não encontrado!")

        import glob
        candidates = glob.glob("*.json")
        if not candidates:
            print("   Nenhum arquivo .json encontrado")
        for candidate in candidates:
            print(f"   - {candidate}")
    except Exception as e:
        print(f"Erro ao converter JSON para CSV: {e}")


# Convert the scraped JSON; the CSV path falls back to the function's default csv_file.
json_to_csv("oceane_products.json")  

CSV gerado: oceane_products.csv (32 linhas)
A partir do JSON: oceane_products.json
