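# Ollie

Scraper da loja Ollie (meuollie.com.br): percorre a coleção de produtos, extrai nome, preço, categoria, benefícios, ingredientes e tipos de pele de cada página de produto e grava o resultado em CSV e JSON.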

In [1]:
import os, re, csv, json, time, random, logging, unicodedata, sys
from urllib.parse import urljoin, urlencode, urlparse
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry
import pandas as pd

sys.path.append(os.path.abspath("/home/usuario/Área de trabalho/Dados/models"))

from skin import (
    SKIN_TYPE_CANONICAL_ORDER,
    SKIN_TYPE_SYNONYMS_PT,
)
from exclude import EXCLUDE_KEYWORDS
from ingredient import INGREDIENTES_VALIDOS
from benefits import BENEFIT_SYNONYMS_PT, BENEFIT_CANONICAL_ORDER
from category import CATEGORY_CANONICAL_ORDER, CATEGORY_HINTS

  from pandas.core import (


## Informações Iniciais

In [2]:
# Configure the root logger once: INFO level, "timestamp | level | message" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
# Named logger for this notebook.
logger = logging.getLogger("ollie")

# Site root; relative hrefs found in the pages are resolved against it with urljoin.
BASE_URL = "https://meuollie.com.br"
# Path of the collection page that lists the products (joined onto BASE_URL).
COLLECTION_PATH = "/collections/loja-produtos-ollie"
# Browser-like request headers installed on the HTTP session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
}

## Utilitários

In [3]:
def _strip_accents_lower(s: str) -> str:
    if not s: return ""
    s = s.strip().lower()
    s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
    return re.sub(r"\s+", " ", s).strip()

def norm_space(t):
    """Collapse every whitespace run in ``t`` to one space and trim the ends.

    Falsy input yields an empty string.
    """
    return re.sub(r"\s+", " ", t).strip() if t else ""

def strip_accents(s: str) -> str:
    """Return ``s`` with combining accent marks removed (case and spacing untouched).

    Falsy input yields an empty string.
    """
    if not s:
        return ""
    kept = []
    for ch in unicodedata.normalize("NFD", s):
        # Category "Mn" is the combining mark split off by NFD decomposition.
        if unicodedata.category(ch) == "Mn":
            continue
        kept.append(ch)
    return "".join(kept)

def norm_text_for_match(t: str) -> str:
    """Normalise text for substring matching: lowercase, no accents, single spaces.

    ``None`` is treated as an empty string. Leading/trailing whitespace is
    collapsed to a single space but not removed.
    """
    lowered = (t or "").lower()
    return re.sub(r"\s+", " ", strip_accents(lowered))

def format_brl_price(num_str: str) -> str:
    """Format a numeric price string with exactly two decimals and a dot separator.

    Accepts either a dot-decimal value ("59.9") or a Brazilian-style value with
    a decimal comma ("59,9", "1.234,56"). When a comma is present, dots are
    treated as thousands separators and removed before parsing.

    Args:
        num_str: Price text; falsy values yield "".

    Returns:
        The value as "<int>.<2 decimals>", or ``num_str`` unchanged when it
        cannot be parsed as a number.
    """
    if not num_str:
        return ""
    try:
        cleaned = num_str.strip()
        if "," in cleaned:
            # Decimal comma present, so any dot can only be a thousands separator.
            cleaned = cleaned.replace(".", "")
        value = float(cleaned.replace(",", "."))
    except (AttributeError, TypeError, ValueError):
        # Not a string or not numeric: hand the input back untouched.
        return num_str
    return f"{value:.2f}"

def norm_price(t):
    """Extract the first price from Brazilian-formatted text as a dot-decimal string.

    The "R$" marker is removed, dots (thousands separators) are dropped and the
    decimal comma becomes a dot; the first well-formed number is then returned.
    Requiring digits on both sides of the dot prevents stray punctuation in the
    surrounding text (e.g. the comma in "Preço, R$ 10,00") from being returned
    as the price.

    Args:
        t: Raw text such as "R$ 1.299,90"; falsy values yield "".

    Returns:
        A string like "1299.90", or "" when no number is present.
    """
    if not t:
        return ""
    cleaned = t.replace("R$", "").strip().replace(".", "").replace(",", ".")
    match = re.search(r"\d+(?:\.\d+)?", cleaned)
    return match.group(0) if match else ""

def slugify(text):
    """Turn free text into a lowercase, hyphen-separated ASCII slug.

    Accents are removed via Unicode decomposition, characters other than
    ``a-z``, ``0-9``, ``-``, ``_`` and whitespace are dropped, every whitespace
    run (spaces, tabs, newlines, NBSP) becomes a single hyphen, and repeated or
    leading/trailing hyphens are trimmed.

    Args:
        text: Source text; falsy values yield "".

    Returns:
        The slug, possibly empty.
    """
    if not text:
        return ""
    lowered = text.lower().strip()
    # NFD splits accented letters into base letter + combining mark ("Mn").
    ascii_like = "".join(
        ch for ch in unicodedata.normalize("NFD", lowered)
        if unicodedata.category(ch) != "Mn"
    )
    cleaned = re.sub(r"[^a-z0-9\-_\s]", "", ascii_like)
    hyphenated = re.sub(r"\s+", "-", cleaned)
    return re.sub(r"-+", "-", hyphenated).strip("-")

def get_image_filename(url):
    """Return the lowercased file name found in the path of ``url``.

    The query string and fragment are ignored. Returns "" for a falsy URL or a
    URL whose path has no final component.
    """
    if url:
        basename = os.path.basename(urlparse(url).path)
        if basename:
            return basename.lower()
    return ""

## Categorias

In [4]:
# Rank of each category: its position in CATEGORY_CANONICAL_ORDER (lower wins).
_CATEGORY_ORDER_MAP = {category: rank for rank, category in enumerate(CATEGORY_CANONICAL_ORDER)}

def classify_category_from_name(name: str, description: str | None = None) -> str | None:
    """Pick a category for a product from keyword hints in its name/description.

    Name and description are joined and normalised (lowercase, no accents). A
    category matches when any of its hints in ``CATEGORY_HINTS`` occurs as a
    substring of that text. Among the matches, the one ranked first in
    ``_CATEGORY_ORDER_MAP`` is returned; categories missing from the map rank last.

    Returns:
        The chosen category, or ``None`` when no hint matches.
    """
    haystack = _strip_accents_lower(f"{name or ''} {description or ''}")
    matched = [
        category
        for category, needles in CATEGORY_HINTS.items()
        if any(_strip_accents_lower(needle) in haystack for needle in needles)
    ]
    if not matched:
        return None
    # min() keeps the first of equally ranked categories, like a stable sort would.
    return min(matched, key=lambda category: _CATEGORY_ORDER_MAP.get(category, 10_000))

## Sessões

In [5]:
def make_session(max_retries=3, backoff=0.5, timeout=20):
    """Build a ``requests.Session`` with retrying adapters and the module headers.

    GET requests are retried up to ``max_retries`` times (connect, read and the
    listed 429/5xx statuses) with ``backoff`` as the backoff factor. ``timeout``
    is stored on the session as a ``timeout`` attribute, which the fetch helpers
    in this notebook read and pass to each request.

    Returns:
        The configured session.
    """
    retry_policy = Retry(
        total=max_retries,
        connect=max_retries,
        read=max_retries,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"]),
        raise_on_status=False,
    )
    # One adapter instance is shared by both schemes.
    adapter = HTTPAdapter(max_retries=retry_policy, pool_connections=10, pool_maxsize=10)

    session = requests.Session()
    for scheme in ("http://", "https://"):
        session.mount(scheme, adapter)
    session.headers.update(HEADERS)
    session.timeout = timeout
    return session

def fetch_html(session, url, delay_range=(0.6,1.1)):
    """GET ``url`` after a random polite delay and return its body and status.

    Args:
        session: Object exposing ``get(url, timeout=...)``. If it carries a
            ``timeout`` attribute (as sessions from ``make_session`` do) that
            value is used; otherwise 20 seconds is used, so a plain
            ``requests.Session`` also works.
        url: Address to fetch.
        delay_range: ``(min, max)`` seconds for the random sleep before the request.

    Returns:
        ``(text, status_code)``. Statuses >= 400 are logged as warnings but the
        body is still returned; callers decide what to do with it.
    """
    time.sleep(random.uniform(*delay_range))
    timeout = getattr(session, "timeout", 20)
    response = session.get(url, timeout=timeout)
    if response.status_code >= 400:
        logger.warning("HTTP %s em %s", response.status_code, url)
    return response.text, response.status_code

## Paginação e Listagem

In [6]:
def build_listing_url(page=1, phcursor=None):
    """Build the collection listing URL for a page and an optional cursor.

    ``page`` is added to the query string only when greater than 1, and
    ``phcursor`` only when truthy; with neither, the bare collection URL is returned.
    """
    query = {}
    if page and page > 1:
        query["page"] = str(page)
    if phcursor:
        query["phcursor"] = phcursor

    listing = urljoin(BASE_URL, COLLECTION_PATH)
    if not query:
        return listing
    return listing + "?" + urlencode(query)

def parse_listing(html):
    """Extract product links and a next-page hint from a listing page.

    Returns:
        ``(urls, next_url)`` where ``urls`` are the absolute product URLs in
        document order without duplicates, and ``next_url`` is the absolute
        address of the next page or ``None``.
    """
    soup = BeautifulSoup(html, "html.parser")

    hrefs = (anchor.get("href") or "" for anchor in soup.select("a[href*='/products/']"))
    absolute = [urljoin(BASE_URL, href) for href in hrefs if "/products/" in href]
    # dict.fromkeys drops repeats while keeping first-seen order.
    unique_urls = list(dict.fromkeys(absolute))

    next_url = None
    forward = soup.find("a", attrs={"title": lambda value: value and "Avançar" in value})
    if forward and forward.get("href"):
        next_url = urljoin(BASE_URL, forward["href"])
    if not next_url:
        # Fallback: first link with "?page=" in its href, whichever page it targets.
        paged = soup.select_one("a[href*='?page=']")
        if paged is not None:
            next_url = urljoin(BASE_URL, paged.get("href"))

    return unique_urls, next_url

def scrape_listing(session, page_cap=20):
    """Walk the paginated collection and return every distinct product URL.

    Starts at page 1 and follows the next-page hint from each listing (or the
    computed URL for the following page number when no hint is found). Stops
    after ``page_cap`` pages, or earlier when a page has no products, repeats
    the previous page's product set, or adds no new URL.

    Returns:
        Product URLs in first-seen order.
    """
    collected = []
    known = set()
    previous_page = set()
    page = 1
    target = build_listing_url(page=1)

    while target and page <= page_cap:
        html, _status = fetch_html(session, target)
        page_urls, hinted_next = parse_listing(html)
        logging.info("Página %d | %d produtos", page, len(page_urls))

        if not page_urls:
            logging.info("Sem produtos. Encerrando.")
            break

        current_page = set(page_urls)
        if current_page == previous_page:
            logging.info("Página repetida. Encerrando.")
            break
        previous_page = current_page

        # URLs on this page that were not collected yet, each taken once.
        fresh = [u for u in dict.fromkeys(page_urls) if u not in known]
        known.update(fresh)
        collected.extend(fresh)
        if not fresh:
            logging.info("Nenhum novo produto. Encerrando.")
            break

        page += 1
        target = hinted_next or build_listing_url(page=page)

    return collected

def _collect_section_text(soup, anchors=("PRINCIPAIS BENEFÍCIOS","BENEFÍCIOS","BENEFICIOS","RESULTADOS","POR QUE AMAR")):
    """Gather the text that follows headings whose title contains an anchor phrase.

    For every ``b``/``strong``/``h1``-``h3`` element whose uppercased text
    contains one of ``anchors``, up to 12 following siblings of the element's
    parent are visited: text of ``p``/``div``/``span``/``ul``/``ol``/``li``
    siblings is collected, and the walk stops at the next heading-like sibling.

    Args:
        soup: Parsed document.
        anchors: Uppercase phrases identifying the headings of interest.

    Returns:
        The collected section texts joined by spaces, or the whole document
        text (whitespace-normalised) when no section produced any text.
    """
    content_tags = ("p", "div", "span", "ul", "ol", "li")
    heading_tags = ("h1", "h2", "h3", "strong", "b")

    sections = []
    for heading in soup.find_all(["b", "strong", "h1", "h2", "h3"]):
        title = norm_space(heading.get_text()).upper()
        if not any(anchor in title for anchor in anchors):
            continue
        node = heading.parent if heading.parent else heading
        pieces = []
        for _ in range(12):
            node = node.find_next_sibling()
            if not node:
                break
            if node.name in content_tags:
                pieces.append(norm_space(node.get_text(" ")))
            elif node.name in heading_tags:
                break
        if pieces:
            sections.append(" ".join(pieces))

    if sections:
        return " ".join(sections)
    # Whole-document text is only needed (and only computed) for this fallback.
    return norm_space(soup.get_text(" "))


## Benefícios, Ingredientes e Tipos de pele

In [7]:

def extract_benefits(soup):
    """Return the canonical benefits mentioned in the page's benefits section.

    The section text comes from ``_collect_section_text``; a benefit counts as
    found when any of its synonyms in ``BENEFIT_SYNONYMS_PT`` occurs in that
    text after normalisation.

    Returns:
        Found benefits joined by ", " in ``BENEFIT_CANONICAL_ORDER`` order, or,
        when none is found, the first 220 characters of the section text.
    """
    section_text = _collect_section_text(soup)
    haystack = norm_text_for_match(section_text)
    matched = {
        canonical
        for canonical, synonyms in BENEFIT_SYNONYMS_PT.items()
        if any(syn and norm_text_for_match(syn) in haystack for syn in synonyms)
    }
    if matched:
        return ", ".join(b for b in BENEFIT_CANONICAL_ORDER if b in matched)
    return norm_space(section_text)[:220]

def extract_skin_types(soup):
    """Return the canonical skin types mentioned on the product page.

    The text searched is the content following the first heading-like element
    (``b``/``strong``/``h1``-``h3``) whose uppercased title contains a skin-type
    anchor and which yields any text (up to 10 siblings of its parent, stopping
    at the next heading-like sibling). If that text is empty, the whole
    document text is used instead.

    Returns:
        Found skin types joined by ", " in ``SKIN_TYPE_CANONICAL_ORDER`` order;
        otherwise the first 200 characters of the searched text, followed by
        "..." when it was truncated.
    """
    anchors = ("PARA QUAIS TIPOS DE PELE","TIPO DE PELE","TIPOS DE PELE","PELE")
    content_tags = ("p", "div", "span", "ul", "ol", "li")
    heading_tags = ("h1", "h2", "h3", "strong", "b")

    section_text = ""
    for heading in soup.find_all(["b", "strong", "h1", "h2", "h3"]):
        title = norm_space(heading.get_text()).upper()
        if not any(anchor in title for anchor in anchors):
            continue
        node = heading.parent if heading.parent else heading
        pieces = []
        for _ in range(10):
            node = node.find_next_sibling()
            if not node:
                break
            if node.name in content_tags:
                pieces.append(norm_space(node.get_text(" ")))
            elif node.name in heading_tags:
                break
        if pieces:
            section_text = " ".join(pieces)
            break

    if not section_text:
        section_text = norm_space(soup.get_text(" "))

    haystack = norm_text_for_match(section_text)
    matched = {
        canonical
        for canonical, synonyms in SKIN_TYPE_SYNONYMS_PT.items()
        if any(syn and norm_text_for_match(syn) in haystack for syn in synonyms)
    }
    if matched:
        return ", ".join(c for c in SKIN_TYPE_CANONICAL_ORDER if c in matched)

    suffix = "..." if len(section_text) > 200 else ""
    return section_text[:200] + suffix

def extract_active_ingredients(soup):
    """Return the known ingredients mentioned on the product page.

    Text is taken from, in order of preference:
      1. the content after the first heading-like element whose title contains
         "ATIVOS" (up to 12 siblings of its parent, stopping at the next
         heading-like sibling);
      2. the first ``<p>`` after a heading mentioning composition/ingredients;
      3. the whole document text.

    Returns:
        Entries of ``INGREDIENTES_VALIDOS`` found in that text, joined by ", "
        and sorted accent- and case-insensitively; otherwise the first 300
        characters of the text, followed by "..." when it was truncated.
    """
    heading_selector = ["b", "strong", "h1", "h2", "h3"]
    content_tags = ("p", "div", "span", "ul", "ol", "li")
    heading_tags = ("h1", "h2", "h3", "strong", "b")
    active_markers = ("PRINCIPAIS ATIVOS", "ATIVOS")
    composition_markers = ("COMPOSIÇÃO", "COMPOSICAO", "INGREDIENTES")

    source_text = ""

    # 1) Section introduced by an "ATIVOS" heading.
    for heading in soup.find_all(heading_selector):
        title = norm_space(heading.get_text()).upper()
        if not any(marker in title for marker in active_markers):
            continue
        node = heading.parent if heading.parent else heading
        pieces = []
        for _ in range(12):
            node = node.find_next_sibling()
            if not node:
                break
            if node.name in content_tags:
                pieces.append(norm_space(node.get_text(" ")))
            elif node.name in heading_tags:
                break
        if pieces:
            source_text = " ".join(pieces)
            break

    # 2) First paragraph after a composition/ingredients heading.
    if not source_text:
        for heading in soup.find_all(heading_selector):
            title = norm_space(heading.get_text()).upper()
            if not any(marker in title for marker in composition_markers):
                continue
            paragraph = heading.find_next("p")
            if paragraph:
                source_text = norm_space(paragraph.get_text(" "))
                break

    # 3) Whole page as a last resort.
    if not source_text:
        source_text = norm_space(soup.get_text(" "))

    haystack = norm_text_for_match(source_text)
    matched = {ing for ing in INGREDIENTES_VALIDOS if norm_text_for_match(ing) in haystack}
    if matched:
        ordered = sorted(matched, key=lambda item: strip_accents(item).lower())
        return ", ".join(ordered)

    suffix = "..." if len(source_text) > 300 else ""
    return source_text[:300] + suffix

## Imagem

In [8]:
def extract_image_best(soup):
    """Return the absolute URL of the best product image found in the page.

    Preference order:
      1. the ``og:image`` meta tag, when it has non-empty content;
      2. the ``srcset`` candidate with the largest ``<n>w`` width descriptor
         across all ``<img>`` tags;
      3. the first ``<img>`` ``src`` seen while no width-described candidate
         has been found yet. This also covers images whose ``srcset`` carries
         no width descriptors.

    Protocol-relative URLs ("//host/...") get an "https:" prefix; other relative
    URLs are resolved against ``BASE_URL``.

    Returns:
        The image URL, or "" when the page has no usable image.
    """
    def _absolute(candidate):
        if not candidate:
            return ""
        if candidate.startswith("//"):
            return "https:" + candidate
        return urljoin(BASE_URL, candidate)

    og = soup.select_one('meta[property="og:image"]')
    og_url = (og.get("content") or "").strip() if og else ""
    if og_url:
        return _absolute(og_url)

    best_url, best_width = "", -1
    for img in soup.select("img"):
        srcset = img.get("srcset") or ""
        src = img.get("src") or ""

        for part in srcset.split(","):
            bits = part.strip().split()
            # Only candidates with a width descriptor ("800w") can be ranked.
            if len(bits) < 2 or not bits[1].endswith("w"):
                continue
            try:
                width = int(bits[1][:-1])
            except ValueError:
                continue
            if width > best_width:
                best_width, best_url = width, _absolute(bits[0])

        # Plain src is the lowest-priority candidate: taken only while nothing
        # with a width descriptor has been found.
        if src and best_width < 0:
            best_url, best_width = _absolute(src), 0

    return best_url

def download_image(session, url, dest_dir, slug):
    """Download ``url`` into ``dest_dir`` and return the saved path.

    The file is named ``<slug>`` (plus ``__<width>`` when the URL has a
    ``width=`` query parameter) followed by the extension taken from the URL
    path when it is a known image extension, or ``.jpg`` otherwise.

    Args:
        session: Object exposing ``get(url, timeout=..., headers=...)``. Its
            ``timeout`` attribute is used when present, else 20 seconds.
        url: Image address; a falsy value short-circuits to "".
        dest_dir: Target directory, created if missing.
        slug: Base file name.

    Returns:
        Path of the written file, or "" when nothing was saved (empty URL,
        non-200 response or any error, which is logged as a warning).
    """
    if not url:
        return ""
    os.makedirs(dest_dir, exist_ok=True)

    width_match = re.search(r"[?&]width=(\d+)", url)
    width = width_match.group(1) if width_match else ""

    # Keep the real image type instead of labelling every file as .jpg.
    ext = os.path.splitext(urlparse(url).path)[1].lower()
    if ext not in (".jpg", ".jpeg", ".png", ".webp", ".gif", ".avif"):
        ext = ".jpg"

    fname = f"{slug}__{width}{ext}" if width else f"{slug}{ext}"
    path = os.path.join(dest_dir, fname)
    try:
        response = session.get(url, timeout=getattr(session, "timeout", 20), headers=HEADERS)
        if response.status_code == 200:
            with open(path, "wb") as f:
                f.write(response.content)
            return path
        logger.warning("Falha ao baixar imagem %s (HTTP %s)", url, response.status_code)
    except Exception as e:
        # Best effort: an image failure must not abort the scrape.
        logger.warning("Erro ao baixar imagem %s: %s", url, e)
    return ""

## Produto

In [9]:

def _to_semicolon_list(raw):
    """Split ``raw`` on commas/semicolons and re-join the non-empty items with "; "."""
    parts = (part.strip() for part in raw.replace(",", ";").split(";"))
    return "; ".join(part for part in parts if part)


def parse_product(html, url):
    """Parse a product page into a flat record.

    Args:
        html: Page markup.
        url: Page address (currently unused; kept for the caller's signature).

    Returns:
        ``None`` when the product name contains any of ``EXCLUDE_KEYWORDS``;
        otherwise a dict with the keys ``marca``, ``nome``, ``subtitulo``,
        ``categoria``, ``quantidade``, ``preco``, ``beneficios``,
        ``ingredientes``, ``tipo_pele``, ``imagem`` (image file name) and
        ``_imagem_url`` (full image URL). ``_imagem_url`` is the transient key
        that ``run_scraper`` reads to download the image and then removes; it
        is not among the columns written by ``write_outputs``.
    """
    soup = BeautifulSoup(html, "html.parser")

    title_el = soup.select_one("h1.h2.product-single__title") or soup.select_one("h1.product-single__title")
    nome = norm_space(title_el.get_text()) if title_el else ""

    nome_lower = nome.lower()
    if any(k in nome_lower for k in EXCLUDE_KEYWORDS):
        return None

    price_el = soup.select_one("span.product__price") or soup.select_one("span[data-product-price]")
    preco_num = norm_price(price_el.get_text()) if price_el else ""
    preco_fmt = format_brl_price(preco_num) if preco_num else ""

    beneficios = extract_benefits(soup) or ""
    tipos_de_pele = extract_skin_types(soup) or ""
    ingredientes = extract_active_ingredients(soup) or ""

    categoria = classify_category_from_name(nome)

    imagem_url = extract_image_best(soup)
    imagem_filename = get_image_filename(imagem_url)

    return {
        "marca": "ollie",
        "nome": nome,
        "subtitulo": None,
        "categoria": categoria,
        "quantidade": "",
        "preco": preco_fmt,
        "beneficios": _to_semicolon_list(beneficios),
        "ingredientes": _to_semicolon_list(ingredientes),
        "tipo_pele": _to_semicolon_list(tipos_de_pele),
        "imagem": imagem_filename,
        # Needed by run_scraper to fetch the image; popped before saving.
        "_imagem_url": imagem_url,
    }

## Execução

In [10]:
def write_outputs(products_data, out_csv, out_json):
    """Write the product records to a CSV file and a JSON file.

    Both files contain the same fixed set of columns in the same order; keys
    missing from a record are written as "" and extra keys are ignored. Files
    are UTF-8 encoded and overwritten if they exist.
    """
    csv_columns = [
        "marca","nome","subtitulo","categoria", "quantidade","preco",
        "ingredientes","beneficios","tipo_pele","imagem",
    ]

    # Project each record onto the fixed columns once; reuse for both formats.
    rows = [
        {column: product.get(column, "") for column in csv_columns}
        for product in products_data
    ]

    with open(out_csv, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(rows)

    with open(out_json, "w", encoding="utf-8") as json_file:
        json.dump(rows, json_file, ensure_ascii=False, indent=2)

def run_scraper(out_csv="ollie_products.csv", out_json="ollie_products.json",
                images_dir="images/", max_retries=3, timeout=20, max_products=80):
    """Scrape the collection, download product images and write CSV/JSON outputs.

    Product pages that answer with an HTTP error, products excluded by
    ``parse_product`` and products whose (case-insensitive) name was already
    collected are skipped. Partial results are saved every third URL processed
    and once more at the end.

    Args:
        out_csv: Destination CSV path.
        out_json: Destination JSON path.
        images_dir: Directory for downloaded images.
        max_retries: Retry count passed to ``make_session``.
        timeout: Request timeout passed to ``make_session``.
        max_products: Stop after collecting this many products.

    Returns:
        ``(number_of_products, out_csv, out_json, images_dir)``.
    """
    s = make_session(max_retries=max_retries, timeout=timeout)
    os.makedirs(os.path.dirname(out_csv) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(out_json) or ".", exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    product_urls = scrape_listing(s, page_cap=20)
    logger.info("Total de URLs: %d", len(product_urls))

    products_data = []
    seen_names = set()

    for i, url in enumerate(product_urls, 1):
        if len(products_data) >= max_products:
            logger.info("Limite de %d produtos atingido. Parando.", max_products)
            break

        html, status = fetch_html(s, url)
        if status >= 400:
            # An error page would otherwise be parsed into an empty product.
            logger.warning("Skip (HTTP %s): %s", status, url)
            continue

        data = parse_product(html, url)
        if data is None:
            logger.info("Skip (exclusão): %s", url)
            continue

        nome_key = (data.get("nome") or "").strip().lower()
        if nome_key in seen_names:
            logger.info("Skip (duplicado): %s", nome_key)
            continue
        seen_names.add(nome_key)

        # Use the URL supplied by the parser when present; otherwise recover it
        # from the page so the image download is not silently skipped.
        image_url = data.pop("_imagem_url", "") or extract_image_best(BeautifulSoup(html, "html.parser"))
        slug = slugify(data.get("nome") or os.path.basename(urlparse(url).path))
        download_image(s, image_url, images_dir, slug)

        products_data.append(data)

        if i % 3 == 0:
            write_outputs(products_data, out_csv, out_json)
            logger.info("Parcial salva (%d itens).", len(products_data))

    write_outputs(products_data, out_csv, out_json)
    logger.info("Finalizado: %d itens", len(products_data))
    return len(products_data), out_csv, out_json, images_dir

In [11]:
# Entry point: run the full scrape with the settings below and print a summary.
if __name__ == "__main__":
    # Output locations (relative to the current working directory).
    OUT_CSV = "ollie_products.csv"
    OUT_JSON = "ollie_products.json"
    IMAGES_DIR = "images/"
    # HTTP behaviour and the cap on collected products.
    MAX_RETRIES = 3
    TIMEOUT = 20
    MAX_PRODUCTS = 80

    try:
        n_final, csv_path, json_path, img_dir = run_scraper(
            out_csv=OUT_CSV,
            out_json=OUT_JSON,
            images_dir=IMAGES_DIR,
            max_retries=MAX_RETRIES,
            timeout=TIMEOUT,
            max_products=MAX_PRODUCTS,
        )
        print("\n=== SCRAPING CONCLUÍDO ===")
        print("Itens coletados:", n_final)
        print("CSV:", os.path.abspath(csv_path))
        print("JSON:", os.path.abspath(json_path))
        print("Imagens:", os.path.abspath(img_dir))
    except Exception as e:
        # NOTE(review): only the message is printed, so the traceback is lost;
        # consider logging the exception or re-raising while debugging.
        print("Erro durante o scraping:", e)


2025-09-24 11:30:39,339 | INFO | Página 1 | 33 produtos
2025-09-24 11:30:40,738 | INFO | Página 2 | 6 produtos
2025-09-24 11:30:41,854 | INFO | Página 3 | 33 produtos
2025-09-24 11:30:41,854 | INFO | Nenhum novo produto. Encerrando.
2025-09-24 11:30:41,855 | INFO | Total de URLs: 39
2025-09-24 11:30:44,423 | INFO | Skip (duplicado): glow hidratante facial fps50
2025-09-24 11:30:45,871 | INFO | Skip (duplicado): glow hidratante facial fps50
2025-09-24 11:30:47,281 | INFO | Skip (exclusão): https://meuollie.com.br/collections/loja-produtos-ollie/products/protetor-solar-hidratante-corporal-fps-60
2025-09-24 11:30:49,831 | INFO | Skip (duplicado): protetor solar em bastão com cor fps 95
2025-09-24 11:30:51,104 | INFO | Skip (duplicado): protetor solar em bastão com cor fps 95
2025-09-24 11:30:52,534 | INFO | Skip (duplicado): protetor solar em bastão com cor fps 95
2025-09-24 11:30:53,992 | INFO | Skip (duplicado): protetor solar em bastão com cor fps 95
2025-09-24 11:30:55,413 | INFO | Sk


=== SCRAPING CONCLUÍDO ===
Itens coletados: 10
CSV: /home/usuario/Área de trabalho/Dados/Ollie/ollie_products.csv
JSON: /home/usuario/Área de trabalho/Dados/Ollie/ollie_products.json
Imagens: /home/usuario/Área de trabalho/Dados/Ollie/images
