# Ollie 

In [1]:
import os, re, csv, json, time, random, logging, unicodedata, sys
from urllib.parse import urljoin, urlencode, urlparse
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry
import pandas as pd

sys.path.append(os.path.abspath("./../models"))

from skin import (
    SKIN_TYPE_CANONICAL_ORDER,
    SKIN_TYPE_SYNONYMS_PT,
)
from exclude import EXCLUDE_KEYWORDS
from ingredient import INGREDIENTES_VALIDOS
from benefits import BENEFIT_SYNONYMS_PT, BENEFIT_CANONICAL_ORDER
from category import CATEGORY_CANONICAL_ORDER, CATEGORY_HINTS

### Configurações Iniciais

In [24]:
# Configure the root logger once: INFO level, "timestamp | level | message" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
# Named logger for this scraper.
logger = logging.getLogger("ollie")

# Storefront origin and the collection path used as the listing entry point.
BASE_URL = "https://meuollie.com.br"
COLLECTION_PATH = "/collections/loja-produtos-ollie"
# Default HTTP headers sent with requests (browser-like User-Agent, pt-BR first).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
}

### Utilitários

### Funções auxiliares para normalização de texto, remoção de acentos, tokenização dos ingredientes, nomes de arquivos e formatação de preços.

In [25]:
def strip_accents(input_string: str) -> str:
    """Return *input_string* without combining accent marks.

    The text is decomposed with NFD and every character whose Unicode
    category is ``Mn`` is dropped. Falsy input yields ``""``.
    """
    if not input_string:
        return ""

    kept_chars = []
    for char in unicodedata.normalize("NFD", input_string):
        if unicodedata.category(char) == "Mn":
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

def normalize_text(input_string: str) -> str:
    """Return a lowercase, accent-free, punctuation-free form of the text.

    Steps, in order: strip and lowercase, remove accents, turn hyphens into
    spaces, replace any character that is neither a word character nor
    whitespace with a space, and collapse whitespace runs to single spaces.
    Falsy input yields ``""``.
    """
    if not input_string:
        return ""

    text = strip_accents(input_string.strip().lower()).replace("-", " ")
    text = re.sub(r"[^\w\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

def normalize_space(input_text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    Falsy input yields ``""``.
    """
    return re.sub(r"\s+", " ", input_text).strip() if input_text else ""

def slugify(input_text: str) -> str:
    """Build a lowercase, hyphen-separated slug from *input_text*.

    The text goes through ``normalize_text``; every run of characters outside
    ``a-z0-9`` becomes a single hyphen and leading/trailing hyphens are
    removed. ``"produto"`` is returned when the input is falsy or nothing
    usable remains.
    """
    if not input_text:
        return "produto"

    hyphenated = re.sub(r"[^a-z0-9]+", "-", normalize_text(input_text)).strip("-")
    slug = re.sub(r"-{2,}", "-", hyphenated)
    if slug:
        return slug
    return "produto"

def normalize_price(price_text: str) -> str:
    """Parse a displayed price and return it as a ``"1234.56"`` style string.

    Recognised shapes, tried in this order:

    1. Brazilian format with cents: ``1.234,56``, ``1234,56``, ``89,90``.
    2. Dot-grouped thousands without cents: ``1.299`` -> ``1299.00``.
    3. Plain number with an optional dot decimal: ``129.9``, ``45``.

    Args:
        price_text: Raw text such as ``"R$ 1.234,56"``.

    Returns:
        The price with two decimals and a dot separator, or ``""`` when the
        input is falsy or contains no number.
    """
    if not price_text:
        return ""

    sanitized_text = price_text.replace("R$", "").replace("\xa0", " ").strip()
    sanitized_text = sanitized_text.replace(" ", "")

    # The lookbehind keeps the match from starting in the middle of a digit
    # run; without it "1234,56" was read as "234,56". The integer part is
    # either properly dot-grouped or a plain run of digits.
    brazilian_match = re.search(
        r"(?<!\d)((?:\d{1,3}(?:\.\d{3})+|\d+),\d{2})", sanitized_text
    )
    if brazilian_match:
        decimal_number = brazilian_match.group(1).replace(".", "").replace(",", ".")
        return f"{float(decimal_number):.2f}"

    # A dot followed by exactly three digits is a thousands separator in
    # Brazilian notation ("1.299" is 1299, not 1.29). A leading zero is
    # excluded so values like "0.500" still fall through as decimals.
    thousands_match = re.search(
        r"(?<![\d.,])[1-9]\d{0,2}(?:\.\d{3})+(?![\d.,])", sanitized_text
    )
    if thousands_match:
        return f"{float(thousands_match.group(0).replace('.', '')):.2f}"

    simple_match = re.search(r"(\d+(?:\.\d{1,2})?)", sanitized_text)
    if simple_match:
        return f"{float(simple_match.group(1)):.2f}"

    return ""

def get_image_filename(image_url: str) -> str:
    """Return the lowercase file name found in the path of *image_url*.

    The query string is ignored. ``""`` is returned for falsy input or when
    the path has no final component.
    """
    if not image_url:
        return ""
    return os.path.basename(urlparse(image_url).path).lower()


def safe_join_url(url_input: str) -> str:
    """Prefix protocol-relative URLs (``//host/...``) with ``https:``.

    Any other value, including falsy ones, is returned unchanged.
    """
    if url_input and url_input.startswith("//"):
        return "https:" + url_input
    return url_input

def infer_image_extension(image_url: str) -> str:
    """Return the image extension implied by the URL path.

    ``.png`` and ``.webp`` are returned as-is; ``.jpeg`` is folded into
    ``.jpg``. Falsy input and any other extension yield ``".jpg"``.
    """
    if not image_url:
        return ".jpg"

    extension = os.path.splitext(urlparse(image_url).path)[1].lower()
    # ".jpeg" and every unsupported extension both map to ".jpg".
    if extension in (".jpg", ".png", ".webp"):
        return extension
    return ".jpg"

def sentence_case(input_text: str) -> str:
    """Lowercase the text, then uppercase its first character.

    Falsy input is returned unchanged.
    """
    if not input_text:
        return input_text

    lowered = input_text.lower()
    return lowered[:1].upper() + lowered[1:]

## Categorias

In [26]:
# Position of every category in the canonical order; lower means higher priority.
_CATEGORY_ORDER_MAP = {
    category: position for position, category in enumerate(CATEGORY_CANONICAL_ORDER)
}


def classify_category_from_name(name: str, description: str | None = None) -> str | None:
    """Pick the category whose hints appear in the product name/description.

    Name and description are joined and normalised; a category matches when
    any of its ``CATEGORY_HINTS`` entries (normalised the same way) is a
    substring of that text. When several categories match, the one earliest
    in ``CATEGORY_CANONICAL_ORDER`` wins; categories missing from that order
    rank last. Returns ``None`` when nothing matches.
    """
    haystack = normalize_text(f"{name or ''} {description or ''}")
    matched = [
        category
        for category, needles in CATEGORY_HINTS.items()
        if any(normalize_text(needle) in haystack for needle in needles)
    ]
    if not matched:
        return None
    # min() returns the first minimal element, like a stable sort's head.
    return min(matched, key=lambda category: _CATEGORY_ORDER_MAP.get(category, 10_000))

## Sessões

In [27]:
def make_session(max_retries=3, backoff=0.5, timeout=20):
    """Create a ``requests.Session`` with retries and the default headers.

    Args:
        max_retries: Used for the total, read and connect retry counts.
        backoff: Backoff factor passed to the retry policy.
        timeout: Stored on the session as ``timeout``; the fetch helpers in
            this file pass it explicitly on each request.

    Returns:
        The configured session. GET requests are retried on HTTP 429, 500,
        502, 503 and 504 without raising once retries are exhausted.
    """
    retry_policy = Retry(
        total=max_retries,
        read=max_retries,
        connect=max_retries,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"]),
        raise_on_status=False,
    )
    transport = HTTPAdapter(max_retries=retry_policy, pool_connections=10, pool_maxsize=10)

    session = requests.Session()
    for scheme in ("http://", "https://"):
        session.mount(scheme, transport)
    session.headers.update(HEADERS)
    session.timeout = timeout
    return session

def fetch_html(session, url, delay_range=(0.6,1.1)):
    """GET *url* after a random pause and return ``(body_text, status_code)``.

    The pause is drawn uniformly from *delay_range* (seconds). Responses with
    status 400 or above are logged as warnings but still returned.
    """
    pause_seconds = random.uniform(*delay_range)
    time.sleep(pause_seconds)

    response = session.get(url, timeout=session.timeout)
    status = response.status_code
    if status >= 400:
        logger.warning("HTTP %s em %s", status, url)
    return response.text, status

## Paginação e Listagem

In [None]:
def build_listing_url(page_number=1, pagination_cursor=None):
    """Return the collection listing URL for the given page/cursor.

    ``page`` is only added for page numbers above 1 and ``phcursor`` only
    when a cursor is given; with neither, the bare collection URL is returned.
    """
    query_pairs = []
    if page_number and page_number > 1:
        query_pairs.append(("page", str(page_number)))
    if pagination_cursor:
        query_pairs.append(("phcursor", pagination_cursor))

    listing_url = urljoin(BASE_URL, COLLECTION_PATH)
    if not query_pairs:
        return listing_url
    return listing_url + "?" + urlencode(query_pairs)

def parse_listing(html_content):
    """Extract product URLs and a next-page candidate from listing HTML.

    Returns:
        ``(product_urls, next_page_url)``. ``product_urls`` holds absolute
        URLs of every ``/products/`` link, de-duplicated in document order.
        ``next_page_url`` is the link whose ``title`` contains "Avançar",
        otherwise the first link whose href contains ``?page=``, otherwise
        ``None``.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    hrefs = (anchor.get("href") or "" for anchor in soup.select("a[href*='/products/']"))
    absolute_urls = (urljoin(BASE_URL, href) for href in hrefs if "/products/" in href)
    # dict.fromkeys keeps the first occurrence of each URL, in order.
    unique_urls = list(dict.fromkeys(absolute_urls))

    next_page_url = None
    next_anchor = soup.find("a", attrs={"title": lambda x: x and "Avançar" in x})
    if next_anchor and next_anchor.get("href"):
        next_page_url = urljoin(BASE_URL, next_anchor["href"])

    if not next_page_url:
        first_paging_link = soup.select_one("a[href*='?page=']")
        if first_paging_link:
            next_page_url = urljoin(BASE_URL, first_paging_link.get("href"))

    return unique_urls, next_page_url

def scrape_listing(session, max_pages=20):
    """Walk the collection listing and return every distinct product URL.

    Pages are fetched until one of these happens: ``max_pages`` is reached, a
    page has no products, a page repeats the previous page's product set, or
    a page contributes no URL that was not already collected.

    Args:
        session: Session passed through to ``fetch_html``.
        max_pages: Upper bound on the number of listing pages fetched.

    Returns:
        Product URLs in discovery order, without duplicates.
    """
    collected_urls = []
    processed_urls = set()
    previous_products_set = set()
    current_page = 1
    current_url = build_listing_url(page_number=1)

    while current_url and current_page <= max_pages:
        html_content, _ = fetch_html(session, current_url)
        page_products, suggested_next_url = parse_listing(html_content)

        logger.info("Página %d | %d produtos", current_page, len(page_products))

        if not page_products:
            logger.info("Sem produtos. Encerrando.")
            break

        current_products_set = set(page_products)
        if current_products_set == previous_products_set:
            logger.info("Página repetida. Encerrando.")
            break
        previous_products_set = current_products_set

        new_products_count = 0
        for product_url in page_products:
            if product_url not in processed_urls:
                processed_urls.add(product_url)
                collected_urls.append(product_url)
                new_products_count += 1

        if new_products_count == 0:
            logger.info("Nenhum novo produto. Encerrando.")
            break

        current_page += 1

        # The parser's suggestion may be a "previous page" link (its fallback
        # takes the first "?page=" anchor). Following it would re-fetch an
        # old page, find nothing new and stop before later pages are seen,
        # so a suggestion that points behind the expected page is dropped.
        if suggested_next_url:
            page_match = re.search(r"[?&]page=(\d+)", suggested_next_url)
            if page_match and int(page_match.group(1)) < current_page:
                suggested_next_url = None

        current_url = suggested_next_url or build_listing_url(page_number=current_page)

    return collected_urls

def _collect_section_text(soup, target_anchors=("PRINCIPAIS BENEFÍCIOS","BENEFÍCIOS","BENEFICIOS","RESULTADOS","POR QUE AMAR")):
    """Gather the text that follows headings matching *target_anchors*.

    For every ``b``/``strong``/``h1``-``h3`` element whose upper-cased text
    contains one of the anchors, up to 12 following siblings of the heading's
    parent are visited: text containers contribute their text, and the walk
    stops at the next heading-like sibling. All collected sections are joined
    with spaces. When no section yields text, the whole page text is returned.
    """
    whole_page_text = normalize_space(soup.get_text(" "))
    sections = []

    for heading in soup.find_all(["b","strong","h1","h2","h3"]):
        title = normalize_space(heading.get_text()).upper()
        if not any(anchor in title for anchor in target_anchors):
            continue

        # Siblings are walked from the heading's container when it has one.
        node = heading.parent or heading
        pieces = []
        for _ in range(12):
            node = node.find_next_sibling()
            if not node:
                break
            if node.name in ("p","div","span","ul","ol","li"):
                pieces.append(normalize_space(node.get_text(" ")))
            elif node.name in ("h1","h2","h3","strong","b"):
                break

        if pieces:
            sections.append(" ".join(pieces))

    if sections:
        return " ".join(sections)
    return whole_page_text

## Benefícios, Ingredientes e Tipos de pele

In [None]:
def extract_benefits(soup):
    """Return the product's benefits as a comma-separated string.

    The benefits section text is matched against ``BENEFIT_SYNONYMS_PT``; the
    canonical names that match are emitted in ``BENEFIT_CANONICAL_ORDER``.
    When nothing matches, the first 220 characters of the section text are
    returned instead.
    """
    section_text = _collect_section_text(soup)
    haystack = normalize_text(section_text)

    matched = {
        canonical
        for canonical, synonyms in BENEFIT_SYNONYMS_PT.items()
        if any(synonym and normalize_text(synonym) in haystack for synonym in synonyms)
    }

    if matched:
        return ", ".join(benefit for benefit in BENEFIT_CANONICAL_ORDER if benefit in matched)
    return normalize_space(section_text)[:220]

def extract_skin_types(soup):
    """Return the skin types the product targets as a comma-separated string.

    The first heading-like element whose text contains a skin-type anchor and
    that is followed by text supplies the section (up to 10 siblings of its
    parent, stopping at the next heading-like sibling). Without such a
    section the whole page text is used. Matches against
    ``SKIN_TYPE_SYNONYMS_PT`` are emitted in ``SKIN_TYPE_CANONICAL_ORDER``;
    when nothing matches, the first 200 characters of the text are returned,
    followed by "..." if it was cut.
    """
    anchors = ("PARA QUAIS TIPOS DE PELE","TIPO DE PELE","TIPOS DE PELE","PELE")
    section_text = ""

    for heading in soup.find_all(["b","strong","h1","h2","h3"]):
        title = normalize_space(heading.get_text()).upper()
        if not any(anchor in title for anchor in anchors):
            continue

        node = heading.parent or heading
        pieces = []
        for _ in range(10):
            node = node.find_next_sibling()
            if not node:
                break
            if node.name in ("p","div","span","ul","ol","li"):
                pieces.append(normalize_space(node.get_text(" ")))
            elif node.name in ("h1","h2","h3","strong","b"):
                break

        if pieces:
            section_text = " ".join(pieces)
            break

    if not section_text:
        section_text = normalize_space(soup.get_text(" "))

    haystack = normalize_text(section_text)
    matched = {
        canonical
        for canonical, synonyms in SKIN_TYPE_SYNONYMS_PT.items()
        if any(synonym and normalize_text(synonym) in haystack for synonym in synonyms)
    }

    if matched:
        return ", ".join(kind for kind in SKIN_TYPE_CANONICAL_ORDER if kind in matched)

    ellipsis = "..." if len(section_text) > 200 else ""
    return section_text[:200] + ellipsis

def extract_active_ingredients(soup):
    """Return the product's active ingredients as a comma-separated string.

    Text is taken from the first available source:

    1. The section after a heading containing "ATIVOS" (up to 12 siblings of
       the heading's parent, stopping at the next heading-like sibling).
    2. The first ``<p>`` after a heading mentioning composition/ingredients.
    3. The whole page text.

    Entries of ``INGREDIENTES_VALIDOS`` found in that text are returned
    sorted by their accent-free lowercase form. When none is found, the first
    300 characters of the text are returned, followed by "..." if cut.
    """
    section_text = ""
    headings = soup.find_all(["b","strong","h1","h2","h3"])

    for heading in headings:
        title = normalize_space(heading.get_text()).upper()
        # "PRINCIPAIS ATIVOS" always contains "ATIVOS", so one check suffices.
        if "ATIVOS" not in title:
            continue

        node = heading.parent or heading
        pieces = []
        for _ in range(12):
            node = node.find_next_sibling()
            if not node:
                break
            if node.name in ("p","div","span","ul","ol","li"):
                pieces.append(normalize_space(node.get_text(" ")))
            elif node.name in ("h1","h2","h3","strong","b"):
                break

        if pieces:
            section_text = " ".join(pieces)
            break

    if not section_text:
        composition_keywords = ("COMPOSIÇÃO", "COMPOSICAO", "INGREDIENTES")
        for heading in headings:
            title = normalize_space(heading.get_text()).upper()
            if not any(keyword in title for keyword in composition_keywords):
                continue
            paragraph = heading.find_next("p")
            if paragraph:
                section_text = normalize_space(paragraph.get_text(" "))
                break

    if not section_text:
        section_text = normalize_space(soup.get_text(" "))

    haystack = normalize_text(section_text)
    matched = {
        ingredient
        for ingredient in INGREDIENTES_VALIDOS
        if normalize_text(ingredient) in haystack
    }

    if matched:
        return ", ".join(sorted(matched, key=lambda x: strip_accents(x).lower()))

    ellipsis = "..." if len(section_text) > 300 else ""
    return section_text[:300] + ellipsis

## Imagem

In [None]:
def extract_image_best(soup):
    """Return the best image URL found on the page, or ``""``.

    The ``og:image`` meta tag wins when present. Otherwise every ``<img>`` is
    scanned: among ``srcset`` candidates with a ``w`` descriptor the widest
    one is kept, and a plain ``src`` is only used while no candidate has been
    chosen yet. Protocol-relative URLs get an ``https:`` prefix; other
    ``<img>`` URLs are resolved against ``BASE_URL``.
    """
    og_tag = soup.select_one('meta[property="og:image"]')
    if og_tag and og_tag.get("content"):
        og_url = og_tag["content"]
        return "https:" + og_url if og_url.startswith("//") else og_url

    def absolutize(raw_url):
        # Resolve protocol-relative and site-relative URLs to absolute ones.
        if not raw_url:
            return ""
        if raw_url.startswith("//"):
            return "https:" + raw_url
        return urljoin(BASE_URL, raw_url)

    best_url, best_width = "", -1

    for img in soup.select("img"):
        srcset = img.get("srcset") or ""
        src = img.get("src") or ""

        if srcset:
            for entry in srcset.split(","):
                tokens = entry.strip().split()
                if not tokens:
                    continue

                # Entries without a parsable "<n>w" descriptor keep width -1
                # and therefore never beat an existing candidate.
                width = -1
                if len(tokens) > 1 and tokens[1].endswith("w"):
                    try:
                        width = int(tokens[1][:-1])
                    except ValueError:
                        width = -1

                if width > best_width:
                    best_width = width
                    best_url = absolutize(tokens[0])
        elif src and best_width < 0:
            best_url = absolutize(src)
            best_width = 0

    if not best_url:
        return ""
    if best_url.startswith("//"):
        best_url = "https:" + best_url
    return best_url

def download_image(session, image_url, destination_directory, filename_slug):
    """Download *image_url* into *destination_directory*.

    The file is named ``<slug>.jpg``, or ``<slug>__<width>.jpg`` when the URL
    carries a ``width=<n>`` query parameter. The directory is created when
    missing.

    Returns:
        The path written, or ``""`` when the URL is falsy, the response
        status is not 200, or any exception occurs (failures are logged as
        warnings).
    """
    if not image_url:
        return ""

    os.makedirs(destination_directory, exist_ok=True)

    width_match = re.search(r"[?&]width=(\d+)", image_url)
    width_suffix = f"__{width_match.group(1)}" if width_match else ""
    output_path = os.path.join(destination_directory, f"{filename_slug}{width_suffix}.jpg")

    try:
        response = session.get(image_url, timeout=session.timeout, headers=HEADERS)
        if response.status_code != 200:
            logging.warning("Falha ao baixar imagem %s (HTTP %s)", image_url, response.status_code)
            return ""
        with open(output_path, "wb") as image_file:
            image_file.write(response.content)
        return output_path
    except Exception as download_error:
        logging.warning("Erro ao baixar imagem %s: %s", image_url, download_error)

    return ""

## Produto

In [None]:
def parse_product(html_content, product_url):
    """Parse one product page into the record written to the output files.

    Args:
        html_content: HTML of the product page.
        product_url: URL of the page (not used by the parsing itself).

    Returns:
        A dict with the keys ``marca``, ``nome``, ``subtitulo``,
        ``categoria``, ``quantidade``, ``preco``, ``beneficios``,
        ``ingredientes``, ``tipo_pele``, ``imagem`` and ``_imagem_url``, or
        ``None`` when the lowercased product name contains any entry of
        ``EXCLUDE_KEYWORDS``.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    title_element = (
        soup.select_one("h1.h2.product-single__title")
        or soup.select_one("h1.product-single__title")
    )
    product_name = normalize_space(title_element.get_text()) if title_element else ""

    lowered_name = (product_name or "").lower()
    if any(excluded_keyword in lowered_name for excluded_keyword in EXCLUDE_KEYWORDS):
        return None

    price_element = (
        soup.select_one("span.product__price")
        or soup.select_one("span[data-product-price]")
    )
    product_price = normalize_price(price_element.get_text()) if price_element else ""

    product_benefits = extract_benefits(soup) or ""
    compatible_skin_types = extract_skin_types(soup) or ""
    active_ingredients = extract_active_ingredients(soup) or ""

    product_category = classify_category_from_name(product_name)

    product_image_url = extract_image_best(soup)

    def as_semicolon_list(field_content):
        # Accept "," or ";" as separators and emit a clean "; "-joined list.
        if not field_content:
            return ""
        parts = field_content.replace(",", ";").split(";")
        return "; ".join(part.strip() for part in parts if part.strip())

    return {
        "marca": "ollie",
        "nome": product_name,
        "subtitulo": None,
        "categoria": product_category,
        "quantidade": "",
        "preco": product_price,
        "beneficios": as_semicolon_list(product_benefits),
        "ingredientes": as_semicolon_list(active_ingredients),
        "tipo_pele": as_semicolon_list(compatible_skin_types),
        "imagem": get_image_filename(product_image_url),
        "_imagem_url": product_image_url,
    }

## Execução

In [None]:
def write_json_output(products_data, output_json_path):
    """Write the products to *output_json_path* as an indented UTF-8 JSON list.

    Only the public columns are kept, in a fixed order; a column missing from
    a product is written as ``""``.
    """
    output_columns = (
        "marca", "nome", "subtitulo", "categoria", "quantidade", "preco",
        "ingredientes", "beneficios", "tipo_pele", "imagem",
    )

    rows = []
    for product in products_data:
        rows.append({column: product.get(column, "") for column in output_columns})

    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(rows, json_file, ensure_ascii=False, indent=2)

def run_scraper(output_json="ollie_products.json", images_directory="images/", 
                max_retries=3, timeout=20, max_products=80):
    """Scrape the listing, parse each product, download images, write JSON.

    Products whose parse result is ``None`` and products whose lowercased
    name was already seen are skipped. The JSON file is rewritten whenever
    the 1-based position of a stored product's URL is a multiple of 3, and
    once more at the end.

    Args:
        output_json: Path of the JSON file to write.
        images_directory: Directory that receives the downloaded images.
        max_retries: Retry count for the HTTP session.
        timeout: Request timeout stored on the session.
        max_products: Stop once this many products have been collected.

    Returns:
        ``(number_of_products, output_json, images_directory)``.
    """
    session = make_session(max_retries=max_retries, timeout=timeout)

    os.makedirs(os.path.dirname(output_json) or ".", exist_ok=True)
    os.makedirs(images_directory, exist_ok=True)

    product_urls = scrape_listing(session, max_pages=20)
    logging.info("Total de URLs: %d", len(product_urls))

    products = []
    seen_names = set()

    for position, product_url in enumerate(product_urls, 1):
        if len(products) >= max_products:
            logger.info("Limite de %d produtos atingido. Parando.", max_products)
            break

        page_html, _ = fetch_html(session, product_url)
        record = parse_product(page_html, product_url)
        if record is None:
            logging.info("Skip (exclusão): %s", product_url)
            continue

        name_key = (record.get("nome") or "").strip().lower()
        if name_key in seen_names:
            logging.info("Skip (duplicado): %s", name_key)
            continue
        seen_names.add(name_key)

        # Fall back to the last URL path segment when the page had no title.
        slug_source = record.get("nome") or os.path.basename(urlparse(product_url).path)
        download_image(
            session,
            record.get("_imagem_url", ""),
            images_directory,
            slugify(slug_source),
        )

        # The raw image URL is internal and must not reach the output file.
        record.pop("_imagem_url", None)
        products.append(record)

        if position % 3 == 0:
            write_json_output(products, output_json)
            logging.info("Parcial salva (%d itens).", len(products))

    write_json_output(products, output_json)
    logging.info("Finalizado: %d itens", len(products))

    return len(products), output_json, images_directory

In [33]:
def save_data(products_data: List[Dict]):
    """Save the products as structured JSON in ``ollie_products.json``.

    Only the public fields are written, in a fixed order; a field missing
    from a product is stored as ``null``. Nothing is written when
    *products_data* is empty; a notice is printed instead.
    """
    if not products_data:
        print("\nNenhum dado para salvar.")
        return

    field_names = (
        "marca", "nome", "subtitulo", "categoria", "quantidade",
        "preco", "beneficios", "ingredientes", "tipo_pele", "imagem",
    )
    cleaned_products = [
        {field: product.get(field) for field in field_names}
        for product in products_data
    ]

    with open("ollie_products.json", "w", encoding="utf-8") as json_output:
        json.dump(cleaned_products, json_output, ensure_ascii=False, indent=2)

    print(f"JSON: ollie_products.json ({len(cleaned_products)} produtos)")

# Script entry point: crawl the listing, parse every product page, download
# its image and save the collected records with save_data().
if __name__ == "__main__":
    # Run configuration.
    OUTPUT_JSON = "ollie_products.json"
    IMAGES_DIRECTORY = "images/"
    MAX_RETRIES = 3
    TIMEOUT = 20
    MAX_PRODUCTS = 80

    try:
        session = make_session(max_retries=MAX_RETRIES, timeout=TIMEOUT)
        
        # Make sure both output locations exist before any network work.
        output_directory = os.path.dirname(OUTPUT_JSON) or "."
        os.makedirs(output_directory, exist_ok=True)
        os.makedirs(IMAGES_DIRECTORY, exist_ok=True)

        discovered_product_urls = scrape_listing(session, max_pages=20)
        logging.info("Total de URLs descobertas: %d", len(discovered_product_urls))

        collected_products = []
        # Lowercased names already stored; used to skip repeated products.
        processed_product_names = set()

        for current_index, product_url in enumerate(discovered_product_urls, 1):
            if len(collected_products) >= MAX_PRODUCTS:
                logger.info("Limite de %d produtos atingido. Parando.", MAX_PRODUCTS)
                break

            page_html, _ = fetch_html(session, product_url)
            parsed_product_data = parse_product(page_html, product_url)
            
            # parse_product returns None for names matching the exclusion list.
            if parsed_product_data is None:
                logging.info("Skip (exclusão): %s", product_url)
                continue

            product_name_key = (parsed_product_data.get("nome") or "").strip().lower()
            if product_name_key in processed_product_names:
                logging.info("Skip (duplicado): %s", product_name_key)
                continue
            
            processed_product_names.add(product_name_key)

            # Image file name comes from the product name, or from the last
            # URL path segment when the name is empty.
            filename_slug = slugify(parsed_product_data.get("nome") or os.path.basename(urlparse(product_url).path))
            download_image(
                session, 
                parsed_product_data.get("_imagem_url",""), 
                IMAGES_DIRECTORY, 
                filename_slug
            )

            # The raw image URL is internal and is dropped before saving.
            parsed_product_data.pop("_imagem_url", None)
            collected_products.append(parsed_product_data)

        # NOTE: save_data() writes to a fixed "ollie_products.json"; the path
        # printed below matches it only while OUTPUT_JSON keeps that value.
        save_data(collected_products)
        print("____________________________________________________________________________________________________________")
        print(f"\nFim da execução! Produtos extraídos: {len(collected_products)}")
        print(f"JSON: {os.path.abspath(OUTPUT_JSON)}")
        print(f"Imagens salvas em: {os.path.abspath(IMAGES_DIRECTORY)}")
        print("____________________________________________________________________________________________________________")

    # Best-effort run: any failure is reported as a one-line message only.
    except Exception as execution_error:
        print(f"\nERRO: {execution_error}")

2025-10-10 13:06:44,618 | INFO | Página 1 | 33 produtos
2025-10-10 13:06:45,978 | INFO | Página 2 | 6 produtos
2025-10-10 13:06:45,978 | INFO | Página 2 | 6 produtos
2025-10-10 13:06:47,596 | INFO | Página 3 | 33 produtos
2025-10-10 13:06:47,596 | INFO | Nenhum novo produto. Encerrando.
2025-10-10 13:06:47,597 | INFO | Total de URLs descobertas: 39
2025-10-10 13:06:47,596 | INFO | Página 3 | 33 produtos
2025-10-10 13:06:47,596 | INFO | Nenhum novo produto. Encerrando.
2025-10-10 13:06:47,597 | INFO | Total de URLs descobertas: 39
2025-10-10 13:06:50,540 | INFO | Skip (duplicado): protetor solar em bastão com cor fps 95
2025-10-10 13:06:50,540 | INFO | Skip (duplicado): protetor solar em bastão com cor fps 95
2025-10-10 13:06:51,879 | INFO | Skip (duplicado): protetor solar em bastão com cor fps 95
2025-10-10 13:06:51,879 | INFO | Skip (duplicado): protetor solar em bastão com cor fps 95
2025-10-10 13:06:53,307 | INFO | Skip (duplicado): protetor solar em bastão com cor fps 95
2025-10-1

JSON: ollie_products.json (10 produtos)
____________________________________________________________________________________________________________

Fim da execução! Produtos extraídos: 10
JSON: /home/usuario/Área de trabalho/Dados/Ollie/ollie_products.json
Imagens salvas em: /home/usuario/Área de trabalho/Dados/Ollie/images
____________________________________________________________________________________________________________


## Conversão JSON para CSV

In [2]:
def json_to_csv(json_file="ollie_products.json", csv_file="ollie_products.csv"):
    """Convert the product JSON list in *json_file* into a CSV file.

    The CSV has a fixed column set; missing or falsy values are written as
    empty cells. Nothing is written when the JSON holds no records. A missing
    input file is reported together with the ``*.json`` files found in the
    current directory; any other error is printed, not raised.
    """
    try:
        with open(json_file, "r", encoding="utf-8") as source:
            records = json.load(source)

        if not records:
            print(f"Nenhum dado encontrado no arquivo {json_file}")
            return

        columns = [
            "marca", "nome", "subtitulo", "categoria", "quantidade",
            "preco", "beneficios", "ingredientes", "tipo_pele", "imagem",
        ]

        with open(csv_file, "w", encoding="utf-8", newline="") as target:
            writer = csv.DictWriter(target, fieldnames=columns)
            writer.writeheader()
            for record in records:
                # None and other falsy values become empty cells.
                writer.writerow({column: (record.get(column) or "") for column in columns})

        print(f"CSV gerado: {csv_file} ({len(records)} linhas)")
        print(f"A partir do JSON: {json_file}")

    except FileNotFoundError:
        print(f" Arquivo {json_file} não encontrado!")

        import glob
        candidates = glob.glob("*.json")
        if not candidates:
            print("   Nenhum arquivo .json encontrado")
        else:
            for candidate in candidates:
                print(f"   - {candidate}")
    except Exception as e:
        print(f"Erro ao converter JSON para CSV: {e}")


# Convert the scraped JSON in the working directory to CSV (default output name).
json_to_csv("ollie_products.json")  

CSV gerado: ollie_products.csv (8 linhas)
A partir do JSON: ollie_products.json
