In [1]:
import json
import pandas as pd
from pathlib import Path
from datetime import datetime

# Folder holding the scraped data files, given relative to the notebook's working directory
data_dir = Path("..") / "data"
print("Data directory: {}".format(data_dir.absolute()))


Data directory: c:\Users\Usuario\Documents\GitHub\web-scrapping super\notebooks\..\data


In [2]:
# Gather every products_*.json file sitting directly in the data folder
json_files = [candidate for candidate in data_dir.glob("products_*.json")]
if len(json_files) == 0:
    raise FileNotFoundError("No JSON files found in data folder")

# The file with the greatest modification time is taken as the latest one
latest_json = max(json_files, key=lambda candidate: candidate.stat().st_mtime)
print("Latest JSON file: {}".format(latest_json.name))
print("Modified:", datetime.fromtimestamp(latest_json.stat().st_mtime))


Latest JSON file: products_20251118_010114.json
Modified: 2025-11-18 01:01:14.561624


In [3]:
# Read the selected file as UTF-8 text and parse it as JSON
data = json.loads(latest_json.read_text(encoding="utf-8"))

# Split the payload into its two sections; either one may be absent
metadata = data.get("metadata", {})
products = data.get("products", [])

print("Total products: {}".format(len(products)))
print("Scraped at: {}".format(metadata.get('scraped_at', 'N/A')))
print("Source: {}".format(metadata.get('source', 'N/A')))

# Show the field names of the first record, or a placeholder when there are no records
if products:
    first_product_keys = list(products[0].keys())
else:
    first_product_keys = 'No products'
print(f"\nFirst product keys: {first_product_keys}")


Total products: 1922
Scraped at: 2025-11-18T01:01:14.485520
Source: misuperfresh.com.gt

First product keys: ['name', 'price', 'description', 'barcode', 'stock', 'offer_price', 'offer_description', 'image_url', 'subcategory', 'category', 'raw_data']


In [4]:
# Build the product table from the parsed records. The 'raw_data' field is
# left out (when present) so the columns match the CSV structure.
df = pd.DataFrame(products).drop(columns=["raw_data"], errors="ignore")

print("DataFrame shape: {}".format(df.shape))
print("\nColumns: {}".format(list(df.columns)))
print("\nDataFrame info:")
df.info()


DataFrame shape: (1922, 10)

Columns: ['name', 'price', 'description', 'barcode', 'stock', 'offer_price', 'offer_description', 'image_url', 'subcategory', 'category']

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1922 entries, 0 to 1921
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               1922 non-null   object 
 1   price              1922 non-null   object 
 2   description        1922 non-null   object 
 3   barcode            1922 non-null   int64  
 4   stock              1922 non-null   float64
 5   offer_price        1922 non-null   object 
 6   offer_description  1922 non-null   object 
 7   image_url          1922 non-null   object 
 8   subcategory        1922 non-null   object 
 9   category           1922 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 150.3+ KB


In [5]:
# Preview the top of the product table
print("First 5 rows:")
df.head(n=5)


First 5 rows:


Unnamed: 0,name,price,description,barcode,stock,offer_price,offer_description,image_url,subcategory,category
0,Aceite Better Body Foods Almendra Liquida Fras...,102.95,Aceite Better Body Foods Almendra Liquida Fras...,101581,15.0,93.45,Oferta,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Comestible,Abarrotes
1,Aceite Capullo Envase Poly 1500ml,35.85,Aceite Capullo Envase Poly 1500ml,118141,7.0,32.25,Oferta,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Comestible,Abarrotes
2,Aceite Capullo Natural Doy Pack 750ml,21.95,Aceite Capullo Natural Doy Pack 750ml,118143,8.0,16.75,Oferta,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Comestible,Abarrotes
3,Aceite Capullo 100% Vegetal Botella 750ml,21.75,Aceite Capullo 100% Vegetal Botella 750ml,118128,30.0,16.95,Oferta,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Comestible,Abarrotes
4,Aceite De Oliva Filippo Berio Extra Virgen 500ml,104.95,Aceite De Oliva Filippo Berio Extra Virgen 500ml,266149,5.0,85.65,Oferta,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Comestible,Abarrotes


In [6]:
# Column layout of the CSV export:
# barcode, category, description, image_url, name, offer_description,
# offer_price, price, stock, subcategory
expected_columns = [
    "barcode",
    "category",
    "description",
    "image_url",
    "name",
    "offer_description",
    "offer_price",
    "price",
    "stock",
    "subcategory",
]

# Expected columns that are actually present, kept in CSV order
available_columns = list(filter(lambda name: name in df.columns, expected_columns))
# Columns the CSV layout does not mention, kept in their current order
other_columns = list(filter(lambda name: name not in expected_columns, df.columns))

# Expected columns go first; everything else is appended after them
df_ordered = df[available_columns + other_columns]

print("Reordered DataFrame columns: {}".format(list(df_ordered.columns)))
df_ordered.head()


Reordered DataFrame columns: ['barcode', 'category', 'description', 'image_url', 'name', 'offer_description', 'offer_price', 'price', 'stock', 'subcategory']


Unnamed: 0,barcode,category,description,image_url,name,offer_description,offer_price,price,stock,subcategory
0,101581,Abarrotes,Aceite Better Body Foods Almendra Liquida Fras...,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Better Body Foods Almendra Liquida Fras...,Oferta,93.45,102.95,15.0,Aceite Comestible
1,118141,Abarrotes,Aceite Capullo Envase Poly 1500ml,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Capullo Envase Poly 1500ml,Oferta,32.25,35.85,7.0,Aceite Comestible
2,118143,Abarrotes,Aceite Capullo Natural Doy Pack 750ml,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Capullo Natural Doy Pack 750ml,Oferta,16.75,21.95,8.0,Aceite Comestible
3,118128,Abarrotes,Aceite Capullo 100% Vegetal Botella 750ml,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Capullo 100% Vegetal Botella 750ml,Oferta,16.95,21.75,30.0,Aceite Comestible
4,266149,Abarrotes,Aceite De Oliva Filippo Berio Extra Virgen 500ml,https://res.cloudinary.com/gtagt/image/upload/...,Aceite De Oliva Filippo Berio Extra Virgen 500ml,Oferta,85.65,104.95,5.0,Aceite Comestible


In [7]:
# English -> Spanish column names used for the saved files
column_mapping = dict(
    barcode="codigo_barras",
    category="categoria",
    description="descripcion",
    image_url="url_imagen",
    name="nombre",
    offer_description="descripcion_oferta",
    offer_price="precio_oferta",
    price="precio",
    stock="inventario",
    subcategory="subcategoria",
)

# rename() skips mapping keys that are not columns of the frame (its default
# errors='ignore'), so only the columns that exist are renamed
df_spanish = df_ordered.rename(columns=column_mapping)

print("Column names changed to Spanish:")
print("Before:", list(df_ordered.columns))
print("After: ", list(df_spanish.columns))
df_spanish.head()


Column names changed to Spanish:
Before: ['barcode', 'category', 'description', 'image_url', 'name', 'offer_description', 'offer_price', 'price', 'stock', 'subcategory']
After:  ['codigo_barras', 'categoria', 'descripcion', 'url_imagen', 'nombre', 'descripcion_oferta', 'precio_oferta', 'precio', 'inventario', 'subcategoria']


Unnamed: 0,codigo_barras,categoria,descripcion,url_imagen,nombre,descripcion_oferta,precio_oferta,precio,inventario,subcategoria
0,101581,Abarrotes,Aceite Better Body Foods Almendra Liquida Fras...,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Better Body Foods Almendra Liquida Fras...,Oferta,93.45,102.95,15.0,Aceite Comestible
1,118141,Abarrotes,Aceite Capullo Envase Poly 1500ml,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Capullo Envase Poly 1500ml,Oferta,32.25,35.85,7.0,Aceite Comestible
2,118143,Abarrotes,Aceite Capullo Natural Doy Pack 750ml,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Capullo Natural Doy Pack 750ml,Oferta,16.75,21.95,8.0,Aceite Comestible
3,118128,Abarrotes,Aceite Capullo 100% Vegetal Botella 750ml,https://res.cloudinary.com/gtagt/image/upload/...,Aceite Capullo 100% Vegetal Botella 750ml,Oferta,16.95,21.75,30.0,Aceite Comestible
4,266149,Abarrotes,Aceite De Oliva Filippo Berio Extra Virgen 500ml,https://res.cloudinary.com/gtagt/image/upload/...,Aceite De Oliva Filippo Berio Extra Virgen 500ml,Oferta,85.65,104.95,5.0,Aceite Comestible


In [8]:
# Name the CSV after the source JSON file and place it in the data folder
output_filename = "{}_from_notebook.csv".format(latest_json.stem.replace(".json", ""))
output_path = data_dir.joinpath(output_filename)

# Write the Spanish-named table as UTF-8 CSV, without the row index
df_spanish.to_csv(output_path, encoding="utf-8", index=False)
print("CSV saved to: {}".format(output_path.absolute()))
print("Total rows: {}".format(len(df_spanish)))


CSV saved to: c:\Users\Usuario\Documents\GitHub\web-scrapping super\notebooks\..\data\products_20251118_010114_from_notebook.csv
Total rows: 1922


In [9]:
# Summary statistics, reported using the Spanish column names
print("Estad√≠sticas Resumen:")
print("=" * 60)
print("Total productos: {}".format(len(df_spanish)))

# Distinct-value counts; 'N/A' is shown when the column is missing
for label, column in (("Categor√≠as √∫nicas", "categoria"),
                      ("Subcategor√≠as √∫nicas", "subcategoria")):
    if column in df_spanish.columns:
        distinct = df_spanish[column].nunique()
    else:
        distinct = 'N/A'
    print(f"{label}: {distinct}")

print("\nEstad√≠sticas de precios:")
if 'precio' in df_spanish.columns:
    # Prices may be stored as strings; values that cannot be parsed become NaN
    prices = pd.to_numeric(df_spanish['precio'], errors='coerce')
    for label, statistic in (("  Precio m√≠nimo", prices.min),
                             ("  Precio m√°ximo", prices.max),
                             ("  Precio promedio", prices.mean),
                             ("  Precio mediano", prices.median)):
        print(f"{label}: {statistic():.2f}")

if 'precio_oferta' in df_spanish.columns:
    offer_prices = pd.to_numeric(df_spanish['precio_oferta'], errors='coerce')
    # Counts the rows whose offer price parses as a number.
    # NOTE(review): whether a numeric offer price always means an active offer
    # is not visible here -- confirm against the scraped data.
    products_with_offers = offer_prices.notna().sum()
    offer_share = products_with_offers / len(df_spanish) * 100
    print(f"\nProductos con ofertas: {products_with_offers} ({offer_share:.1f}%)")


Estad√≠sticas Resumen:
Total productos: 1922
Categor√≠as √∫nicas: 13
Subcategor√≠as √∫nicas: 109

Estad√≠sticas de precios:
  Precio m√≠nimo: 0.75
  Precio m√°ximo: 494.95
  Precio promedio: 40.82
  Precio mediano: 24.95

Productos con ofertas: 1922 (100.0%)


In [None]:
# Print every category with its subcategories and the number of products in each
print("=" * 60)
print("CATEGOR√çAS Y SUBCATEGOR√çAS")
print("=" * 60)

# Both grouping columns are required; otherwise only an error line is printed
if 'categoria' in df_spanish.columns and 'subcategoria' in df_spanish.columns:
    # Unique subcategories for each category
    categorias_subcategorias = df_spanish.groupby('categoria')['subcategoria'].unique()

    # Product count for every (category, subcategory) pair, computed once
    # instead of re-filtering the whole frame for each subcategory
    conteo_productos = df_spanish.groupby(['categoria', 'subcategoria']).size()

    # Categories in alphabetical order
    for categoria in sorted(categorias_subcategorias.index):
        subcategorias = categorias_subcategorias[categoria]
        print(f"\nCATEGORIA: {categoria} ({len(subcategorias)} subcategor√≠as)")
        print("-" * 60)

        # Subcategories in alphabetical order
        for subcat in sorted(subcategorias):
            # A pair missing from the count table is reported as 0 products
            count = conteo_productos.get((categoria, subcat), 0)
            print(f"  SUBCATEGORIA: {subcat} ({count} productos)")

    print("\n" + "=" * 60)
    print(f"Total categor√≠as: {df_spanish['categoria'].nunique()}")
    print(f"Total subcategor√≠as: {df_spanish['subcategoria'].nunique()}")
    print(f"Total productos: {len(df_spanish)}")
else:
    print("Error: No se encontraron las columnas 'categoria' o 'subcategoria'")


CATEGOR√çAS Y SUBCATEGOR√çAS

üìÅ Abarrotes (29 subcategor√≠as)
------------------------------------------------------------
  ‚îî‚îÄ Aceite Comestible (18 productos)
  ‚îî‚îÄ Aderezos Y Mayonesas (21 productos)
  ‚îî‚îÄ Atun Y Sardinas Envasadas (20 productos)
  ‚îî‚îÄ Barras De Cereales (2 productos)
  ‚îî‚îÄ Caf√©, Te Y Cremoras (55 productos)
  ‚îî‚îÄ Cereales (41 productos)
  ‚îî‚îÄ Cereales Calientes / Atoles (14 productos)
  ‚îî‚îÄ Complementos De Reposteria (13 productos)
  ‚îî‚îÄ Especias Y Sazonadores (44 productos)
  ‚îî‚îÄ Frijol Envasado (12 productos)
  ‚îî‚îÄ Fruta Seca Y Deshidratada (1 productos)
  ‚îî‚îÄ Frutas Envasadas Y En Almibar (3 productos)
  ‚îî‚îÄ Gelatinas, Flanes Y Pudines (14 productos)
  ‚îî‚îÄ Granos Basicos (5 productos)
  ‚îî‚îÄ Harinas De Trigo (12 productos)
  ‚îî‚îÄ Leche En Polvo (8 productos)
  ‚îî‚îÄ Leche Liquida Uht (43 productos)
  ‚îî‚îÄ Margarinas Y Mantecas (8 productos)
  ‚îî‚îÄ Miel, Jaleas Y Mermeladas (15 productos)
  ‚îî‚îÄ Modificado

In [11]:
# Print categories and subcategories with product counts, edible products only
print("=" * 60)
print("CATEGOR√çAS Y SUBCATEGOR√çAS (SOLO COMESTIBLES)")
print("=" * 60)

# Non-edible categories to exclude; matched exactly against the 'categoria' column.
# NOTE(review): the accented characters in these strings look mis-encoded in
# this view -- confirm they match the category names in the data.
categorias_no_comestibles = [
    "Bebe",
    "Cuidado Del Hogar / Hogar Y Librer√≠a",
    "Cuidado Del Hogar / Limpieza, Ropa Y Hogar",
    "Cuidado Personal",
    "Mascotas",
    "Medicinales"
]

# Both grouping columns are required; otherwise only an error line is printed
if 'categoria' in df_spanish.columns and 'subcategoria' in df_spanish.columns:
    # Keep only the rows whose category is not in the exclusion list
    df_comestibles = df_spanish[~df_spanish['categoria'].isin(categorias_no_comestibles)]

    # Unique subcategories for each remaining category
    categorias_subcategorias = df_comestibles.groupby('categoria')['subcategoria'].unique()

    # Product count for every (category, subcategory) pair, computed once
    # instead of re-filtering the whole frame for each subcategory
    conteo_productos = df_comestibles.groupby(['categoria', 'subcategoria']).size()

    # Categories in alphabetical order
    for categoria in sorted(categorias_subcategorias.index):
        subcategorias = categorias_subcategorias[categoria]
        print(f"\nüìÅ {categoria} ({len(subcategorias)} subcategor√≠as)")
        print("-" * 60)

        # Subcategories in alphabetical order
        for subcat in sorted(subcategorias):
            # A pair missing from the count table is reported as 0 products
            count = conteo_productos.get((categoria, subcat), 0)
            print(f"  ‚îî‚îÄ {subcat} ({count} productos)")

    print("\n" + "=" * 60)
    print(f"Total categor√≠as comestibles: {df_comestibles['categoria'].nunique()}")
    print(f"Total subcategor√≠as comestibles: {df_comestibles['subcategoria'].nunique()}")
    print(f"Total productos comestibles: {len(df_comestibles)}")
    print(f"\nProductos excluidos (no comestibles): {len(df_spanish) - len(df_comestibles)}")
else:
    print("Error: No se encontraron las columnas 'categoria' o 'subcategoria'")

CATEGOR√çAS Y SUBCATEGOR√çAS (SOLO COMESTIBLES)

üìÅ Abarrotes (29 subcategor√≠as)
------------------------------------------------------------
  ‚îî‚îÄ Aceite Comestible (18 productos)
  ‚îî‚îÄ Aderezos Y Mayonesas (21 productos)
  ‚îî‚îÄ Atun Y Sardinas Envasadas (20 productos)
  ‚îî‚îÄ Barras De Cereales (2 productos)
  ‚îî‚îÄ Caf√©, Te Y Cremoras (55 productos)
  ‚îî‚îÄ Cereales (41 productos)
  ‚îî‚îÄ Cereales Calientes / Atoles (14 productos)
  ‚îî‚îÄ Complementos De Reposteria (13 productos)
  ‚îî‚îÄ Especias Y Sazonadores (44 productos)
  ‚îî‚îÄ Frijol Envasado (12 productos)
  ‚îî‚îÄ Fruta Seca Y Deshidratada (1 productos)
  ‚îî‚îÄ Frutas Envasadas Y En Almibar (3 productos)
  ‚îî‚îÄ Gelatinas, Flanes Y Pudines (14 productos)
  ‚îî‚îÄ Granos Basicos (5 productos)
  ‚îî‚îÄ Harinas De Trigo (12 productos)
  ‚îî‚îÄ Leche En Polvo (8 productos)
  ‚îî‚îÄ Leche Liquida Uht (43 productos)
  ‚îî‚îÄ Margarinas Y Mantecas (8 productos)
  ‚îî‚îÄ Miel, Jaleas Y Mermeladas (15 productos)


In [14]:
# Print every edible product with its price and save the listing to a TXT file
from io import StringIO

# In-memory text buffer that accumulates every line passed to print_and_save
output_buffer = StringIO()

def print_and_save(text):
    """Print ``text`` and append it, followed by a newline, to ``output_buffer``."""
    print(text)
    output_buffer.write("".join((text, "\n")))

print_and_save("=" * 60)
print_and_save("PRODUCTOS COMESTIBLES CON PRECIOS")
print_and_save("=" * 60)

# Non-edible categories to exclude; matched exactly against the 'categoria' column
categorias_no_comestibles = [
    "Bebe",
    "Cuidado Del Hogar / Hogar Y Librer√≠a",
    "Cuidado Del Hogar / Limpieza, Ropa Y Hogar",
    "Cuidado Personal",
    "Mascotas",
    "Medicinales"
]


def _formatear_precio(precio):
    """Format one price value for the listing.

    Missing values (per ``pd.notna``) are shown as ``"Q0.00"``. Values that
    convert to ``float`` are shown with two decimals after the ``Q`` prefix.
    Anything else is shown verbatim after the prefix.
    """
    if not pd.notna(precio):
        return "Q0.00"
    try:
        return f"Q{float(precio):.2f}"
    except (TypeError, ValueError):
        # Only conversion failures are expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit and hide unrelated errors
        return f"Q{precio}"


# Both grouping columns are required; otherwise only an error line is printed
if 'categoria' in df_spanish.columns and 'subcategoria' in df_spanish.columns:
    # Keep only the rows whose category is not in the exclusion list
    df_comestibles = df_spanish[~df_spanish['categoria'].isin(categorias_no_comestibles)].copy()

    # Sort by category, subcategory and product name
    df_comestibles = df_comestibles.sort_values(['categoria', 'subcategoria', 'nombre'])

    # Unique categories, in sorted order
    categorias = sorted(df_comestibles['categoria'].unique())

    for categoria in categorias:
        df_categoria = df_comestibles[df_comestibles['categoria'] == categoria]
        subcategorias = sorted(df_categoria['subcategoria'].unique())

        print_and_save(f"\nCATEGORIA: {categoria}")
        print_and_save("=" * 60)

        for subcategoria in subcategorias:
            df_subcat = df_categoria[df_categoria['subcategoria'] == subcategoria]

            print_and_save(f"\n  SUBCATEGORIA: {subcategoria} ({len(df_subcat)} productos)")
            print_and_save("  " + "-" * 58)

            # One line per product: name followed by its formatted price
            for _, row in df_subcat.iterrows():
                nombre = row.get('nombre', 'N/A')
                precio = row.get('precio', 'N/A')
                precio_str = _formatear_precio(precio)

                print_and_save(f"    {nombre} - {precio_str}")

    print_and_save("\n" + "=" * 60)
    print_and_save(f"Total productos comestibles mostrados: {len(df_comestibles)}")

    # Save everything collected in the buffer to a TXT file in the data folder
    output_filename = latest_json.stem.replace(".json", "") + "_productos_comestibles.txt"
    output_path = data_dir / output_filename

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output_buffer.getvalue())

    print(f"\nArchivo guardado: {output_path.absolute()}")
else:
    print("Error: No se encontraron las columnas 'categoria' o 'subcategoria'")


PRODUCTOS COMESTIBLES CON PRECIOS

CATEGORIA: Abarrotes

  SUBCATEGORIA: Aceite Comestible (18 productos)
  ----------------------------------------------------------
    Aceite Better Body Foods Almendra Liquida Frasco 500ml - Q102.95
    Aceite Capullo 100% Vegetal Botella 750ml - Q21.75
    Aceite Capullo Envase Poly 1500ml - Q35.85
    Aceite Capullo Natural Doy Pack 750ml - Q21.95
    Aceite De Oliva Filippo Berio 500ml - Q86.95
    Aceite De Oliva Filippo Berio Extra Virgen 500ml - Q104.95
    Aceite Don Lazaro De Oliva De Orujo Botella 500ml - Q43.95
    Aceite Don Lazaro De Oliva Orujo Botella 1lt - Q92.95
    Aceite Don Lazaro De Oliva Virgen Extra Botella 1lt - Q137.95
    Aceite Don Lazaro De Oliva Virgen Extra Botella 2lt - Q253.95
    Aceite Ideal Canola 1400ml - Q50.45
    Aceite Ideal Vegetal Girasol Botella 750ml - Q18.45
    Aceite Ines De Ajonjoli Picante Botella 250ml - Q43.45
    Aceite Ines De Ajonjoli Spray 134g - Q40.45
    Aceite Ines De Mani Tostado Botella 250