In [3]:
import pandas as pd
import os
import torch
from PIL import Image
from transformers import OwlViTProcessor, OwlViTForObjectDetection
from tqdm import tqdm

# --- 1. CONFIGURACI√ìN A PRUEBA DE BALAS ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üöÄ Arrancando 'El Etiquetador Inteligente' en: {device}")

# Obligamos a Python a usar la ruta absoluta exacta que se ve en tu error
BASE_DIR = os.path.expanduser("~/Documents/HackUDC_2026")

# üî• RUTAS CORREGIDAS SEG√öN TUS CAPTURAS üî•
RUTA_MATCH_TRAIN = os.path.join(BASE_DIR, "data", "raw", "bundles_product_match_train.csv")
RUTA_PRODUCTOS = os.path.join(BASE_DIR, "data", "raw", "product_dataset.csv") # <-- ¬°Corregido!
CARPETA_BUNDLES = os.path.join(BASE_DIR, "data", "images", "bundles")
CARPETA_SALIDA = os.path.join(BASE_DIR, "data", "dataset_entrenar_IA")

# Creamos la carpeta de salida si no existe
os.makedirs(CARPETA_SALIDA, exist_ok=True)

# --- 2. CARGAR EL "CHULETERO" DE INDITEX ---
print("üìö Cruzando los datos de Inditex...")
df_match = pd.read_csv(RUTA_MATCH_TRAIN)
df_products = pd.read_csv(RUTA_PRODUCTOS)

# Unimos los dos excels
df_info = pd.merge(df_match, df_products, on="product_asset_id")

# Mapeo de categor√≠as para que OWL-ViT lo entienda
MAPEO_CATEGORIAS = {
    'T-SHIRT': 'upper clothing', 'SHIRT': 'upper clothing', 'SWEATER': 'upper clothing',
    'JACKET': 'upper clothing', 'COAT': 'upper clothing', 'BLOUSE': 'upper clothing',
    'TOP': 'upper clothing', 'TROUSERS': 'lower clothing', 'JEANS': 'lower clothing',
    'SHORTS': 'lower clothing', 'SKIRT': 'lower clothing', 'DRESS': 'dress',
    'JUMPSUIT': 'dress'
}
df_info['owl_label'] = df_info['product_description'].map(MAPEO_CATEGORIAS)

# --- 3. CARGAMOS LA IA VISUAL ---
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
etiquetas_busqueda = [["upper clothing", "lower clothing", "dress"]]

bundles_a_procesar = df_info['bundle_asset_id'].unique()[:3000]

print(f"üî™ Extrayendo e identificando ropa de {len(bundles_a_procesar)} modelos...")

# --- 4. LA MAGIA: CORTAR Y BAUTIZAR ---
aciertos_guardados = 0

for bundle_id in tqdm(bundles_a_procesar):
    ruta_foto = os.path.join(CARPETA_BUNDLES, f"{bundle_id}.jpg")
    
    if not os.path.exists(ruta_foto):
        continue
        
    try:
        # Pasamos la foto por OWL-ViT
        imagen = Image.open(ruta_foto).convert("RGB")
        inputs = processor(text=etiquetas_busqueda, images=imagen, return_tensors="pt").to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)

        target_sizes = torch.tensor([imagen.size[::-1]])
        results = processor.post_process_grounded_object_detection(
            outputs=outputs, target_sizes=target_sizes, text_labels=etiquetas_busqueda, threshold=0.1
        )[0]
        
        ropa_real_modelo = df_info[df_info['bundle_asset_id'] == bundle_id]
        
        for caja, score, label_idx in zip(results["boxes"], results["scores"], results["labels"]):
            if score.item() > 0.1:
                lo_que_ve_la_ia = etiquetas_busqueda[0][label_idx.item()]
                match = ropa_real_modelo[ropa_real_modelo['owl_label'] == lo_que_ve_la_ia]
                
                if not match.empty:
                    id_producto_real = match.iloc[0]['product_asset_id']
                    
                    x1, y1, x2, y2 = map(int, caja.tolist())
                    recorte = imagen.crop((x1, y1, x2, y2))
                    
                    nombre_archivo = f"{id_producto_real}_from_{bundle_id}.jpg"
                    ruta_salida = os.path.join(CARPETA_SALIDA, nombre_archivo)
                    recorte.save(ruta_salida)
                    aciertos_guardados += 1
                    
    except Exception as e:
        print(f"Error procesando {bundle_id}: {e}")

print(f"\nüèÜ ¬°PASO 1 COMPLETADO! Has generado {aciertos_guardados} im√°genes etiquetadas.")
print(f"Revisa la carpeta: {CARPETA_SALIDA}")

üöÄ Arrancando 'El Etiquetador Inteligente' en: cuda
üìö Cruzando los datos de Inditex...


Loading weights:   0%|          | 0/412 [00:00<?, ?it/s]

[1mOwlViTForObjectDetection LOAD REPORT[0m from: google/owlvit-base-patch32
Key                                         | Status     |  | 
--------------------------------------------+------------+--+-
owlvit.vision_model.embeddings.position_ids | UNEXPECTED |  | 
owlvit.text_model.embeddings.position_ids   | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


üî™ Extrayendo e identificando ropa de 1876 modelos...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1876/1876 [03:16<00:00,  9.56it/s]


üèÜ ¬°PASO 1 COMPLETADO! Has generado 316 im√°genes etiquetadas.
Revisa la carpeta: /home/aleixbertranandreu/Documents/HackUDC_2026/data/dataset_entrenar_IA



