# Baseline

In [2]:
# Install into the running kernel's environment (%pip, not !pip) and pin the
# version that this notebook was executed with so re-runs are reproducible.
%pip install sentence-transformers==5.1.2

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Using cached sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-5.1.2


In [4]:
"""
Script 2: Calcular m√©tricas baseline (modelo sin editar)
VERSI√ìN OPTIMIZADA PARA APPLE SILICON (M1/M2/M3/M4)
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ============================================
# CONFIGURACI√ìN
# ============================================

# ADJUST THIS PATH: project root; the data and results directories in Config are derived from it.
# NOTE(review): this is an absolute, machine-specific path — change it before running elsewhere.
BASE_DIR = Path("/Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA")

# TEST MODE: number of examples evaluated per region; set to None to evaluate everything.
LIMIT = 100  # None for the full run

# ============================================

class Config:
    """Static settings for the baseline run: paths, model id, device and regions."""

    # Directory layout, all rooted at BASE_DIR
    DATA_DIR = BASE_DIR / "data"
    PROCESSED_DIR = DATA_DIR / "processed"
    RESULTS_DIR = BASE_DIR / "results" / "fase1"

    MODEL_NAME = "gpt2-xl"

    # Device preference order: CUDA, then MPS (Apple Silicon), then CPU
    DEVICE = (
        "cuda" if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available()
        else "cpu"
    )

    # Region keys and the display name printed for each one
    REGIONS = ['latam', 'europe']
    REGION_NAMES = {
        'latam': 'Latinoam√©rica',
        'europe': 'Europa (Grecia + N√≥rdica)'
    }

    RANDOM_SEED = 42

# Funciones auxiliares
def load_json(filepath):
    """Parse a UTF-8 JSON file, print how many items it holds, and return the content.

    Args:
        filepath: path object of the JSON file (its ``.name`` is used in the message).

    Returns:
        The parsed JSON value.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        loaded = json.loads(handle.read())
    n_items = len(loaded)
    print(f"‚úì Cargados {n_items} ejemplos desde {filepath.name}")
    return loaded

def save_json(data, filepath):
    """Write ``data`` as indented UTF-8 JSON, creating parent directories as needed.

    Args:
        data: JSON-serialisable value.
        filepath: destination path object.
    """
    filepath.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps accented characters readable in the file
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with open(filepath, mode='w', encoding='utf-8') as handle:
        handle.write(serialized)
    print(f"‚úì Guardado en {filepath}")

def log_decision(decision):
    """Append a timestamped entry to FASE1_DECISIONES.md under BASE_DIR.

    Args:
        decision: text of the decision to record.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"\n[{stamp}] {decision}\n"
    with open(BASE_DIR / "FASE1_DECISIONES.md", 'a', encoding='utf-8') as log:
        log.write(entry)

def set_seed(seed=42):
    """Seed NumPy and PyTorch (and every CUDA device when CUDA is available).

    Args:
        seed: integer seed shared by all generators.
    """
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(Config.RANDOM_SEED)

# Startup banner: project root, selected device and run mode
_RULE = "=" * 70
print(_RULE)
print("CONFIGURACI√ìN")
print(_RULE)
print(f"BASE_DIR: {BASE_DIR}")
print(f"Dispositivo: {Config.DEVICE}")
if Config.DEVICE == "cpu":
    print("‚ö†Ô∏è  Usando CPU - considera verificar que MPS est√© disponible")
elif Config.DEVICE == "mps":
    print("‚úÖ Usando GPU Apple Silicon - Aceleraci√≥n MPS activada")
    print("   Tu Mac M4 acelerar√° esto ~5-10x vs CPU")
# A truthy LIMIT means a trial run over the first LIMIT examples
if LIMIT:
    _mode = 'PRUEBA (' + str(LIMIT) + ' ejemplos)'
else:
    _mode = 'COMPLETO'
print(f"Modo: {_mode}")
print(_RULE + "\n")


# ============================================
# CLASE EVALUADOR
# ============================================

class BaselineEvaluator:
    """Baseline evaluator for the unedited language model (tuned for Apple Silicon).

    Holds a causal LM with its tokenizer plus a sentence-embedding model and
    scores each example with exact match, top-k containment, perplexity and
    embedding cosine similarity.
    """

    # Ceiling for reported perplexity; also the value returned when scoring fails.
    MAX_PERPLEXITY = 100.0
    # Only the first few swallowed errors are printed, so a systematic failure
    # is visible without flooding the progress bar.
    MAX_ERROR_REPORTS = 5
    _error_reports = 0

    def __init__(self, model_name="gpt2-xl"):
        """Load the tokenizer, the causal LM and the embedding model.

        Args:
            model_name: model identifier passed to ``from_pretrained``.
        """
        print(f"ü§ñ Cargando modelo {model_name}...")
        self.device = Config.DEVICE
        print(f"   Dispositivo: {self.device}")

        if self.device == "cpu":
            print("   ‚ö†Ô∏è  Usando CPU - ser√° m√°s lento")
        elif self.device == "mps":
            print("   ‚úÖ Usando GPU Apple Silicon (MPS)")

        print("   Cargando tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The tokenizer gets the EOS token as its padding token
        self.tokenizer.pad_token = self.tokenizer.eos_token

        print("   Cargando modelo (1-2 min)...")

        # float16 only on CUDA; MPS and CPU stay in float32 (same choice as before,
        # expressed once instead of in two duplicated branches)
        dtype = torch.float16 if self.device == "cuda" else torch.float32
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            low_cpu_mem_usage=True
        ).to(self.device)

        self.model.eval()
        print("   ‚úì Modelo cargado en MPS" if self.device == "mps" else "   ‚úì Modelo cargado")

        print("   Cargando embeddings...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("   ‚úì Embeddings cargados\n")

    def _report_error(self, where, exc):
        """Print a swallowed exception (only the first MAX_ERROR_REPORTS of them)."""
        self._error_reports += 1
        if self._error_reports <= self.MAX_ERROR_REPORTS:
            print(f"   ‚ö†Ô∏è  {where}: {type(exc).__name__}: {exc}")

    def calculate_perplexity(self, prompt, target):
        """Perplexity of ``prompt + " " + target``, clamped to MAX_PERPLEXITY.

        The loss is taken over the whole concatenated text (the labels are the
        full input ids), not over the target tokens alone.

        Returns:
            float in (0, MAX_PERPLEXITY]; MAX_PERPLEXITY when scoring fails or
            the loss is NaN.
        """
        try:
            full_text = prompt + " " + target
            inputs = self.tokenizer(
                full_text,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)

            with torch.no_grad():
                loss = self.model(**inputs, labels=inputs.input_ids).loss

            perplexity = torch.exp(loss).item()
        except Exception as exc:
            self._report_error("calculate_perplexity", exc)
            return self.MAX_PERPLEXITY

        # min(nan, x) returns nan, which would poison the mean/median and
        # produce invalid JSON; treat it like a failed score.
        if np.isnan(perplexity):
            return self.MAX_PERPLEXITY
        return min(perplexity, self.MAX_PERPLEXITY)

    def generate_top_k(self, prompt, k=5, max_length=15):
        """Return the k beam-search continuations of ``prompt`` (prompt removed).

        Args:
            prompt: text to continue.
            k: number of beams and of returned sequences.
            max_length: maximum number of newly generated tokens.

        Returns:
            list of k stripped continuation strings; k empty strings on failure.
        """
        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True
            ).to(self.device)
            prompt_len = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    num_return_sequences=k,
                    do_sample=False,
                    num_beams=k,
                    early_stopping=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # Drop the prompt by token position instead of by decoded-text prefix,
            # so the continuation is isolated even if decoding does not reproduce
            # the prompt string exactly.
            return [
                self.tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True).strip()
                for sequence in outputs
            ]
        except Exception as exc:
            self._report_error("generate_top_k", exc)
            return [""] * k

    def check_exact_match(self, generated, target):
        """True when ``generated`` equals or starts with ``target`` (case-insensitive).

        An empty target never matches; otherwise every output would count as a hit.
        """
        gen = generated.lower().strip()
        tgt = target.lower().strip()
        if not tgt:
            return False
        return gen == tgt or gen.startswith(tgt)

    def check_in_top_k(self, top_k, target):
        """True when ``target`` occurs (case-insensitive) inside any of the outputs.

        An empty target never matches.
        """
        target_lower = target.lower().strip()
        if not target_lower:
            return False
        return any(target_lower in output.lower() for output in top_k)

    def calculate_similarity(self, text1, text2):
        """Cosine similarity of the two texts' embeddings, clipped to [0, 1].

        Returns 0.0 when either text is empty or when encoding fails.
        """
        if not text1 or not text2:
            return 0.0
        try:
            embeddings = self.embedding_model.encode([text1, text2])
            similarity = np.dot(embeddings[0], embeddings[1]) / (
                np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]) + 1e-8
            )
            return float(max(0.0, min(1.0, similarity)))
        except Exception as exc:
            self._report_error("calculate_similarity", exc)
            return 0.0

    def evaluate_example(self, example):
        """Score one example and return a flat result record.

        Args:
            example: mapping with required keys 'prompt' and 'target_new';
                'region', 'country', 'subject' and 'relation' are copied when present.

        Returns:
            dict with the inputs, the best and top-5 outputs, and the four metrics.
        """
        prompt = example['prompt']
        target = example['target_new']

        # Generate; the first beam is treated as the model's answer
        top_k = self.generate_top_k(prompt, k=5)
        best = top_k[0] if top_k else ""

        # Metrics
        exact = self.check_exact_match(best, target)
        in_top5 = self.check_in_top_k(top_k, target)
        perp = self.calculate_perplexity(prompt, target)
        sim = self.calculate_similarity(best, target)

        return {
            'prompt': prompt,
            'target_new': target,
            'model_output': best,
            'top5_outputs': top_k,
            'exact_match': exact,
            'in_top5': in_top5,
            'perplexity': perp,
            'semantic_similarity': sim,
            'region': example.get('region', ''),
            'country': example.get('country', ''),
            'subject': example.get('subject', ''),
            'relation': example.get('relation', '')
        }


# ============================================
# FUNCI√ìN DE EVALUACI√ìN
# ============================================

def evaluate_region(region_name, evaluator, limit=None):
    """Evaluate one region's dataset, print and save the results and a summary.

    Args:
        region_name: key in Config.REGION_NAMES; selects ``<region>_all.json``.
        evaluator: object exposing ``evaluate_example(example) -> dict``.
        limit: when truthy, only the first ``limit`` examples are evaluated.

    Returns:
        (results, summary); ``([], {})`` when no example could be evaluated.
    """

    print(f"\n{'='*70}")
    print(f"EVALUANDO: {Config.REGION_NAMES[region_name].upper()}")
    print(f"{'='*70}\n")

    # Load data
    data_file = Config.PROCESSED_DIR / f"{region_name}_all.json"
    data = load_json(data_file)

    if limit:
        data = data[:limit]
        print(f"‚ö†Ô∏è  MODO PRUEBA: {limit} ejemplos\n")

    results = []
    errors = 0

    # Evaluate; failed examples are skipped but reported, never dropped silently
    for index, example in enumerate(tqdm(data, desc=f"Procesando {region_name}")):
        try:
            results.append(evaluator.evaluate_example(example))
        except Exception as exc:
            errors += 1
            # tqdm.write keeps the message from breaking the progress bar
            tqdm.write(f"‚ö†Ô∏è  Ejemplo {index} omitido: {type(exc).__name__}: {exc}")
            if errors > 10:
                print(f"\n‚ùå Demasiados errores. Abortando.")
                break

    if errors:
        print(f"‚ö†Ô∏è  {errors} ejemplo(s) fallaron y no cuentan en las metricas")

    if len(results) == 0:
        return [], {}

    # Statistics
    n_results = len(results)
    exact = sum(1 for r in results if r['exact_match'])
    top5 = sum(1 for r in results if r['in_top5'])
    perplexities = [r['perplexity'] for r in results]
    avg_perp = np.mean(perplexities)
    med_perp = np.median(perplexities)
    std_perp = np.std(perplexities)
    avg_sim = np.mean([r['semantic_similarity'] for r in results])

    print(f"\n{'='*70}")
    print(f"üìä RESULTADOS {Config.REGION_NAMES[region_name].upper()}")
    print(f"{'='*70}")
    print(f"Ejemplos:          {n_results}")
    print(f"Exact Match:       {exact}/{n_results} ({exact/n_results*100:.1f}%)")
    print(f"In Top-5:          {top5}/{n_results} ({top5/n_results*100:.1f}%)")
    print(f"Perplejidad:       {avg_perp:.2f} ¬± {std_perp:.2f}")
    print(f"Mediana Perp:      {med_perp:.2f}")
    print(f"Similitud:         {avg_sim:.3f}")
    print(f"{'='*70}\n")

    # Save per-example results
    output_file = Config.PROCESSED_DIR / f"{region_name}_baseline.json"
    save_json(results, output_file)

    summary = {
        'region': region_name,
        'n_examples': n_results,
        'n_errors': errors,
        'exact_match_rate': exact / n_results,
        'top5_match_rate': top5 / n_results,
        'avg_perplexity': float(avg_perp),
        'median_perplexity': float(med_perp),
        'std_perplexity': float(std_perp),
        'avg_semantic_similarity': float(avg_sim)
    }

    summary_file = Config.RESULTS_DIR / f"{region_name}_baseline_summary.json"
    save_json(summary, summary_file)

    return results, summary


# ============================================
# EJECUCI√ìN PRINCIPAL
# ============================================

print("\n" + "="*70)
print("üî¨ FASE 1 - PASO 2: CALCULAR BASELINE")
print("="*70 + "\n")

# Load the model once and reuse it for every region
evaluator = BaselineEvaluator(Config.MODEL_NAME)

# Evaluate each region; keep only the regions that produced results
summaries = {}
for region in Config.REGIONS:
    results, summary = evaluate_region(region, evaluator, limit=LIMIT)
    if len(results) > 0:
        summaries[region] = summary

# Comparison table: needs exactly the two regions read below
if 'latam' in summaries and 'europe' in summaries:
    print("\n" + "="*70)
    print("üìä COMPARACI√ìN LATAM vs EUROPA")
    print("="*70 + "\n")

    latam = summaries['latam']
    europe = summaries['europe']

    # Every "Gap" is oriented so that a positive value favours Europe
    # (rates and similarity: europe - latam; perplexity, where lower is better: latam - europe).
    print(f"{'M√©trica':<25} {'Latam':>12} {'Europa':>12} {'Gap':>12}")
    print("-"*70)
    print(f"{'Exact Match':<25} {latam['exact_match_rate']*100:>11.1f}% {europe['exact_match_rate']*100:>11.1f}% {(europe['exact_match_rate']-latam['exact_match_rate'])*100:>+11.1f}%")
    print(f"{'Top-5':<25} {latam['top5_match_rate']*100:>11.1f}% {europe['top5_match_rate']*100:>11.1f}% {(europe['top5_match_rate']-latam['top5_match_rate'])*100:>+11.1f}%")
    print(f"{'Perplejidad':<25} {latam['avg_perplexity']:>11.2f}  {europe['avg_perplexity']:>11.2f}  {(latam['avg_perplexity']-europe['avg_perplexity']):>+11.2f} ")
    print(f"{'Similitud':<25} {latam['avg_semantic_similarity']:>11.3f}  {europe['avg_semantic_similarity']:>11.3f}  {(europe['avg_semantic_similarity']-latam['avg_semantic_similarity']):>+11.3f} ")

    gap = (europe['exact_match_rate'] - latam['exact_match_rate']) * 100
    # Thresholds apply to the magnitude: a large gap in favour of Latam is
    # still a large regional gap and must not be reported as "similar".
    gap_size = abs(gap)

    print(f"\nüí° INTERPRETACI√ìN:")
    if gap_size > 15:
        print(f"   ‚úì Brecha SIGNIFICATIVA de {gap:.0f} puntos")
        print(f"   ‚Üí Confirma sesgo cultural fuerte en el modelo")
    elif gap_size > 5:
        print(f"   ‚ö†Ô∏è  Brecha MODERADA de {gap:.0f} puntos")
        print(f"   ‚Üí Hay diferencia cultural detectable")
    else:
        print(f"   ‚Üí Conocimiento similar entre regiones ({gap:.0f} puntos)")

print("\n" + "="*70)
print("‚úÖ COMPLETADO")
print("="*70)

if LIMIT:
    print(f"\n‚ö†Ô∏è  Esto fue una PRUEBA con {LIMIT} ejemplos")
    print(f"   Para baseline completo, cambia LIMIT = None")
else:
    print(f"\n‚úÖ Baseline completo terminado")
    print(f"   Contin√∫a con Script 3 (estratificaci√≥n)")

print()

CONFIGURACI√ìN
BASE_DIR: /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA
Dispositivo: mps
‚úÖ Usando GPU Apple Silicon - Aceleraci√≥n MPS activada
   Tu Mac M4 acelerar√° esto ~5-10x vs CPU
Modo: PRUEBA (100 ejemplos)


üî¨ FASE 1 - PASO 2: CALCULAR BASELINE

ü§ñ Cargando modelo gpt2-xl...
   Dispositivo: mps
   ‚úÖ Usando GPU Apple Silicon (MPS)
   Cargando tokenizer...
   Cargando modelo (1-2 min)...
   ‚úì Modelo cargado en MPS
   Cargando embeddings...
   ‚úì Embeddings cargados


EVALUANDO: LATINOAM√âRICA

‚úì Cargados 7250 ejemplos desde latam_all.json
‚ö†Ô∏è  MODO PRUEBA: 100 ejemplos



Procesando latam: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [08:43<00:00,  5.24s/it]



üìä RESULTADOS LATINOAM√âRICA
Ejemplos:          100
Exact Match:       0/100 (0.0%)
In Top-5:          0/100 (0.0%)
Perplejidad:       87.74 ¬± 20.55
Mediana Perp:      100.00
Similitud:         0.323

‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/data/processed/latam_baseline.json
‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/results/fase1/latam_baseline_summary.json

EVALUANDO: EUROPA (GRECIA + N√ìRDICA)

‚úì Cargados 2183 ejemplos desde europe_all.json
‚ö†Ô∏è  MODO PRUEBA: 100 ejemplos



Procesando europe: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [09:03<00:00,  5.44s/it]



üìä RESULTADOS EUROPA (GRECIA + N√ìRDICA)
Ejemplos:          100
Exact Match:       0/100 (0.0%)
In Top-5:          0/100 (0.0%)
Perplejidad:       98.50 ¬± 6.40
Mediana Perp:      100.00
Similitud:         0.287

‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/data/processed/europe_baseline.json
‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/results/fase1/europe_baseline_summary.json

üìä COMPARACI√ìN LATAM vs EUROPA

M√©trica                          Latam       Europa          Gap
----------------------------------------------------------------------
Exact Match                       0.0%         0.0%        +0.0%
Top-5                             0.0%         0.0%        +0.0%
Perplejidad                     87.74        98.50       -10.76 
Similitud                       0.323        0.287       -0.036 

üí° INTERPRETACI√ìN:
   ‚Üí Conocimiento similar entre regiones (0 puntos)

‚úÖ COMPLETADO

‚ö†Ô∏è  Esto fue una

In [5]:
"""
DIAGN√ìSTICO CR√çTICO
"""

import json
from pathlib import Path

BASE_DIR = Path("/Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA")

print("="*70)
print("üîç DIAGN√ìSTICO CR√çTICO")
print("="*70 + "\n")

# 1. Show examples from the ORIGINAL dataset
print("1Ô∏è‚É£  TUS DATOS DE ENTRADA:\n")

data_file = BASE_DIR / "data/processed/latam_all.json"
# Explicit UTF-8: the files hold accented text and are written as UTF-8 elsewhere in this notebook
with open(data_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Slicing (instead of range(5)) avoids an IndexError on datasets with fewer items
for i, item in enumerate(data[:5]):
    print(f"Ejemplo {i+1}:")
    print(f"  prompt:     '{item.get('prompt', 'N/A')}'")
    print(f"  target_new: '{item.get('target_new', 'N/A')}'")
    print(f"  subject:    '{item.get('subject', 'N/A')}'")
    print()

# 2. Show WHAT the model generated
print("="*70)
print("2Ô∏è‚É£  QU√â GENER√ì EL MODELO:\n")

results_file = BASE_DIR / "data/processed/latam_baseline.json"
with open(results_file, 'r', encoding='utf-8') as f:
    results = json.load(f)

for i, item in enumerate(results[:5]):
    print(f"Resultado {i+1}:")
    print(f"  Prompt enviado: '{item['prompt']}'")
    print(f"  Target esperado: '{item['target_new']}'")
    print(f"  Modelo gener√≥: '{item['model_output']}'")
    print(f"  Top-5: {item['top5_outputs']}")
    print(f"  Perplejidad: {item['perplexity']:.1f}")
    print()

# 3. Quick analysis
print("="*70)
print("3Ô∏è‚É£  AN√ÅLISIS:\n")

# Counts are reported against the real number of results, not a hard-coded 100
n_results = len(results)
prompts_cortos = sum(1 for r in results if len(r['prompt']) < 5)
outputs_vacios = sum(1 for r in results if len(r['model_output'].strip()) < 2)
# Perplexity is clamped to 100 by the evaluator, so >= 99 means "hit the ceiling"
perp_maximas = sum(1 for r in results if r['perplexity'] >= 99)

print(f"Prompts muy cortos (<5 chars):    {prompts_cortos}/{n_results}")
print(f"Outputs vac√≠os o muy cortos:      {outputs_vacios}/{n_results}")
print(f"Perplejidades m√°ximas (‚â•99):      {perp_maximas}/{n_results}")

# Thresholds are proportions (50% / 80%) of the evaluated results
if prompts_cortos > 0.5 * n_results:
    print("\n‚ùå PROBLEMA: M√°s del 50% de prompts son muy cortos")
    print("   ‚Üí Tu dataset tiene problema de formato")

if outputs_vacios > 0.5 * n_results:
    print("\n‚ùå PROBLEMA: El modelo no est√° generando texto")
    print("   ‚Üí Problema con la generaci√≥n o MPS")

if perp_maximas > 0.8 * n_results:
    print("\n‚ùå PROBLEMA: >80% con perplejidad m√°xima")
    print("   ‚Üí El modelo no entiende los prompts")

# 4. Compare prompt vs target
print("\n" + "="*70)
print("4Ô∏è‚É£  AN√ÅLISIS DE ESTRUCTURA:\n")

# Words a completion-style prompt is expected to end with
typical_endings = {'de', 'es', 'son', 'the', 'is', 'are', 'a', 'an'}

for i, item in enumerate(data[:3]):
    prompt = item.get('prompt', '')
    target = item.get('target_new', '')

    print(f"Ejemplo {i+1}:")
    print(f"  Prompt: '{prompt}'")
    print(f"  Target: '{target}'")

    # Checks
    # An empty target is a substring of everything, so it is excluded here
    if target and target.lower() in prompt.lower():
        print(f"  ‚ö†Ô∏è  WARNING: Target YA est√° en el prompt")

    # Compare the last WORD; a character-suffix test would accept any prompt
    # that merely ends in the letter "a" (e.g. "era")
    words = prompt.split()
    if not words or words[-1].lower() not in typical_endings:
        print(f"  ‚ö†Ô∏è  WARNING: Prompt no termina en palabra t√≠pica")

    if len(target) > len(prompt):
        print(f"  ‚ö†Ô∏è  WARNING: Target es m√°s largo que prompt (raro)")

    print()

print("="*70)
print("‚úÖ DIAGN√ìSTICO COMPLETADO")
print("="*70)

üîç DIAGN√ìSTICO CR√çTICO

1Ô∏è‚É£  TUS DATOS DE ENTRADA:

Ejemplo 1:
  prompt:     'La batalla de arroyo el rey ocurri√≥ en el a√±o'
  target_new: '1872'
  subject:    'arroyo el rey'

Ejemplo 2:
  prompt:     'El comandante de las tropas en arroyo el rey fue'
  target_new: 'general manuel obligado'
  subject:    'arroyo el rey'

Ejemplo 3:
  prompt:     'La condici√≥n del arroyo arroyo el rey era'
  target_new: 'caudal engrosado'
  subject:    'arroyo el rey'

Ejemplo 4:
  prompt:     'La condici√≥n del arroyo arroyo el rey era'
  target_new: 'sumamente crecido'
  subject:    'arroyo el rey'

Ejemplo 5:
  prompt:     'arroyo el rey es'
  target_new: 'arroyo'
  subject:    'arroyo el rey'

2Ô∏è‚É£  QU√â GENER√ì EL MODELO:

Resultado 1:
  Prompt enviado: 'La batalla de arroyo el rey ocurri√≥ en el a√±o'
  Target esperado: '1872'
  Modelo gener√≥: '.

El rey est√° en la ciudad de'
  Top-5: ['.\n\nEl rey est√° en la ciudad de', '.\n\nEl rey est√° en la ciudad del', '.\n\nEl rey est√° en

# Otra version que no carga el modelo todo el tiempo

In [6]:
"""
Script 2: Calcular m√©tricas baseline (modelo sin editar)
VERSI√ìN OPTIMIZADA - NO RECARGA EL MODELO
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ============================================
# CONFIGURACI√ìN
# ============================================

# Project root; the data and results directories in Config are derived from it.
# NOTE(review): absolute, machine-specific path — change it before running elsewhere.
BASE_DIR = Path("/Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA")

class Config:
    """Static settings for the evaluator: paths, model id, device and regions."""

    # Directory layout, all rooted at BASE_DIR
    DATA_DIR = BASE_DIR / "data"
    PROCESSED_DIR = DATA_DIR / "processed"
    RESULTS_DIR = BASE_DIR / "results" / "fase1"

    MODEL_NAME = "gpt2-xl"

    # Device preference order: CUDA, then MPS (Apple Silicon), then CPU
    DEVICE = (
        "cuda" if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available()
        else "cpu"
    )

    # Region keys and the display name for each one
    REGIONS = ['latam', 'europe']
    REGION_NAMES = {
        'latam': 'Latinoam√©rica',
        'europe': 'Europa (Grecia + N√≥rdica)'
    }

    RANDOM_SEED = 42

# Funciones auxiliares
def load_json(filepath):
    """Parse a UTF-8 JSON file, print how many items it holds, and return the content.

    Args:
        filepath: path object of the JSON file (its ``.name`` is used in the message).

    Returns:
        The parsed JSON value.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        loaded = json.loads(handle.read())
    n_items = len(loaded)
    print(f"‚úì Cargados {n_items} ejemplos desde {filepath.name}")
    return loaded

def save_json(data, filepath):
    """Write ``data`` as indented UTF-8 JSON, creating parent directories as needed.

    Args:
        data: JSON-serialisable value.
        filepath: destination path object.
    """
    filepath.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps accented characters readable in the file
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with open(filepath, mode='w', encoding='utf-8') as handle:
        handle.write(serialized)
    print(f"‚úì Guardado en {filepath}")

def log_decision(decision):
    """Append a timestamped entry to FASE1_DECISIONES.md under BASE_DIR.

    Args:
        decision: text of the decision to record.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"\n[{stamp}] {decision}\n"
    with open(BASE_DIR / "FASE1_DECISIONES.md", 'a', encoding='utf-8') as log:
        log.write(entry)

def set_seed(seed=42):
    """Seed NumPy and PyTorch (and every CUDA device when CUDA is available).

    Args:
        seed: integer seed shared by all generators.
    """
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the NumPy and PyTorch generators before any model work so runs are repeatable
set_seed(Config.RANDOM_SEED)


# ============================================
# CLASE EVALUADOR (SINGLETON)
# ============================================

class BaselineEvaluator:
    """Baseline evaluator that is loaded ONCE and reused (singleton).

    Every call to ``BaselineEvaluator(...)`` returns the same instance; the
    models are loaded only by the first call. Re-executing the cell that
    defines this class creates a new class object whose ``_instance`` is
    ``None``, so the next construction loads the models again.
    """

    _instance = None  # the single shared instance

    # Ceiling for reported perplexity; also the value returned when scoring fails.
    MAX_PERPLEXITY = 100.0
    # Only the first few swallowed errors are printed, so a systematic failure
    # is visible without flooding the progress bar.
    MAX_ERROR_REPORTS = 5
    _error_reports = 0

    def __new__(cls, model_name="gpt2-xl"):
        """Singleton pattern: create the instance only the first time."""
        if cls._instance is None:
            cls._instance = super(BaselineEvaluator, cls).__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self, model_name="gpt2-xl"):
        """Load tokenizer, causal LM and embedding model unless already loaded.

        Args:
            model_name: model identifier passed to ``from_pretrained``. It is
                ignored (with a notice) when a model is already in memory.
        """
        if self._initialized:
            loaded_name = getattr(self, "model_name", model_name)
            if loaded_name != model_name:
                # The singleton cannot switch models; say so instead of
                # silently evaluating with a different model than requested.
                print(f"‚ö†Ô∏è  Modelo en memoria: {loaded_name}; se ignora '{model_name}'")
            print("‚úÖ Reutilizando modelo ya cargado en memoria\n")
            return

        print(f"ü§ñ Cargando modelo {model_name} (solo esta vez)...")
        self.device = Config.DEVICE
        print(f"   Dispositivo: {self.device}")

        if self.device == "mps":
            print("   ‚úÖ Usando GPU Apple Silicon (MPS)")
        elif self.device == "cpu":
            print("   ‚ö†Ô∏è  Usando CPU")

        print("   Cargando tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The tokenizer gets the EOS token as its padding token
        self.tokenizer.pad_token = self.tokenizer.eos_token

        print("   Cargando modelo (1-2 min)...")

        # float16 only on CUDA; MPS and CPU stay in float32 (same choice as before,
        # expressed once instead of in two duplicated branches)
        dtype = torch.float16 if self.device == "cuda" else torch.float32
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            low_cpu_mem_usage=True
        ).to(self.device)

        self.model.eval()
        print("   ‚úì Modelo cargado en MPS" if self.device == "mps" else "   ‚úì Modelo cargado")

        print("   Cargando embeddings...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("   ‚úì Embeddings cargados")

        self.model_name = model_name
        # Set last, so a failed load is retried by the next construction
        self._initialized = True
        print("\n‚úÖ Modelo listo y en memoria para reutilizaci√≥n\n")

    def _report_error(self, where, exc):
        """Print a swallowed exception (only the first MAX_ERROR_REPORTS of them)."""
        self._error_reports += 1
        if self._error_reports <= self.MAX_ERROR_REPORTS:
            print(f"   ‚ö†Ô∏è  {where}: {type(exc).__name__}: {exc}")

    def calculate_perplexity(self, prompt, target):
        """Perplexity of ``prompt + " " + target``, clamped to MAX_PERPLEXITY.

        The loss is taken over the whole concatenated text (the labels are the
        full input ids), not over the target tokens alone.

        Returns:
            float in (0, MAX_PERPLEXITY]; MAX_PERPLEXITY when scoring fails or
            the loss is NaN.
        """
        try:
            full_text = prompt + " " + target
            inputs = self.tokenizer(
                full_text,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)

            with torch.no_grad():
                loss = self.model(**inputs, labels=inputs.input_ids).loss

            perplexity = torch.exp(loss).item()
        except Exception as exc:
            self._report_error("calculate_perplexity", exc)
            return self.MAX_PERPLEXITY

        # min(nan, x) returns nan, which would poison the mean/median and
        # produce invalid JSON; treat it like a failed score.
        if np.isnan(perplexity):
            return self.MAX_PERPLEXITY
        return min(perplexity, self.MAX_PERPLEXITY)

    def generate_top_k(self, prompt, k=5, max_length=15):
        """Return the k beam-search continuations of ``prompt`` (prompt removed).

        Args:
            prompt: text to continue.
            k: number of beams and of returned sequences.
            max_length: maximum number of newly generated tokens.

        Returns:
            list of k stripped continuation strings; k empty strings on failure.
        """
        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True
            ).to(self.device)
            prompt_len = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    num_return_sequences=k,
                    do_sample=False,
                    num_beams=k,
                    early_stopping=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # Drop the prompt by token position instead of by decoded-text prefix,
            # so the continuation is isolated even if decoding does not reproduce
            # the prompt string exactly.
            return [
                self.tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True).strip()
                for sequence in outputs
            ]
        except Exception as exc:
            self._report_error("generate_top_k", exc)
            return [""] * k

    def check_exact_match(self, generated, target):
        """True when ``generated`` equals or starts with ``target`` (case-insensitive).

        An empty target never matches; otherwise every output would count as a hit.
        """
        gen = generated.lower().strip()
        tgt = target.lower().strip()
        if not tgt:
            return False
        return gen == tgt or gen.startswith(tgt)

    def check_in_top_k(self, top_k, target):
        """True when ``target`` occurs (case-insensitive) inside any of the outputs.

        An empty target never matches.
        """
        target_lower = target.lower().strip()
        if not target_lower:
            return False
        return any(target_lower in output.lower() for output in top_k)

    def calculate_similarity(self, text1, text2):
        """Cosine similarity of the two texts' embeddings, clipped to [0, 1].

        Returns 0.0 when either text is empty or when encoding fails.
        """
        if not text1 or not text2:
            return 0.0
        try:
            embeddings = self.embedding_model.encode([text1, text2])
            similarity = np.dot(embeddings[0], embeddings[1]) / (
                np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]) + 1e-8
            )
            return float(max(0.0, min(1.0, similarity)))
        except Exception as exc:
            self._report_error("calculate_similarity", exc)
            return 0.0

    def evaluate_example(self, example):
        """Score one example and return a flat result record.

        Args:
            example: mapping with required keys 'prompt' and 'target_new';
                'region', 'country', 'subject' and 'relation' are copied when present.

        Returns:
            dict with the inputs, the best and top-5 outputs, and the four metrics.
        """
        prompt = example['prompt']
        target = example['target_new']

        # Generate; the first beam is treated as the model's answer
        top_k = self.generate_top_k(prompt, k=5)
        best = top_k[0] if top_k else ""

        # Metrics
        exact = self.check_exact_match(best, target)
        in_top5 = self.check_in_top_k(top_k, target)
        perp = self.calculate_perplexity(prompt, target)
        sim = self.calculate_similarity(best, target)

        return {
            'prompt': prompt,
            'target_new': target,
            'model_output': best,
            'top5_outputs': top_k,
            'exact_match': exact,
            'in_top5': in_top5,
            'perplexity': perp,
            'semantic_similarity': sim,
            'region': example.get('region', ''),
            'country': example.get('country', ''),
            'subject': example.get('subject', ''),
            'relation': example.get('relation', '')
        }


# ============================================
# FUNCI√ìN DE EVALUACI√ìN
# ============================================

def evaluate_dataset(data_file, limit=None, output_suffix=""):
    """Evaluate one dataset file, print and save the results and a summary.

    Args:
        data_file: Path to the JSON dataset.
        limit: when truthy, only the first ``limit`` examples are evaluated.
        output_suffix: suffix appended to the output file names (e.g. "_fixed", "_v2").

    Returns:
        (results, summary); ``([], {})`` when no example could be evaluated.
    """

    print(f"\n{'='*70}")
    print(f"EVALUANDO: {data_file.name}")
    print(f"{'='*70}\n")

    # Load data
    data = load_json(data_file)

    if limit:
        data = data[:limit]
        print(f"‚ö†Ô∏è  MODO PRUEBA: {limit} ejemplos\n")

    # Get the evaluator (the singleton is reused when it already exists)
    evaluator = BaselineEvaluator(Config.MODEL_NAME)

    results = []
    errors = 0

    # Evaluate; failed examples are skipped but reported, never dropped silently
    for index, example in enumerate(tqdm(data, desc="Procesando")):
        try:
            results.append(evaluator.evaluate_example(example))
        except Exception as exc:
            errors += 1
            # tqdm.write keeps the message from breaking the progress bar
            tqdm.write(f"‚ö†Ô∏è  Ejemplo {index} omitido: {type(exc).__name__}: {exc}")
            if errors > 10:
                print(f"\n‚ùå Demasiados errores. Abortando.")
                break

    if errors:
        print(f"‚ö†Ô∏è  {errors} ejemplo(s) fallaron y no cuentan en las metricas")

    if len(results) == 0:
        return [], {}

    # Statistics
    n_results = len(results)
    exact = sum(1 for r in results if r['exact_match'])
    top5 = sum(1 for r in results if r['in_top5'])
    perplexities = [r['perplexity'] for r in results]
    avg_perp = np.mean(perplexities)
    med_perp = np.median(perplexities)
    std_perp = np.std(perplexities)
    avg_sim = np.mean([r['semantic_similarity'] for r in results])

    print(f"\n{'='*70}")
    print(f"üìä RESULTADOS")
    print(f"{'='*70}")
    print(f"Ejemplos:          {n_results}")
    print(f"Exact Match:       {exact}/{n_results} ({exact/n_results*100:.1f}%)")
    print(f"In Top-5:          {top5}/{n_results} ({top5/n_results*100:.1f}%)")
    print(f"Perplejidad:       {avg_perp:.2f} ¬± {std_perp:.2f}")
    print(f"Mediana Perp:      {med_perp:.2f}")
    print(f"Similitud:         {avg_sim:.3f}")
    print(f"{'='*70}\n")

    # Output names are built from the dataset file stem, e.g. "latam_all"
    base_name = data_file.stem

    # Save per-example results
    output_file = Config.PROCESSED_DIR / f"{base_name}_baseline{output_suffix}.json"
    save_json(results, output_file)

    # Save summary
    summary = {
        'dataset': str(data_file),
        'n_examples': n_results,
        'n_errors': errors,
        'exact_match_rate': exact / n_results,
        'top5_match_rate': top5 / n_results,
        'avg_perplexity': float(avg_perp),
        'median_perplexity': float(med_perp),
        'std_perplexity': float(std_perp),
        'avg_semantic_similarity': float(avg_sim)
    }

    summary_file = Config.RESULTS_DIR / f"{base_name}_baseline_summary{output_suffix}.json"
    save_json(summary, summary_file)

    return results, summary


def compare_datasets(summaries_dict):
    """Print a side-by-side table of dataset summaries.

    Does nothing when fewer than two summaries are given.

    Args:
        summaries_dict: mapping of dataset label -> summary dict holding
            'n_examples', 'exact_match_rate', 'top5_match_rate' and 'avg_perplexity'.
    """
    if len(summaries_dict) < 2:
        return

    rule = "="*70
    print("\n" + rule)
    print("üìä COMPARACI√ìN DE DATASETS")
    print(rule + "\n")

    # Table header and separator
    header = f"{'Dataset':<30} {'N':>6} {'Exact':>8} {'Top-5':>8} {'Perp':>8}"
    print(header)
    print("-"*70)

    # One row per dataset; rates are shown as percentages
    for label in summaries_dict:
        stats = summaries_dict[label]
        row = " ".join([
            f"{label:<30}",
            f"{stats['n_examples']:>6}",
            f"{stats['exact_match_rate']*100:>7.1f}%",
            f"{stats['top5_match_rate']*100:>7.1f}%",
            f"{stats['avg_perplexity']:>7.1f}",
        ])
        print(row)

    print()


# ============================================
# FUNCIONES DE CONVENIENCIA
# ============================================

def quick_eval(dataset_name, limit=100):
    """Evaluate ``<dataset_name>.json`` from the processed-data directory.

    Args:
        dataset_name: file stem inside Config.PROCESSED_DIR.
        limit: forwarded to evaluate_dataset.

    Returns:
        Whatever evaluate_dataset returns, or ``(None, None)`` when the file
        does not exist.

    Example:
        quick_eval("latam_all", limit=100)
        quick_eval("latam_all_fixed", limit=100)
    """
    data_file = Config.PROCESSED_DIR / f"{dataset_name}.json"

    if data_file.exists():
        return evaluate_dataset(data_file, limit=limit)

    print(f"‚ùå No existe: {data_file}")
    return None, None


def compare_versions(base_name, versions=("", "_fixed"), limit=100):
    """Evaluate several versions of a dataset and print a comparison table.

    Each entry of ``versions`` is a suffix appended to ``base_name`` to form
    the dataset name; ``<name>.json`` is looked up under
    ``Config.PROCESSED_DIR``. Versions whose file is missing are skipped.

    Args:
        base_name: Dataset name shared by all versions (e.g. "latam_all").
        versions: Iterable of suffixes. The default is an immutable tuple so
            the shared default value can never be modified between calls.
        limit: Forwarded to ``evaluate_dataset``.

    Returns:
        Dict mapping each evaluated dataset name to its summary.

    Example:
        compare_versions("latam_all", ["", "_fixed", "_v2"])
    """
    summaries = {}

    for version in versions:
        dataset_name = f"{base_name}{version}"
        data_file = Config.PROCESSED_DIR / f"{dataset_name}.json"

        if not data_file.exists():
            print(f"‚ö†Ô∏è  Saltando {dataset_name} (no existe)")
            continue

        print(f"\n{'='*70}")
        print(f"Evaluando versi√≥n: {dataset_name}")
        print(f"{'='*70}")

        _, summary = evaluate_dataset(data_file, limit=limit, output_suffix=version)

        # An empty summary has none of the keys compare_datasets reads, so it
        # is left out instead of triggering a KeyError in the table.
        if summary:
            summaries[dataset_name] = summary

    # Print the side-by-side table (a no-op with fewer than two summaries).
    compare_datasets(summaries)

    return summaries


# ============================================
# EJEMPLOS DE USO
# ============================================

# Startup banner: show the configured project root and the detected device.
print("="*70)
print("üéØ BASELINE EVALUATOR - VERSI√ìN OPTIMIZADA")
print("="*70)
print(f"BASE_DIR: {BASE_DIR}")
print(f"Dispositivo: {Config.DEVICE}")
print("="*70 + "\n")

# Usage instructions: example calls for the helpers defined in this cell.
print("üìñ INSTRUCCIONES DE USO:")
print()
print("# 1. Evaluar un dataset:")
print("   results, summary = quick_eval('latam_all', limit=100)")
print()
print("# 2. Evaluar otro dataset (SIN RECARGAR modelo):")
print("   results2, summary2 = quick_eval('latam_all_fixed', limit=100)")
print()
print("# 3. Comparar versiones:")
print("   compare_versions('latam_all', ['', '_fixed'], limit=100)")
print()
print("# 4. Evaluar dataset completo:")
print("   quick_eval('latam_all', limit=None)")
print()
print("="*70 + "\n")

# ============================================
# EJECUCI√ìN AUTOM√ÅTICA (OPCIONAL)
# ============================================

# Descomenta para ejecutar autom√°ticamente:

# print("üöÄ Ejecutando evaluaci√≥n autom√°tica...\n")

# # Evaluar versi√≥n original
# results1, summary1 = quick_eval("latam_all", limit=100)

# # Evaluar versi√≥n corregida (si existe)
# results2, summary2 = quick_eval("latam_all_fixed", limit=100)

# # Comparar
# if summary1 and summary2:
#     compare_datasets({
#         "Original": summary1,
#         "Corregido": summary2
#     })

üéØ BASELINE EVALUATOR - VERSI√ìN OPTIMIZADA
BASE_DIR: /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA
Dispositivo: mps

üìñ INSTRUCCIONES DE USO:

# 1. Evaluar un dataset:
   results, summary = quick_eval('latam_all', limit=100)

# 2. Evaluar otro dataset (SIN RECARGAR modelo):
   results2, summary2 = quick_eval('latam_all_fixed', limit=100)

# 3. Comparar versiones:
   compare_versions('latam_all', ['', '_fixed'], limit=100)

# 4. Evaluar dataset completo:
   quick_eval('latam_all', limit=None)




In [7]:
# Baseline run on the "latam_all" dataset with limit=100.
results2, summary2 = quick_eval("latam_all", limit=100)


EVALUANDO: latam_all.json

‚úì Cargados 7250 ejemplos desde latam_all.json
‚ö†Ô∏è  MODO PRUEBA: 100 ejemplos

ü§ñ Cargando modelo gpt2-xl (solo esta vez)...
   Dispositivo: mps
   ‚úÖ Usando GPU Apple Silicon (MPS)
   Cargando tokenizer...
   Cargando modelo (1-2 min)...
   ‚úì Modelo cargado en MPS
   Cargando embeddings...
   ‚úì Embeddings cargados

‚úÖ Modelo listo y en memoria para reutilizaci√≥n



Procesando: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [26:01<00:00, 15.61s/it]  


üìä RESULTADOS
Ejemplos:          100
Exact Match:       0/100 (0.0%)
In Top-5:          0/100 (0.0%)
Perplejidad:       87.74 ¬± 20.55
Mediana Perp:      100.00
Similitud:         0.323

‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/data/processed/latam_all_baseline.json
‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/results/fase1/latam_all_baseline_summary.json





In [8]:
# Baseline run on the "europe_all" dataset with limit=100.
results3, summary3 = quick_eval("europe_all", limit=100)


EVALUANDO: europe_all.json

‚úì Cargados 2183 ejemplos desde europe_all.json
‚ö†Ô∏è  MODO PRUEBA: 100 ejemplos

‚úÖ Reutilizando modelo ya cargado en memoria



Procesando: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [18:50<00:00, 11.30s/it]  


üìä RESULTADOS
Ejemplos:          100
Exact Match:       0/100 (0.0%)
In Top-5:          0/100 (0.0%)
Perplejidad:       98.50 ¬± 6.40
Mediana Perp:      100.00
Similitud:         0.287

‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/data/processed/europe_all_baseline.json
‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/results/fase1/europe_all_baseline_summary.json





# Nuevo modelo

In [2]:
"""
Script 2: Calcular m√©tricas baseline - MULTI-MODELO
Soporta: GPT-2, LLaMA, GPT-J, etc.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ============================================
# CONFIGURACI√ìN DE MODELOS
# ============================================

# Project root directory.
# NOTE(review): absolute path tied to one machine; adjust when running elsewhere.
BASE_DIR = Path("/Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA")

# Available models: short key -> model identifier ('name', used as
# Config.MODEL_NAME), parameter count label ('size') and a description.
AVAILABLE_MODELS = {
    # GPT-2 (original baseline)
    'gpt2-xl': {
        'name': 'gpt2-xl',
        'size': '1.5B',
        'description': 'GPT-2 XL - R√°pido pero limitado'
    },
    
    # LLaMA 3.2 (recommended for Mac M4)
    'llama-3.2-1b': {
        'name': 'meta-llama/Llama-3.2-1B',
        'size': '1B',
        'description': 'LLaMA 3.2 1B - R√°pido y moderno'
    },
    'llama-3.2-3b': {
        'name': 'meta-llama/Llama-3.2-3B',
        'size': '3B',
        'description': 'LLaMA 3.2 3B - Balance ideal (RECOMENDADO)'
    },
    
    # LLaMA 3.1 (larger)
    'llama-3.1-8b': {
        'name': 'meta-llama/Llama-3.1-8B',
        'size': '8B',
        'description': 'LLaMA 3.1 8B - Mejor calidad pero m√°s lento'
    },
    
    # GPT-J (alternative)
    'gpt-j-6b': {
        'name': 'EleutherAI/gpt-j-6B',
        'size': '6B',
        'description': 'GPT-J 6B - Buena alternativa'
    }
}

# Select the model here: must be one of the keys of AVAILABLE_MODELS.
SELECTED_MODEL = 'llama-3.1-8b'  # <- change this

# ============================================

class Config:
    """Static settings for the multi-model baseline: paths, model and device."""

    # Directory layout under the project root.
    DATA_DIR = BASE_DIR.joinpath("data")
    PROCESSED_DIR = DATA_DIR.joinpath("processed")
    RESULTS_DIR = BASE_DIR.joinpath("results", "fase1")

    # Entry of AVAILABLE_MODELS chosen through SELECTED_MODEL.
    MODEL_INFO = AVAILABLE_MODELS[SELECTED_MODEL]
    MODEL_NAME = MODEL_INFO['name']

    # Device preference order: CUDA, then Apple MPS, then CPU.
    DEVICE = (
        "cuda" if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available()
        else "cpu"
    )

    # Region keys and their display names.
    REGIONS = ['latam', 'europe']
    REGION_NAMES = {
        'latam': 'Latinoam√©rica',
        'europe': 'Europa (Grecia + N√≥rdica)',
    }

    RANDOM_SEED = 42

# Funciones auxiliares
def load_json(filepath):
    """Read a UTF-8 JSON file and report how many top-level items it holds.

    Args:
        filepath: Location of the file, as a ``pathlib.Path`` or a plain
            string path.

    Returns:
        The decoded JSON content.
    """
    # Normalise first so string paths work too (``.name`` exists only on Path).
    filepath = Path(filepath)
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"‚úì Cargados {len(data)} ejemplos desde {filepath.name}")
    return data

def save_json(data, filepath):
    """Write ``data`` as indented UTF-8 JSON, creating parent folders as needed.

    Args:
        data: JSON-serialisable object.
        filepath: Destination, as a ``pathlib.Path`` or a plain string path.
    """
    # Normalise first so string paths work too (``.parent`` exists only on Path).
    filepath = Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"‚úì Guardado en {filepath}")

def set_seed(seed=42):
    """Seed NumPy and PyTorch (plus every CUDA device when CUDA is available).

    Args:
        seed: Integer seed applied to all generators.
    """
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the random generators once, when the cell runs.
set_seed(Config.RANDOM_SEED)


# ============================================
# CLASE EVALUADOR (MULTI-MODELO)
# ============================================

class BaselineEvaluator:
    """Baseline evaluator that can be pointed at different causal language models.

    A single shared instance is kept on the class: constructing the evaluator
    again with the same model name reuses the loaded model, while a different
    model name replaces the shared instance and triggers a fresh load.
    """

    _instance = None
    _current_model = None

    # Upper bound applied to reported perplexities; also the value returned
    # when the perplexity computation fails.
    _PERPLEXITY_CAP = 100.0

    def __new__(cls, model_name=None):
        """Return the shared instance, replacing it when the model changes."""
        if model_name is None:
            model_name = Config.MODEL_NAME

        # A different model name discards the cached instance so that
        # __init__ performs a full reload.
        if cls._instance is None or cls._current_model != model_name:
            cls._instance = super(BaselineEvaluator, cls).__new__(cls)
            cls._instance._initialized = False
            cls._current_model = model_name

        return cls._instance

    def __init__(self, model_name=None):
        """Load tokenizer, language model and embedding model once per model.

        Args:
            model_name: Identifier passed to ``from_pretrained``; defaults to
                ``Config.MODEL_NAME``.
        """
        if model_name is None:
            model_name = Config.MODEL_NAME

        if self._initialized and self._current_model == model_name:
            print(f"‚úÖ Reutilizando {model_name} ya cargado\n")
            return

        print(f"\n{'='*70}")
        print(f"ü§ñ CARGANDO MODELO: {model_name}")
        print(f"{'='*70}")

        self.device = Config.DEVICE
        print(f"Dispositivo: {self.device}")

        if self.device == "mps":
            print("‚úÖ Usando GPU Apple Silicon (MPS)")

        # Tokenizer.
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # model repository to run locally; only use with trusted models.
        print("\nüìù Cargando tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )

        # Fall back to the EOS token when the tokenizer defines no pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("‚úì Tokenizer cargado")

        # Model.
        print("\nüîÑ Cargando modelo (esto puede tardar 1-3 min)...")

        # Half precision only on CUDA; MPS and CPU stay in float32.
        dtype = torch.float16 if self.device == "cuda" else torch.float32

        # NOTE(review): recent transformers releases warn that `torch_dtype`
        # is deprecated in favour of `dtype`; kept for older versions.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        ).to(self.device)

        self.model.eval()
        print("‚úì Modelo cargado")

        # Sentence-embedding model used for the semantic similarity metric.
        print("\nüî§ Cargando modelo de embeddings...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("‚úì Embeddings cargados")

        self._initialized = True

        print(f"\n{'='*70}")
        print("‚úÖ MODELO LISTO")
        print(f"{'='*70}\n")

    def calculate_perplexity(self, prompt, target):
        """Perplexity of ``prompt + " " + target`` under the model, capped.

        The loss is taken over the whole concatenated text (truncated to 512
        tokens), not over the target alone.

        Returns:
            ``exp(loss)`` limited to ``_PERPLEXITY_CAP``; the cap itself when
            the value is NaN or the computation raises.
        """
        try:
            full_text = prompt + " " + target
            inputs = self.tokenizer(
                full_text,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs, labels=inputs.input_ids)
                loss = outputs.loss

            perplexity = torch.exp(loss).item()
            # NaN compares unequal to itself and would slip through min().
            if perplexity != perplexity:
                return self._PERPLEXITY_CAP
            return min(perplexity, self._PERPLEXITY_CAP)

        except Exception as e:
            # Report the failure instead of hiding it behind the cap value.
            print(f"‚ö†Ô∏è  Error en perplejidad: {e}")
            return self._PERPLEXITY_CAP

    def generate_top_k(self, prompt, k=5, max_length=15):
        """Generate ``k`` continuations of ``prompt`` with beam search.

        Args:
            prompt: Text to continue.
            k: Number of beams and of returned sequences.
            max_length: Maximum number of new tokens per continuation.

        Returns:
            List of continuation strings without the prompt, or ``k`` empty
            strings when generation fails.
        """
        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True
            ).to(self.device)
            prompt_len = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    num_return_sequences=k,
                    do_sample=False,
                    num_beams=k,
                    early_stopping=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            generated = []
            for output in outputs:
                # Drop the prompt by token count: comparing decoded text with
                # the prompt string fails whenever decoding does not reproduce
                # the prompt exactly, which would leave the prompt in the output.
                continuation = output[prompt_len:]
                text = self.tokenizer.decode(continuation, skip_special_tokens=True)
                generated.append(text.strip())

            return generated
        except Exception as e:
            print(f"‚ö†Ô∏è  Error en generaci√≥n: {e}")
            return [""] * k

    def check_exact_match(self, generated, target):
        """True when the output equals the target or starts with it.

        Comparison is case-insensitive and ignores surrounding whitespace.
        An empty target never matches (every string starts with "").
        """
        gen = generated.lower().strip()
        tgt = target.lower().strip()
        if not tgt:
            return False
        return gen == tgt or gen.startswith(tgt)

    def check_in_top_k(self, top_k, target):
        """True when the target occurs (case-insensitively) in any output.

        An empty target never matches (the empty string is in every string).
        """
        target_lower = target.lower().strip()
        if not target_lower:
            return False
        return any(target_lower in output.lower() for output in top_k)

    def calculate_similarity(self, text1, text2):
        """Cosine similarity of the two texts' embeddings, clamped to [0, 1].

        Returns 0.0 when either text is empty or when encoding fails.
        """
        try:
            if not text1 or not text2:
                return 0.0

            embeddings = self.embedding_model.encode([text1, text2])
            # Small epsilon avoids division by zero for zero-norm vectors.
            similarity = np.dot(embeddings[0], embeddings[1]) / (
                np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]) + 1e-8
            )
            return float(max(0.0, min(1.0, similarity)))
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts the run.
            return 0.0

    def evaluate_example(self, example):
        """Compute all metrics for one example.

        Args:
            example: Dict with 'prompt' and 'target_new'; 'region', 'country',
                'subject' and 'relation' are copied when present.

        Returns:
            Dict with the inputs, the best and top-5 outputs, and the metrics.
        """
        prompt = example['prompt']
        target = example['target_new']

        # Generate; the first returned sequence is treated as the best one.
        top_k = self.generate_top_k(prompt, k=5)
        best = top_k[0] if top_k else ""

        # Metrics
        exact = self.check_exact_match(best, target)
        in_top5 = self.check_in_top_k(top_k, target)
        perp = self.calculate_perplexity(prompt, target)
        sim = self.calculate_similarity(best, target)

        return {
            'prompt': prompt,
            'target_new': target,
            'model_output': best,
            'top5_outputs': top_k,
            'exact_match': exact,
            'in_top5': in_top5,
            'perplexity': perp,
            'semantic_similarity': sim,
            'region': example.get('region', ''),
            'country': example.get('country', ''),
            'subject': example.get('subject', ''),
            'relation': example.get('relation', '')
        }


# ============================================
# FUNCI√ìN DE EVALUACI√ìN
# ============================================

def evaluate_dataset(data_file, limit=None, model_name=None):
    """Evaluate a dataset file with the given model and save the results.

    Args:
        data_file: ``pathlib.Path`` of the JSON dataset (its ``name`` and
            ``stem`` are used for messages and output file names).
        limit: When truthy, only the first ``limit`` examples are evaluated.
        model_name: Model identifier handed to ``BaselineEvaluator``;
            defaults to ``Config.MODEL_NAME``.

    Returns:
        ``(results, summary)``: the per-example result dicts and a dict of
        aggregate metrics, or ``([], {})`` when no example was evaluated.

    Side effects:
        Writes the results under ``Config.PROCESSED_DIR`` and the summary
        under ``Config.RESULTS_DIR``, both suffixed with the model name.
    """

    if model_name is None:
        model_name = Config.MODEL_NAME

    print(f"\n{'='*70}")
    print(f"EVALUANDO: {data_file.name}")
    print(f"MODELO: {model_name}")
    print(f"{'='*70}\n")

    # Load data
    data = load_json(data_file)

    if limit:
        data = data[:limit]
        print(f"‚ö†Ô∏è  MODO PRUEBA: {limit} ejemplos\n")

    # Get the (shared) evaluator
    evaluator = BaselineEvaluator(model_name)

    results = []
    errors = 0

    # Evaluate
    for i, example in enumerate(tqdm(data, desc="Procesando")):
        try:
            result = evaluator.evaluate_example(example)
            results.append(result)
        except Exception as e:
            errors += 1
            # Show the first five errors, wherever in the dataset they occur
            # (gating on the example index hid every error after example 4).
            if errors <= 5:
                print(f"\n‚ö†Ô∏è  Error en ejemplo {i}: {e}")
            if errors > 10:
                print(f"\n‚ùå Demasiados errores. Abortando.")
                break

    if len(results) == 0:
        return [], {}

    # Statistics
    n_results = len(results)
    exact = sum(1 for r in results if r['exact_match'])
    top5 = sum(1 for r in results if r['in_top5'])
    perplexities = [r['perplexity'] for r in results]
    avg_perp = np.mean(perplexities)
    med_perp = np.median(perplexities)
    std_perp = np.std(perplexities)
    avg_sim = np.mean([r['semantic_similarity'] for r in results])

    print(f"\n{'='*70}")
    print(f"üìä RESULTADOS")
    print(f"{'='*70}")
    print(f"Modelo:            {model_name}")
    print(f"Ejemplos:          {n_results}")
    print(f"Exact Match:       {exact}/{n_results} ({exact/n_results*100:.1f}%)")
    print(f"In Top-5:          {top5}/{n_results} ({top5/n_results*100:.1f}%)")
    print(f"Perplejidad:       {avg_perp:.2f} ¬± {std_perp:.2f}")
    print(f"Mediana Perp:      {med_perp:.2f}")
    print(f"Similitud:         {avg_sim:.3f}")
    print(f"{'='*70}\n")

    # File names carry the model name: last path component, dots replaced.
    model_suffix = model_name.split('/')[-1].replace('.', '_')
    base_name = data_file.stem

    output_file = Config.PROCESSED_DIR / f"{base_name}_baseline_{model_suffix}.json"
    save_json(results, output_file)

    summary = {
        'model': model_name,
        'dataset': str(data_file),
        'n_examples': n_results,
        'exact_match_rate': exact / n_results,
        'top5_match_rate': top5 / n_results,
        'avg_perplexity': float(avg_perp),
        'median_perplexity': float(med_perp),
        'std_perplexity': float(std_perp),
        'avg_semantic_similarity': float(avg_sim)
    }

    summary_file = Config.RESULTS_DIR / f"{base_name}_summary_{model_suffix}.json"
    save_json(summary, summary_file)

    return results, summary


# ============================================
# FUNCIONES DE CONVENIENCIA
# ============================================

def quick_eval(dataset_name, limit=100, model=None):
    """Evaluate ``<dataset_name>.json`` from ``Config.PROCESSED_DIR``.

    Args:
        dataset_name: Dataset file name without the ``.json`` suffix.
        limit: Forwarded to ``evaluate_dataset``.
        model: Model identifier; ``Config.MODEL_NAME`` when omitted.

    Returns:
        The result of ``evaluate_dataset``, or ``(None, None)`` when the
        dataset file does not exist.
    """
    chosen_model = Config.MODEL_NAME if model is None else model
    data_file = Config.PROCESSED_DIR / "{}.json".format(dataset_name)

    if data_file.exists():
        return evaluate_dataset(data_file, limit=limit, model_name=chosen_model)

    # Missing file: report it and return an empty pair.
    print(f"‚ùå No existe: {data_file}")
    return None, None


def compare_models(dataset_name, models=('gpt2-xl', 'llama-3.2-3b'), limit=100):
    """Evaluate one dataset with several models and print a comparison table.

    Args:
        dataset_name: Dataset name forwarded to ``quick_eval``.
        models: Iterable of keys of ``AVAILABLE_MODELS``; unknown keys are
            reported and skipped. The default is an immutable tuple so the
            shared default value can never be modified between calls.
        limit: Forwarded to ``quick_eval``.

    Returns:
        Dict mapping each model key to its summary (only for models whose
        evaluation produced a non-empty summary).
    """

    print("\n" + "="*70)
    print(f"üî¨ COMPARACI√ìN DE MODELOS EN: {dataset_name}")
    print("="*70 + "\n")

    summaries = {}

    for model_key in models:
        if model_key not in AVAILABLE_MODELS:
            print(f"‚ö†Ô∏è  Modelo desconocido: {model_key}")
            continue

        model_name = AVAILABLE_MODELS[model_key]['name']

        print(f"\n{'='*70}")
        print(f"Evaluando con: {AVAILABLE_MODELS[model_key]['description']}")
        print(f"{'='*70}")

        _, summary = quick_eval(dataset_name, limit=limit, model=model_name)

        # quick_eval yields None or an empty dict when nothing was evaluated.
        if summary:
            summaries[model_key] = summary

    # Comparison table, only meaningful with at least two models.
    if len(summaries) >= 2:
        print("\n" + "="*70)
        print("üìä COMPARACI√ìN DE RESULTADOS")
        print("="*70 + "\n")

        print(f"{'Modelo':<20} {'Exact':>8} {'Top-5':>8} {'Perp':>8}")
        print("-"*50)

        for model_key, summary in summaries.items():
            # Label rows with the model key plus its size: the size alone
            # cannot tell apart two models with the same parameter count.
            label = f"{model_key} ({AVAILABLE_MODELS[model_key]['size']})"
            print(f"{label:<20} "
                  f"{summary['exact_match_rate']*100:>7.1f}% "
                  f"{summary['top5_match_rate']*100:>7.1f}% "
                  f"{summary['avg_perplexity']:>7.1f}")

    return summaries


# ============================================
# INICIO
# ============================================

# Startup banner: report the project root, the device and the selected model.
print("="*70)
print("üéØ BASELINE EVALUATOR - MULTI-MODELO")
print("="*70)
print(f"BASE_DIR: {BASE_DIR}")
print(f"Dispositivo: {Config.DEVICE}")
print(f"\nü§ñ Modelo seleccionado: {SELECTED_MODEL}")
print(f"   {AVAILABLE_MODELS[SELECTED_MODEL]['description']}")
print("="*70 + "\n")

# List every registered model, marking the one currently selected.
print("üìñ MODELOS DISPONIBLES:")
for key, info in AVAILABLE_MODELS.items():
    marker = "üëâ" if key == SELECTED_MODEL else "  "
    print(f"{marker} {key:<20} - {info['description']}")

# Usage examples for the helpers defined in this cell.
print("\n" + "="*70)
print("üìñ EJEMPLOS DE USO:")
print("="*70)
print()
print("# 1. Evaluar con modelo por defecto:")
print("   results, summary = quick_eval('latam_all_fixed', limit=100)")
print()
print("# 2. Evaluar con modelo espec√≠fico:")
print("   results, summary = quick_eval('latam_all_fixed', limit=100, model='meta-llama/Llama-3.2-3B')")
print()
print("# 3. Comparar modelos:")
print("   compare_models('latam_all_fixed', ['gpt2-xl', 'llama-3.2-3b'], limit=100)")
print()
print("="*70 + "\n")

  from .autonotebook import tqdm as notebook_tqdm


üéØ BASELINE EVALUATOR - MULTI-MODELO
BASE_DIR: /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA
Dispositivo: mps

ü§ñ Modelo seleccionado: llama-3.1-8b
   LLaMA 3.1 8B - Mejor calidad pero m√°s lento

üìñ MODELOS DISPONIBLES:
   gpt2-xl              - GPT-2 XL - R√°pido pero limitado
   llama-3.2-1b         - LLaMA 3.2 1B - R√°pido y moderno
   llama-3.2-3b         - LLaMA 3.2 3B - Balance ideal (RECOMENDADO)
üëâ llama-3.1-8b         - LLaMA 3.1 8B - Mejor calidad pero m√°s lento
   gpt-j-6b             - GPT-J 6B - Buena alternativa

üìñ EJEMPLOS DE USO:

# 1. Evaluar con modelo por defecto:
   results, summary = quick_eval('latam_all_fixed', limit=100)

# 2. Evaluar con modelo espec√≠fico:
   results, summary = quick_eval('latam_all_fixed', limit=100, model='meta-llama/Llama-3.2-3B')

# 3. Comparar modelos:
   compare_models('latam_all_fixed', ['gpt2-xl', 'llama-3.2-3b'], limit=100)




In [3]:
# Evaluate "latam_all" with the selected model (Config.MODEL_NAME), limit=100.
results4, summary4 = quick_eval("latam_all", limit=100)


EVALUANDO: latam_all.json
MODELO: meta-llama/Llama-3.1-8B

‚úì Cargados 7250 ejemplos desde latam_all.json
‚ö†Ô∏è  MODO PRUEBA: 100 ejemplos


ü§ñ CARGANDO MODELO: meta-llama/Llama-3.1-8B
Dispositivo: mps
‚úÖ Usando GPU Apple Silicon (MPS)

üìù Cargando tokenizer...


`torch_dtype` is deprecated! Use `dtype` instead!


‚úì Tokenizer cargado

üîÑ Cargando modelo (esto puede tardar 1-3 min)...


Loading checkpoint shards:  25%|‚ñà‚ñà‚ñå       | 1/4 [00:13<00:41, 13.81s/it]

: 

In [17]:
"""
Baseline con Qwen 2.5 3B (sin restricciones)
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ============================================
# CONFIGURACI√ìN
# ============================================

# Project root directory.
# NOTE(review): absolute path tied to one machine; adjust when running elsewhere.
BASE_DIR = Path("/Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA")

class Config:
    """Static settings for the Qwen baseline: paths, model name and device."""

    # Directory layout under the project root.
    DATA_DIR = BASE_DIR.joinpath("data")
    PROCESSED_DIR = DATA_DIR.joinpath("processed")
    RESULTS_DIR = BASE_DIR.joinpath("results", "fase1")

    # Model identifier passed to from_pretrained.
    MODEL_NAME = "Qwen/Qwen2.5-3B"

    # Device preference order: CUDA, then Apple MPS, then CPU.
    DEVICE = (
        "cuda" if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available()
        else "cpu"
    )

    # Region keys and their display names.
    REGIONS = ['latam', 'europe']
    REGION_NAMES = {
        'latam': 'Latinoam√©rica',
        'europe': 'Europa (Grecia + N√≥rdica)',
    }

    RANDOM_SEED = 42

def load_json(filepath):
    """Load a UTF-8 JSON file and print how many top-level items it contains.

    Args:
        filepath: ``pathlib.Path`` or plain string path of the file.

    Returns:
        The decoded JSON content.
    """
    # Accept string paths as well: ``.name`` below needs a Path object.
    filepath = Path(filepath)
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"‚úì Cargados {len(data)} ejemplos desde {filepath.name}")
    return data

def save_json(data, filepath):
    """Save ``data`` as indented UTF-8 JSON, creating missing parent folders.

    Args:
        data: JSON-serialisable object.
        filepath: ``pathlib.Path`` or plain string path of the destination.
    """
    # Accept string paths as well: ``.parent`` below needs a Path object.
    filepath = Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        # ensure_ascii=False writes non-ASCII characters as-is.
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"‚úì Guardado en {filepath}")

def set_seed(seed=42):
    """Seed the NumPy and PyTorch random generators with the same value.

    Args:
        seed: Integer seed.
    """
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)

# Seed the random generators once, when the cell runs.
set_seed(Config.RANDOM_SEED)

# ============================================
# EVALUADOR
# ============================================

class BaselineEvaluator:
    """Baseline evaluator for the model named in ``Config.MODEL_NAME``.

    A single shared instance is kept on the class so the model is loaded
    only the first time the evaluator is constructed.
    """

    _instance = None

    def __new__(cls):
        """Create the shared instance on first use and return it afterwards."""
        if cls._instance is None:
            cls._instance = super(BaselineEvaluator, cls).__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        """Load tokenizer, language model and embedding model (first call only)."""
        if self._initialized:
            print("‚úÖ Reutilizando modelo ya cargado\n")
            return

        print(f"\n{'='*70}")
        print(f"ü§ñ CARGANDO: Qwen 2.5 3B")
        print(f"{'='*70}")

        self.device = Config.DEVICE
        print(f"Dispositivo: {self.device}")

        # NOTE(review): trust_remote_code=True allows code shipped with the
        # model repository to run locally; only use with trusted models.
        print("\nüìù Cargando tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            Config.MODEL_NAME,
            trust_remote_code=True
        )

        # Fall back to the EOS token when the tokenizer defines no pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("‚úì Tokenizer cargado")

        print("\nüîÑ Cargando modelo (1-2 min)...")

        # Half precision only on CUDA. The previous expression selected
        # float16 for every non-MPS device, which included the CPU.
        dtype = torch.float16 if self.device == "cuda" else torch.float32

        self.model = AutoModelForCausalLM.from_pretrained(
            Config.MODEL_NAME,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        ).to(self.device)

        self.model.eval()
        print("‚úì Modelo cargado")

        # Sentence-embedding model used for the semantic similarity metric.
        print("\nüî§ Cargando embeddings...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("‚úì Embeddings cargados")

        self._initialized = True

        print(f"\n{'='*70}")
        print("‚úÖ MODELO LISTO")
        print(f"{'='*70}\n")

    def generate_top_k(self, prompt, k=5, max_length=15):
        """Generate ``k`` continuations of ``prompt`` with beam search.

        Args:
            prompt: Text to continue.
            k: Number of beams and of returned sequences.
            max_length: Maximum number of new tokens per continuation.

        Returns:
            List of continuation strings without the prompt, or ``k`` empty
            strings when generation fails.
        """
        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True
            ).to(self.device)
            prompt_len = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    num_return_sequences=k,
                    do_sample=False,
                    num_beams=k,
                    early_stopping=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            generated = []
            for output in outputs:
                # Drop the prompt by token count: comparing decoded text with
                # the prompt string fails whenever decoding does not reproduce
                # the prompt exactly, which would leave the prompt in the output.
                continuation = output[prompt_len:]
                text = self.tokenizer.decode(continuation, skip_special_tokens=True)
                generated.append(text.strip())

            return generated
        except Exception as e:
            # Report the failure; otherwise every metric silently becomes 0.
            print(f"‚ö†Ô∏è  Error en generaci√≥n: {e}")
            return [""] * k

    def check_exact_match(self, generated, target):
        """True when the output equals the target or starts with it.

        Comparison is case-insensitive and ignores surrounding whitespace.
        An empty target never matches (every string starts with "").
        """
        gen = generated.lower().strip()
        tgt = target.lower().strip()
        if not tgt:
            return False
        return gen == tgt or gen.startswith(tgt)

    def check_in_top_k(self, top_k, target):
        """True when the target occurs (case-insensitively) in any output.

        An empty target never matches (the empty string is in every string).
        """
        target_lower = target.lower().strip()
        if not target_lower:
            return False
        return any(target_lower in output.lower() for output in top_k)

    def calculate_similarity(self, text1, text2):
        """Cosine similarity of the two texts' embeddings, clamped to [0, 1].

        Returns 0.0 when either text is empty or when encoding fails.
        """
        try:
            if not text1 or not text2:
                return 0.0

            embeddings = self.embedding_model.encode([text1, text2])
            # Small epsilon avoids division by zero for zero-norm vectors.
            similarity = np.dot(embeddings[0], embeddings[1]) / (
                np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]) + 1e-8
            )
            return float(max(0.0, min(1.0, similarity)))
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts the run.
            return 0.0

    def evaluate_example(self, example):
        """Compute all metrics for one example.

        Args:
            example: Dict with 'prompt' and 'target_new'; 'region' and
                'country' are copied when present.

        Returns:
            Dict with the inputs, the best and top-5 outputs, and the metrics.
        """
        prompt = example['prompt']
        target = example['target_new']

        # The first returned sequence is treated as the best one.
        top_k = self.generate_top_k(prompt, k=5)
        best = top_k[0] if top_k else ""

        exact = self.check_exact_match(best, target)
        in_top5 = self.check_in_top_k(top_k, target)
        sim = self.calculate_similarity(best, target)

        return {
            'prompt': prompt,
            'target_new': target,
            'model_output': best,
            'top5_outputs': top_k,
            'exact_match': exact,
            'in_top5': in_top5,
            'semantic_similarity': sim,
            'region': example.get('region', ''),
            'country': example.get('country', ''),
        }

# ============================================
# FUNCI√ìN DE EVALUACI√ìN
# ============================================

def quick_eval(dataset_name, limit=100):
    """Evaluate ``<dataset_name>.json`` with the Qwen baseline evaluator.

    Args:
        dataset_name: Dataset file name (without ``.json``) under
            ``Config.PROCESSED_DIR``.
        limit: When truthy, only the first ``limit`` examples are evaluated.

    Returns:
        ``(results, summary)``; ``(None, None)`` when the file does not
        exist and ``([], {})`` when no example could be evaluated.

    Side effects:
        Writes the per-example results to
        ``<dataset_name>_baseline_qwen.json`` under ``Config.PROCESSED_DIR``.
    """

    data_file = Config.PROCESSED_DIR / f"{dataset_name}.json"

    if not data_file.exists():
        print(f"‚ùå No existe: {data_file}")
        return None, None

    print(f"\n{'='*70}")
    print(f"EVALUANDO: {data_file.name}")
    print(f"{'='*70}\n")

    data = load_json(data_file)

    if limit:
        data = data[:limit]
        print(f"‚ö†Ô∏è  MODO PRUEBA: {limit} ejemplos\n")

    evaluator = BaselineEvaluator()

    results = []
    errors = 0

    for i, example in enumerate(tqdm(data, desc="Procesando")):
        try:
            results.append(evaluator.evaluate_example(example))
        except Exception as e:
            # Keep going, but surface the first few failures instead of
            # dropping them without a trace.
            errors += 1
            if errors <= 5:
                print(f"\n‚ö†Ô∏è  Error en ejemplo {i}: {e}")
            continue

    # Without results the rates below would divide by zero.
    if not results:
        return [], {}

    # Statistics
    n_results = len(results)
    exact = sum(1 for r in results if r['exact_match'])
    top5 = sum(1 for r in results if r['in_top5'])
    avg_sim = np.mean([r['semantic_similarity'] for r in results])

    print(f"\n{'='*70}")
    print(f"üìä RESULTADOS")
    print(f"{'='*70}")
    print(f"Modelo:            Qwen 2.5 3B")
    print(f"Ejemplos:          {n_results}")
    print(f"Exact Match:       {exact}/{n_results} ({exact/n_results*100:.1f}%)")
    print(f"In Top-5:          {top5}/{n_results} ({top5/n_results*100:.1f}%)")
    print(f"Similitud:         {avg_sim:.3f}")
    print(f"{'='*70}\n")

    # Save the per-example results
    output_file = Config.PROCESSED_DIR / f"{dataset_name}_baseline_qwen.json"
    save_json(results, output_file)

    summary = {
        'model': 'Qwen/Qwen2.5-3B',
        'n_examples': n_results,
        'exact_match_rate': exact / n_results,
        'top5_match_rate': top5 / n_results,
        'avg_semantic_similarity': float(avg_sim)
    }

    return results, summary

# ============================================
# INICIO
# ============================================

# Startup banner: report the project root, the device and the model in use.
print("="*70)
print("üéØ BASELINE EVALUATOR")
print("="*70)
print(f"BASE_DIR: {BASE_DIR}")
print(f"Dispositivo: {Config.DEVICE}")
print(f"Modelo: Qwen 2.5 3B (sin restricciones)")
print("="*70 + "\n")

# Usage example for quick_eval.
print("üìñ USO:")
print("   results, summary = quick_eval('latam_all_fixed', limit=100)")
print("="*70 + "\n")

üéØ BASELINE EVALUATOR
BASE_DIR: /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA
Dispositivo: mps
Modelo: Qwen 2.5 3B (sin restricciones)

üìñ USO:
   results, summary = quick_eval('latam_all_fixed', limit=100)



In [None]:
# Run the Qwen 2.5 3B baseline on "europe_all" with limit=200.
results, summary = quick_eval("europe_all", limit=200)



EVALUANDO: europe_all.json

‚úì Cargados 2183 ejemplos desde europe_all.json
‚ö†Ô∏è  MODO PRUEBA: 200 ejemplos


ü§ñ CARGANDO: Qwen 2.5 3B
Dispositivo: mps

üìù Cargando tokenizer...
‚úì Tokenizer cargado

üîÑ Cargando modelo (1-2 min)...


Fetching 2 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [01:39<00:00, 49.53s/it]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [07:52<00:00, 236.13s/it]


In [5]:
"""
Baseline usando Ollama (llama3.1:8b)
Versi√≥n optimizada sin cargar modelo en memoria del notebook
"""

import requests
import json
from pathlib import Path
from tqdm import tqdm
import numpy as np
import time

# ============================================
# CONFIGURACI√ìN
# ============================================

# Project root directory.
# NOTE(review): absolute path tied to one machine; adjust when running elsewhere.
BASE_DIR = Path("/Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA")

class Config:
    """Static settings for the Ollama baseline: paths and API endpoint."""

    # Directory layout under the project root.
    DATA_DIR = BASE_DIR.joinpath("data")
    PROCESSED_DIR = DATA_DIR.joinpath("processed")
    RESULTS_DIR = BASE_DIR.joinpath("results", "fase1")

    # Ollama settings.
    # NOTE(review): this tag differs from the "llama3.1:8b" named in the cell
    # docstring and used as OllamaEvaluator's default; confirm which is meant.
    OLLAMA_URL = "http://localhost:11434/api/generate"
    OLLAMA_MODEL = "llama3:8b"

    # Region keys and their display names.
    REGIONS = ['latam', 'europe']
    REGION_NAMES = {
        'latam': 'Latinoam√©rica',
        'europe': 'Europa (Grecia + N√≥rdica)',
    }

def load_json(filepath):
    """Read a UTF-8 JSON file, report its item count, and return the parsed data.

    Args:
        filepath: Path-like object exposing ``.name`` (e.g. ``pathlib.Path``).

    Returns:
        The deserialized JSON content (must support ``len``).
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    print(f"‚úì Cargados {len(parsed)} ejemplos desde {filepath.name}")
    return parsed

def save_json(data, filepath):
    """Write ``data`` as indented UTF-8 JSON, creating parent folders as needed.

    Args:
        data: JSON-serializable object.
        filepath: ``pathlib.Path``-like destination (``.parent`` is used).
    """
    target_dir = filepath.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(filepath, mode='w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(data, handle, ensure_ascii=False, indent=2)
    print(f"‚úì Guardado en {filepath}")


# ============================================
# EVALUADOR CON OLLAMA
# ============================================

class OllamaEvaluator:
    """Baseline evaluator that queries a local Ollama server over HTTP.

    Each example's prompt is completed several times (one greedy pass plus
    sampled passes); the greedy output is scored against the target with a
    containment check and a word-level Jaccard overlap.
    """

    def __init__(self, model_name="llama3.1:8b"):
        """Store the model tag and verify that the Ollama endpoint answers.

        Args:
            model_name: Ollama model tag sent with every request.

        Raises:
            ConnectionError: If the probe request does not return HTTP 200.
        """
        self.model_name = model_name
        self.api_url = Config.OLLAMA_URL

        print(f"\n{'='*70}")
        print(f"ü§ñ CONFIGURANDO OLLAMA")
        print(f"{'='*70}")
        print(f"Modelo: {model_name}")
        print(f"URL: {self.api_url}")

        # Probe the endpoint once so a dead server fails fast, before the loop.
        if self._test_connection():
            print("‚úÖ Ollama conectado correctamente")
        else:
            print("‚ùå Error: Ollama no responde")
            print("\nüí° SOLUCIONES:")
            print("1. Inicia Ollama: 'ollama serve' en otra terminal")
            print("2. Verifica que el modelo est√© instalado: 'ollama list'")
            # Point the hint at the model actually requested, not a fixed tag.
            print(f"3. Si no tienes el modelo: 'ollama pull {model_name}'")
            raise ConnectionError("No se pudo conectar a Ollama")

        print(f"{'='*70}\n")

    def _test_connection(self):
        """Return True if a tiny generation request comes back with HTTP 200.

        Any exception (connection refused, timeout, ...) is reported as False.
        """
        try:
            response = requests.post(
                self.api_url,
                json={
                    "model": self.model_name,
                    "prompt": "Hi",
                    "stream": False
                },
                timeout=10
            )
            return response.status_code == 200
        except Exception:
            return False

    def generate(self, prompt, max_tokens=30, temperature=0.0):
        """Generate one completion with Ollama.

        Args:
            prompt: Input text.
            max_tokens: Maximum number of tokens to generate.
            temperature: 0.0 = deterministic, higher values = more variation.

        Returns:
            The stripped completion text, or "" on any HTTP error, timeout,
            or other exception (a warning is printed instead of raising).
        """
        try:
            response = requests.post(
                self.api_url,
                json={
                    "model": self.model_name,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "num_predict": max_tokens,
                        "temperature": temperature,
                        "top_p": 1.0,
                        # Stop at the first newline or punctuation mark.
                        "stop": ["\n", ".", ",", ";", "?", "!"]
                    }
                },
                timeout=30  # seconds
            )

            if response.status_code == 200:
                result = response.json()
                generated = result['response'].strip()
                return generated
            else:
                print(f"‚ö†Ô∏è  Error {response.status_code}")
                return ""

        except requests.Timeout:
            print("‚ö†Ô∏è  Timeout - Ollama tard√≥ mucho")
            return ""
        except Exception as e:
            print(f"‚ö†Ô∏è  Error: {e}")
            return ""

    def generate_multiple(self, prompt, n=5, max_tokens=30):
        """Generate up to ``n`` distinct completions (a stand-in for top-k).

        The first call is greedy (temperature 0.0); the remaining calls sample
        with temperature ``0.3 + 0.1 * i``.

        Args:
            prompt: Input text.
            n: Number of slots in the returned list.
            max_tokens: Maximum tokens per completion.

        Returns:
            A list of exactly ``n`` strings. Index 0 is ALWAYS the greedy
            output (possibly "" if that call failed), followed by the distinct
            non-empty sampled outputs, padded with "" up to ``n``.
        """
        if n <= 0:
            return []

        # Slot 0 is reserved for the deterministic pass so callers that read
        # results[0] as "the model's answer" never receive a sampled output.
        results = [self.generate(prompt, max_tokens, temperature=0.0)]

        for i in range(1, n):
            # Short pause between requests so the local server is not flooded.
            time.sleep(0.1)
            temp = 0.3 + (i * 0.1)
            sampled = self.generate(prompt, max_tokens, temperature=temp)
            if sampled and sampled not in results:
                results.append(sampled)

        # Pad with empty strings when fewer than n distinct outputs were found.
        results.extend([""] * (n - len(results)))
        return results

    def check_exact_match(self, generated, target):
        """Return True if the target appears in the generated text.

        Despite its name this is a case-insensitive containment check (it
        covers equality and prefix matches too). Empty or whitespace-only
        inputs never match.
        """
        if not generated or not target:
            return False

        gen = generated.lower().strip()
        tgt = target.lower().strip()

        # An empty needle is contained in every string; treat it as no match.
        if not tgt:
            return False

        return tgt in gen

    def check_in_top_k(self, top_k, target):
        """Return True if the target appears in any of the generations.

        Case-insensitive containment. An empty/None target or empty outputs
        (the "" padding from ``generate_multiple``) never count as a hit.
        """
        if not target:
            return False

        target_lower = target.lower().strip()
        if not target_lower:
            return False

        return any(target_lower in output.lower() for output in top_k if output)

    def calculate_similarity(self, text1, text2):
        """Word-level Jaccard overlap between two texts (no embeddings).

        Returns:
            ``|words1 & words2| / |words1 | words2|`` over lower-cased,
            whitespace-split tokens; 0.0 if either text has no tokens.
        """
        if not text1 or not text2:
            return 0.0

        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())

        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    def evaluate_example(self, example, show_details=False):
        """Evaluate one example and return its metrics as a dict.

        Args:
            example: Dict with at least ``'prompt'`` and ``'target_new'``;
                ``'region'``, ``'country'`` and ``'subject'`` are copied
                through when present.
            show_details: If True, print the prompt, target and scores.

        Returns:
            Dict with the prompt, target, greedy output, all generations,
            and the ``exact_match`` / ``in_top5`` / ``semantic_similarity``
            metrics plus the pass-through metadata fields.
        """
        prompt = example['prompt']
        target = example['target_new']

        # One greedy generation (index 0) plus sampled alternatives.
        top_k = self.generate_multiple(prompt, n=5)
        best = top_k[0] if top_k else ""

        # Metrics
        exact = self.check_exact_match(best, target)
        in_top5 = self.check_in_top_k(top_k, target)
        similarity = self.calculate_similarity(best, target)

        if show_details:
            print(f"\n{'='*50}")
            print(f"Prompt:  {prompt}")
            print(f"Target:  {target}")
            print(f"Best:    {best}")
            print(f"Exact:   {exact}")
            print(f"Top-5:   {in_top5}")
            print(f"Sim:     {similarity:.2f}")

        return {
            'prompt': prompt,
            'target_new': target,
            'model_output': best,
            'top5_outputs': top_k,
            'exact_match': exact,
            'in_top5': in_top5,
            'semantic_similarity': similarity,
            'region': example.get('region', ''),
            'country': example.get('country', ''),
            'subject': example.get('subject', ''),
        }


# ============================================
# FUNCI√ìN DE EVALUACI√ìN
# ============================================

def evaluate_dataset(data_file, limit=None, model_name=None):
    """Evaluate a whole dataset with Ollama and persist results plus a summary.

    Args:
        data_file: ``pathlib.Path`` to a JSON list of examples.
        limit: If truthy, only the first ``limit`` examples are evaluated.
        model_name: Ollama model tag; defaults to ``Config.OLLAMA_MODEL`` so
            the evaluator, the printed report and the saved summary always
            name the same model.

    Returns:
        ``(results, summary)`` where ``results`` is the list of per-example
        dicts and ``summary`` the aggregate metrics; ``([], {})`` when nothing
        was evaluated or the user discards partial results after Ctrl-C.

    Raises:
        ConnectionError: Propagated from ``OllamaEvaluator`` when the server
            does not respond.

    Note:
        ``errors`` only counts exceptions that escape ``evaluate_example``.
        HTTP failures and timeouts are swallowed inside
        ``OllamaEvaluator.generate`` (which returns "") and are therefore
        scored as misses, not counted as errors.
    """
    # Resolve the model once so every use below reports the same tag.
    if model_name is None:
        model_name = Config.OLLAMA_MODEL

    print(f"\n{'='*70}")
    print(f"EVALUANDO: {data_file.name}")
    print(f"{'='*70}\n")

    # Load data
    data = load_json(data_file)

    if limit:
        data = data[:limit]
        print(f"‚ö†Ô∏è  MODO PRUEBA: {limit} ejemplos\n")
    else:
        print(f"üìä Evaluando {len(data)} ejemplos\n")

    # Initialise the evaluator (fails fast if Ollama is unreachable)
    evaluator = OllamaEvaluator(model_name)

    results = []
    errors = 0

    print("Procesando ejemplos...")

    # Evaluate with a progress bar
    for i, example in enumerate(tqdm(data, desc="Evaluando")):
        try:
            # Print details every 25 examples (skipping the very first one)
            show = (i % 25 == 0) and (i > 0)

            result = evaluator.evaluate_example(example, show_details=show)
            results.append(result)

        except KeyboardInterrupt:
            print("\n\n‚ö†Ô∏è  Interrumpido por usuario")
            save_partial = input("¬øGuardar resultados parciales? (s/n): ")
            if save_partial.lower() == 's':
                # Fall through to the statistics/saving code with what we have.
                break
            else:
                return [], {}

        except Exception as e:
            errors += 1
            # Only the first few errors are printed to keep the output short.
            if errors <= 3:
                print(f"\n‚ö†Ô∏è  Error en ejemplo {i}: {e}")
            if errors > 10:
                print(f"\n‚ùå Demasiados errores ({errors}). Abortando.")
                break

    if len(results) == 0:
        print("‚ùå No se evalu√≥ ning√∫n ejemplo")
        return [], {}

    # Aggregate statistics
    exact = sum(1 for r in results if r['exact_match'])
    top5 = sum(1 for r in results if r['in_top5'])
    avg_sim = np.mean([r['semantic_similarity'] for r in results])

    print(f"\n{'='*70}")
    print(f"üìä RESULTADOS")
    print(f"{'='*70}")
    print(f"Modelo:            {model_name} (Ollama)")
    print(f"Ejemplos:          {len(results)}")
    print(f"Errores:           {errors}")
    print(f"Exact Match:       {exact}/{len(results)} ({exact/len(results)*100:.1f}%)")
    print(f"In Top-5:          {top5}/{len(results)} ({top5/len(results)*100:.1f}%)")
    print(f"Similitud promedio: {avg_sim:.3f}")
    print(f"{'='*70}\n")

    # Save per-example results
    base_name = data_file.stem

    output_file = Config.PROCESSED_DIR / f"{base_name}_baseline_ollama.json"
    save_json(results, output_file)
    print(f"üíæ Resultados guardados en:\n   {output_file}\n")

    summary = {
        'model': f'{model_name} (Ollama)',
        'dataset': str(data_file),
        'n_examples': len(results),
        'n_errors': errors,
        'exact_match_rate': exact / len(results),
        'top5_match_rate': top5 / len(results),
        # Cast to a plain float so the summary is JSON-serializable.
        'avg_semantic_similarity': float(avg_sim)
    }

    summary_file = Config.RESULTS_DIR / f"{base_name}_summary_ollama.json"
    save_json(summary, summary_file)

    return results, summary


def compare_datasets(summaries_dict):
    """Print a side-by-side table of dataset summaries.

    Args:
        summaries_dict: Mapping of display name -> summary dict with the keys
            ``n_examples``, ``exact_match_rate``, ``top5_match_rate`` and
            ``avg_semantic_similarity``.

    Nothing is printed when fewer than two summaries are given. With exactly
    two, the exact-match gap (second minus first, in percentage points) is
    also reported and labelled by magnitude.
    """
    if len(summaries_dict) < 2:
        return

    rule = "=" * 70
    print("\n" + rule)
    print("üìä COMPARACI√ìN DE DATASETS")
    print(rule + "\n")

    header = f"{'Dataset':<30} {'N':>6} {'Exact':>8} {'Top-5':>8} {'Sim':>8}"
    print(header)
    print("-" * 70)

    for label in summaries_dict:
        stats = summaries_dict[label]
        row = (
            f"{label:<30} {stats['n_examples']:>6} "
            f"{stats['exact_match_rate']*100:>7.1f}% "
            f"{stats['top5_match_rate']*100:>7.1f}% "
            f"{stats['avg_semantic_similarity']:>7.3f}"
        )
        print(row)

    # Gap analysis only makes sense for a pair of datasets.
    if len(summaries_dict) == 2:
        first, second = summaries_dict.values()
        gap = (second['exact_match_rate'] - first['exact_match_rate']) * 100

        print(f"\nüí° AN√ÅLISIS:")
        print(f"   Brecha en Exact Match: {gap:+.1f} puntos porcentuales")

        magnitude = abs(gap)
        if magnitude > 15:
            verdict = f"   ‚úì Diferencia SIGNIFICATIVA entre regiones"
        elif magnitude > 5:
            verdict = f"   ‚ö†Ô∏è  Diferencia MODERADA entre regiones"
        else:
            verdict = f"   ‚Üí Conocimiento similar entre regiones"
        print(verdict)

    print()


# ============================================
# FUNCIONES DE CONVENIENCIA
# ============================================

def quick_eval(dataset_name, limit=100):
    """Evaluate ``<dataset_name>.json`` from the processed-data folder.

    Args:
        dataset_name: File stem of the dataset inside ``Config.PROCESSED_DIR``.
        limit: Maximum number of examples, forwarded to ``evaluate_dataset``.

    Returns:
        Whatever ``evaluate_dataset`` returns, or ``(None, None)`` when the
        file does not exist (up to five available JSON files are listed).
    """
    data_file = Config.PROCESSED_DIR / f"{dataset_name}.json"

    if data_file.exists():
        return evaluate_dataset(data_file, limit=limit)

    # Missing file: help the user by showing a few candidates.
    print(f"‚ùå No existe: {data_file}")
    candidates = list(Config.PROCESSED_DIR.glob("*.json"))
    if candidates:
        print(f"\nüìÅ Archivos disponibles:")
        for candidate in candidates[:5]:
            print(f"   - {candidate.name}")
    return None, None


# ============================================
# INICIO
# ============================================

# Startup banner and usage notes for the Ollama baseline cell. The model tag
# is read from Config.OLLAMA_MODEL so the text always names the model that
# the evaluator actually requests.
print("="*70)
print("üéØ BASELINE EVALUATOR CON OLLAMA")
print("="*70)
print(f"BASE_DIR: {BASE_DIR}")
print(f"Modelo: {Config.OLLAMA_MODEL} (Ollama Local)")
print("="*70 + "\n")

print("‚ö†Ô∏è  IMPORTANTE: Antes de ejecutar, aseg√∫rate de:")
print("   1. Tener Ollama corriendo: 'ollama serve'")
print(f"   2. Tener el modelo: 'ollama list' debe mostrar {Config.OLLAMA_MODEL}")
print()

print("üìñ EJEMPLOS DE USO:")
print()
print("# 1. Evaluar dataset:")
print("   results, summary = quick_eval('latam_all_fixed', limit=100)")
print()
print("# 2. Evaluar otro dataset:")
print("   results2, summary2 = quick_eval('europe_all_fixed', limit=100)")
print()
print("# 3. Comparar regiones:")
print("   compare_datasets({'Latam': summary, 'Europa': summary2})")
print()
print("="*70 + "\n")

üéØ BASELINE EVALUATOR CON OLLAMA
BASE_DIR: /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA
Modelo: llama3.1:8b (Ollama Local)

‚ö†Ô∏è  IMPORTANTE: Antes de ejecutar, aseg√∫rate de:
   1. Tener Ollama corriendo: 'ollama serve'
   2. Tener el modelo: 'ollama list' debe mostrar llama3.1:8b

üìñ EJEMPLOS DE USO:

# 1. Evaluar dataset:
   results, summary = quick_eval('latam_all_fixed', limit=100)

# 2. Evaluar otro dataset:
   results2, summary2 = quick_eval('europe_all_fixed', limit=100)

# 3. Comparar regiones:
   compare_datasets({'Latam': summary, 'Europa': summary2})




In [7]:
# Evaluate the Europe dataset (`europe_all`), capped at 100 examples.
# NOTE(review): the results are bound to `*_latam` names although the dataset
# is `europe_all` — confirm which region was intended before comparing regions.
results_latam, summary_latam = quick_eval("europe_all", limit=100)


EVALUANDO: europe_all.json

‚úì Cargados 2183 ejemplos desde europe_all.json
‚ö†Ô∏è  MODO PRUEBA: 100 ejemplos


ü§ñ CONFIGURANDO OLLAMA
Modelo: llama3:8b
URL: http://localhost:11434/api/generate
‚úÖ Ollama conectado correctamente

Procesando ejemplos...


Evaluando:  26%|‚ñà‚ñà‚ñå       | 26/100 [01:37<04:10,  3.38s/it]


Prompt:  En agatodemon, se contrapone a es
Target:  cacod√©mones
Best:    En la filosof√≠a antigua
Exact:   False
Top-5:   False
Sim:     0.00


Evaluando:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 51/100 [03:08<03:03,  3.74s/it]


Prompt:  En alexiares y aniceto, la naturaleza de su culto es es
Target:  incierta
Best:    La respuesta correcta ser√≠a:
Exact:   False
Top-5:   False
Sim:     0.00


Evaluando:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 76/100 [04:47<01:19,  3.31s/it]


Prompt:  En ariadna, aparece en poema de es
Target:  josefa parra
Best:    Un tema interesante!
Exact:   False
Top-5:   False
Sim:     0.00


Evaluando: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [06:20<00:00,  3.80s/it]


üìä RESULTADOS
Modelo:            llama3:8b (Ollama)
Ejemplos:          100
Errores:           0
Exact Match:       1/100 (1.0%)
In Top-5:          1/100 (1.0%)
Similitud promedio: 0.008

‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/data/processed/europe_all_baseline_ollama.json
üíæ Resultados guardados en:
   /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/data/processed/europe_all_baseline_ollama.json

‚úì Guardado en /Users/andreaacostasolorzano/Documents/Repositorios/ProyectoIA/results/fase1/europe_all_summary_ollama.json



