In [1]:
import os
from openai import AzureOpenAI
from dotenv import load_dotenv

In [2]:
# Load environment variables from the .env file
load_dotenv()

True

In [3]:
# Azure OpenAI configuration
endpoint = "https://csbridgeopenai.openai.azure.com/"
# NOTE(review): model_name is not referenced by any cell visible in this notebook; requests use `deployment`
model_name = "gpt-4o-mini"
deployment = "csbridge-gpt-4o-mini"
# Read the API key from the environment (the .env file is loaded by load_dotenv);
# os.getenv returns None when the variable is not set
subscription_key = os.getenv("AZURE_OPENAI_API_KEY")
api_version = "2024-02-15-preview"


In [4]:
# Initialize the Azure OpenAI client from the endpoint, key and API version set in the configuration cell
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version=api_version
)

Prueba de conexión usando Azure OpenAI.

In [5]:
# Connectivity smoke test: send one trivial question to the configured deployment.
response = client.chat.completions.create(
    model=deployment,
    messages=[
        {"role": "system", "content": "Eres un asistente útil."},
        {"role": "user", "content": "¿Cuál es la capital de Francia?"},
    ],
    temperature=0.7,
    top_p=1.0,
    max_tokens=4096,
)

In [6]:
# Show the message text of the first choice returned by the connectivity test
print(response.choices[0].message.content)

La capital de Francia es París.


## Prompts Optimizados para Sumarización Legal

Los siguientes prompts están basados en el análisis de 1,200 casos legales reales y optimizados para GPT-4o-mini.

In [7]:
# 5 OPTIMIZED PROMPTS FOR LEGAL SUMMARIZATION WITH GPT-4o-mini

# PROMPT 1: DIRECT AND TECHNICAL
# Template with a single {text} placeholder; generate_summary fills it via str.format(text=...).
prompt_v1 = """You are a legal document summarization expert. Based on analysis of 1,200 real legal cases:

TARGET SPECIFICATIONS:
• Length: 26% of original text (~260 words for 1000-word input)
• Sentence structure: 27-32 words per sentence average
• Complex words: Reduce to 31% ratio
• Legal terminology: Maintain 2.6% density
• Reading level: Graduate (appropriate for legal audience)

CORE STRATEGY:
1. STRUCTURAL CONSOLIDATION: Combine short sentences into fluent longer ones while preserving legal terminology and conceptual complexity
2. ESSENTIAL CONTENT: Case parties/roles, central facts, main legal arguments, court decision with key reasoning, relevant precedents
3. SELECTIVE ELIMINATION: Excessive procedural details, page references, repetitions, extensive quotes (summarize key content instead)
4. OBJECTIVE STYLE: Clear connectors between concepts, preserve established legal jargon, maintain judicial objectivity

LEGAL JUDGMENT TEXT:
{text}
"""

# PROMPT 2: INSTRUCTIONAL, STEP BY STEP
# Template with a single {text} placeholder for the judgment to summarize.
prompt_v2 = """Act as an expert legal summarizer. You will create concise yet comprehensive summaries of legal judgments optimized for ROUGE-2, ROUGE-L, and BLEU metrics.

SUMMARIZATION PARAMETERS (based on 1,200 case analysis):
- Target length: 26% of original (approximately 260 words per 1000 input words)
- Sentence length: 27-32 words average
- Maintain legal density at 2.6%
- Graduate-level complexity appropriate for legal professionals

STEP-BY-STEP PROCESS:
1. IDENTIFY: Extract case parties, core facts, legal arguments, court decision
2. CONSOLIDATE: Merge related short sentences into coherent longer sentences
3. PRESERVE: Keep exact legal terminology, proper names, statutory references
4. ELIMINATE: Remove procedural metadata, page numbers, excessive citations
5. STRUCTURE: Use clear transitions, maintain chronological flow where relevant

INPUT LEGAL JUDGMENT:
{text}
"""

# PROMPT 3: ORIENTED TOWARD ROUGE/BLEU METRICS
# Template with a single {text} placeholder for the judgment to summarize.
prompt_v3 = """Legal document summarizer optimized for maximum ROUGE-2, ROUGE-L, and BLEU scores.

OPTIMIZATION TARGETS (derived from 1,200 legal case corpus):
• Compression ratio: 26% (proven optimal for legal text)
• Sentence construction: 27-32 words (increases ROUGE-L overlap)
• Lexical complexity: 31% complex words (balanced comprehension)
• Legal term preservation: 2.6% density (maintains domain accuracy)

CONTENT PRIORITIES FOR METRIC OPTIMIZATION:
1. LITERAL PRESERVATION: Exact names, legal citations, statutory sections, court names
2. STRUCTURAL SYNTHESIS: Combine multiple short factual statements into comprehensive sentences
3. KEY ELEMENT RETENTION: Case parties, central facts, legal reasoning, final ruling
4. STRATEGIC ELIMINATION: Procedural minutiae, formatting artifacts, redundant phrasing

Apply these principles to summarize the following legal judgment while maximizing n-gram overlap with reference summaries:

JUDGMENT TEXT:
{text}
"""

# PROMPT 4: CONVERSATIONAL AND CLEAR
# Template with a single {text} placeholder for the judgment to summarize.
prompt_v4 = """You're an expert at creating high-quality legal summaries. I need you to summarize a legal judgment following specific parameters that have been proven effective through analysis of 1,200 legal cases.

Here's what makes a great legal summary:

TARGET CHARACTERISTICS:
- About 26% the length of the original (so ~260 words if the input is 1000 words)
- Longer, more sophisticated sentences (aim for 27-32 words per sentence)
- Keep important legal terminology but reduce overall complexity to 31% complex words
- Maintain appropriate legal density (about 2.6% legal terms)

WHAT TO INCLUDE:
- Who are the parties and their roles
- What are the central facts of the case
- What were the main legal arguments
- What did the court decide and why
- Any important precedents or legal references

WHAT TO STREAMLINE:
- Excessive procedural details
- Page numbers and formatting elements
- Repetitive information
- Very long quotations (summarize the key points instead)

Please create a clear, comprehensive summary that maintains the legal precision while being more accessible:

LEGAL JUDGMENT:
{text}
"""

# PROMPT 5: STRUCTURED, SYSTEMATIC FORMAT
# Template with a single {text} placeholder for the judgment to summarize.
prompt_v5 = """Legal Judgment Summarization System
Trained on 1,200 cases | Optimized for ROUGE/BLEU metrics

SYSTEM PARAMETERS:
[COMPRESSION: 26%] [SENTENCE_LENGTH: 27-32 words] [COMPLEXITY: 31%] [LEGAL_DENSITY: 2.6%]

PROCESSING INSTRUCTIONS:
→ EXTRACT core elements: parties, facts, arguments, decision, reasoning
→ CONSOLIDATE multiple short sentences into coherent longer statements  
→ PRESERVE legal terminology, proper names, citations exactly as written
→ ELIMINATE procedural metadata, page references, excessive repetition
→ STRUCTURE with clear logical flow and appropriate transitions

QUALITY TARGETS:
✓ Graduate-level reading appropriate for legal professionals
✓ Maintain judicial objectivity and precision
✓ Optimize for maximum lexical overlap with reference summaries
✓ Ensure completeness of essential case information

JUDGMENT TO SUMMARIZE:
{text}
"""

# Registry mapping each prompt's short name to its template (insertion order is preserved)
prompts = dict(
    directo=prompt_v1,
    instruccional=prompt_v2,
    metricas=prompt_v3,
    conversacional=prompt_v4,
    sistematico=prompt_v5,
)

# Confirmation banner: one bullet per registered prompt name
print("✅ 5 Prompts optimizados cargados:")
for name in prompts:
    print(f"   • {name}")

# Static description of the prompt family, one item per output line
print(
    "\n📊 Características de los prompts:",
    "• Basados en análisis de 1,200 casos legales reales",
    "• Optimizados para métricas ROUGE-2, ROUGE-L, BLEU",
    "• Target: 26% compresión, 27-32 palabras/oración",
    "• Preservan terminología legal (2.6% densidad)",
    "• Nivel postgrado apropiado para audiencia legal",
    sep="\n",
)

✅ 5 Prompts optimizados cargados:
   • directo
   • instruccional
   • metricas
   • conversacional
   • sistematico

📊 Características de los prompts:
• Basados en análisis de 1,200 casos legales reales
• Optimizados para métricas ROUGE-2, ROUGE-L, BLEU
• Target: 26% compresión, 27-32 palabras/oración
• Preservan terminología legal (2.6% densidad)
• Nivel postgrado apropiado para audiencia legal


In [8]:
# Summary generation helper built on the Azure OpenAI chat completions API
def generate_summary(text, prompt_template, model_deployment=deployment, max_tokens=4096, temperature=0.3):
    """
    Summarize one legal judgment with the Azure OpenAI chat model.

    Args:
        text (str): Judgment text to summarize.
        prompt_template (str): Prompt template; formatted with ``text=text``.
        model_deployment (str): Deployment name passed as ``model``.
        max_tokens (int): Upper bound on tokens in the response.
        temperature (float): Sampling temperature sent with the request.

    Returns:
        str | None: The stripped summary text, or None when the API returns
        no content or any exception is raised (a message is printed in both
        failure cases).
    """
    try:
        chat_messages = [
            {"role": "system", "content": "You are an expert legal document summarizer, specialized in Indian Legal System."},
            {"role": "user", "content": prompt_template.format(text=text)},
        ]

        completion = client.chat.completions.create(
            model=model_deployment,
            messages=chat_messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.3,
            frequency_penalty=0,
            presence_penalty=0,
        )

        raw_content = completion.choices[0].message.content
        if raw_content is not None:
            return raw_content.strip()

        # The API answered but carried no message content
        print("Warning: API retornó contenido None")
        return None

    except Exception as err:
        # Best effort: report the failure and let the caller continue with the next document
        print(f"Error al generar resumen: {err!s}")
        return None

def batch_summarize(texts, prompt_name="directo", show_progress=True):
    """
    Generate one summary per input text using a registered prompt.

    Args:
        texts (list): Texts to summarize.
        prompt_name (str): Key into the ``prompts`` registry ('directo', 'instruccional', ...).
        show_progress (bool): Print a per-document progress line when True.

    Returns:
        list | None: Summaries in input order (None entries mark failed
        generations), or None when ``prompt_name`` is not registered.
    """
    try:
        template = prompts[prompt_name]
    except KeyError:
        print(f"Prompt '{prompt_name}' no encontrado. Opciones: {list(prompts.keys())}")
        return None

    total = len(texts)
    print(f"Generando resúmenes con prompt '{prompt_name}'...")
    print(f"Procesando {total} documentos...")

    results = []
    for position, document in enumerate(texts, start=1):
        if show_progress:
            print(f"   Procesando {position}/{total}...", end=" ")

        generated = generate_summary(document, template)
        results.append(generated)

        if not show_progress:
            continue
        # A falsy result (None or empty string) is reported as an error on the progress line
        print(f"({len(generated.split())} palabras)" if generated else "Error")

    ok_count = len([item for item in results if item is not None])
    print(f"\nCompletado: {ok_count}/{total} resúmenes generados exitosamente")

    return results

## Carga de Datos y Testing

Carga tus datos y prueba los diferentes prompts para encontrar el que mejor funcione.

In [9]:
# CARGA DE DATOS
import json
import pandas as pd

def _read_jsonl(path):
    """Parse a JSON Lines file into a list of records, skipping blank lines."""
    records = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            # A blank or trailing empty line would make json.loads raise; ignore it
            if not payload:
                continue
            records.append(json.loads(payload))
    return records


def load_legal_data(judgments_path, summaries_path):
    """
    Load judgments and reference summaries from JSONL files and pair them by ID.

    Args:
        judgments_path (str): Path to the JSONL file with one judgment record per line.
        summaries_path (str): Path to the JSONL file with one reference summary record per line.

    Returns:
        pandas.DataFrame: Inner merge of both files on the 'ID' column. Columns
        sharing a name in both files get the suffixes '_judgment' and '_reference'.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    df_judgments = pd.DataFrame(_read_jsonl(judgments_path))
    df_summaries = pd.DataFrame(_read_jsonl(summaries_path))

    # Pair each judgment with its reference summary by ID
    df_merged = pd.merge(df_judgments, df_summaries, on='ID', suffixes=('_judgment', '_reference'))

    print(f"✅ Datos cargados: {len(df_merged)} casos pareados")
    return df_merged

# Load the paired judgments and reference summaries
df_data = load_legal_data('./datasets/train/train_judg.jsonl', './datasets/train/train_ref_summ.jsonl')

# Pilot subset used for prompt testing
ids_piloto = ['id_955', 'id_26', 'id_48', 'id_45', 'id_777', 'id_111', 'id_300', 'id_72']
# isin() keeps rows in df_data order (not ids_piloto order); IDs missing from df_data are dropped without warning
df_piloto = df_data[df_data['ID'].isin(ids_piloto)].reset_index(drop=True)

print(f"🎯 Dataset piloto: {len(df_piloto)} casos para testing")

✅ Datos cargados: 1200 casos pareados
🎯 Dataset piloto: 8 casos para testing


## Evaluación con Métricas ROUGE y BLEU

Una vez que tengas resúmenes generados, puedes evaluarlos contra los resúmenes de referencia usando las métricas objetivo.

In [11]:
# FUNCIONES DE EVALUACIÓN
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

def evaluate_summary(generated_summary, reference_summary):
    """
    Score one generated summary against its reference with ROUGE and BLEU.

    Returns:
        dict: Keys 'rouge1_f', 'rouge2_f', 'rougeL_f' (ROUGE F-measures computed
        with stemming) and 'bleu' (smoothed sentence BLEU over lower-cased,
        whitespace-split tokens; 0.0 if the BLEU computation raises).
    """
    rouge_types = ('rouge1', 'rouge2', 'rougeL')
    rouge = rouge_scorer.RougeScorer(list(rouge_types), use_stemmer=True).score(
        reference_summary, generated_summary
    )
    metrics = {f'{kind}_f': rouge[kind].fmeasure for kind in rouge_types}

    try:
        bleu = sentence_bleu(
            [reference_summary.lower().split()],
            generated_summary.lower().split(),
            smoothing_function=SmoothingFunction().method1,
        )
    except Exception:
        # Any failure in the BLEU computation is recorded as a zero score
        bleu = 0.0
    metrics['bleu'] = bleu

    return metrics

def evaluate_multiple_summaries(generated_summaries, reference_summaries):
    """
    Score paired summaries and aggregate each metric.

    Pairs are formed with zip(), so evaluation stops at the shorter input.

    Returns:
        tuple: (stats, per_case) where ``stats`` maps '<metric>_mean' and
        '<metric>_std' to the aggregate over all cases (0.0 when there are no
        scores) and ``per_case`` is the list of per-pair metric dicts.
    """
    per_case = [
        evaluate_summary(generated, reference)
        for generated, reference in zip(generated_summaries, reference_summaries)
    ]

    stats = {}
    for metric in ('rouge1_f', 'rouge2_f', 'rougeL_f', 'bleu'):
        values = [case[metric] for case in per_case if case[metric] is not None]
        if values:
            stats[f'{metric}_mean'] = np.mean(values)
            stats[f'{metric}_std'] = np.std(values)
        else:
            stats[f'{metric}_mean'] = 0.0
            stats[f'{metric}_std'] = 0.0

    return stats, per_case

In [13]:
# TEST EVERY REGISTERED PROMPT ON THE PILOT SET
print("Testing de Prompts Optimizados")
print("=" * 40)

# Pilot judgments fed to every prompt; results are keyed by prompt name
texts_to_summarize = df_piloto['Judgment'].tolist()
results_all_prompts = {}

for i, prompt_name in enumerate(prompts, start=1):
    print(f"\n{i}/{len(prompts)} - {prompt_name}")
    print("-" * 25)

    summaries = batch_summarize(texts_to_summarize, prompt_name=prompt_name, show_progress=True)
    results_all_prompts[prompt_name] = summaries

    # Per-prompt statistics over the summaries that were actually produced
    successful = sum(s is not None for s in summaries)
    if successful:
        word_counts = [len(s.split()) for s in summaries if s is not None]
        avg_words = sum(word_counts) / len(word_counts)
        print(f"📊 {successful}/{len(summaries)} exitosos, {avg_words:.0f} palabras promedio")

print("\nTesting completado - Resultados en 'results_all_prompts'")

Testing de Prompts Optimizados

1/5 - directo
-------------------------
Generando resúmenes con prompt 'directo'...
Procesando 8 documentos...
   Procesando 1/8... (316 palabras)
   Procesando 2/8... (273 palabras)
   Procesando 3/8... (235 palabras)
   Procesando 4/8... (317 palabras)
   Procesando 5/8... (223 palabras)
   Procesando 6/8... (290 palabras)
   Procesando 7/8... (245 palabras)
   Procesando 8/8... (277 palabras)

Completado: 8/8 resúmenes generados exitosamente
📊 8/5 exitosos, 272 palabras promedio

2/5 - instruccional
-------------------------
Generando resúmenes con prompt 'instruccional'...
Procesando 8 documentos...
   Procesando 1/8... (289 palabras)
   Procesando 2/8... (266 palabras)
   Procesando 3/8... (333 palabras)
   Procesando 4/8... (321 palabras)
   Procesando 5/8... (266 palabras)
   Procesando 6/8... (281 palabras)
   Procesando 7/8... (238 palabras)
   Procesando 8/8... (287 palabras)

Completado: 8/8 resúmenes generados exitosamente
📊 8/5 exitosos, 285

## Evaluación de Resultados

Una vez completado el testing, puedes evaluar los resultados con métricas ROUGE y BLEU.

In [14]:
# EVALUATION WITH ROUGE AND BLEU METRICS
print("📊 Evaluando prompts con métricas objetivo")
print("=" * 45)

# Reference summaries, aligned by position with the generated ones
reference_summaries = df_piloto['Summary'].tolist()

# Aggregated metrics per prompt name
evaluation_results = {}

for prompt_name, generated_summaries in results_all_prompts.items():
    print(f"\n🔍 Evaluando: {prompt_name}")

    # Keep only the cases where generation succeeded
    valid_pairs = [
        (gen, ref)
        for gen, ref in zip(generated_summaries, reference_summaries)
        if gen is not None
    ]

    if len(valid_pairs) == 0:
        print("   ❌ No hay resúmenes válidos para evaluar")
        continue

    valid_generated = [gen for gen, _ref in valid_pairs]
    valid_reference = [ref for _gen, ref in valid_pairs]

    avg_results, _ = evaluate_multiple_summaries(valid_generated, valid_reference)
    evaluation_results[prompt_name] = avg_results

    print(f"   ROUGE-2: {avg_results['rouge2_f_mean']:.3f}")
    print(f"   ROUGE-L: {avg_results['rougeL_f_mean']:.3f}")
    print(f"   BLEU:    {avg_results['bleu_mean']:.3f}")

# Rank the prompts and pick the best one
if not evaluation_results:
    print("❌ No se pudieron evaluar los prompts")
else:
    print("\n🏆 RANKING DE PROMPTS:")
    print("-" * 30)

    # ROUGE-2 is the primary metric; highest first
    sorted_prompts = sorted(
        evaluation_results.items(),
        key=lambda entry: entry[1]['rouge2_f_mean'],
        reverse=True,
    )

    for i, (prompt_name, metrics) in enumerate(sorted_prompts, 1):
        print(f"{i}. {prompt_name}: ROUGE-2={metrics['rouge2_f_mean']:.3f}, "
              f"ROUGE-L={metrics['rougeL_f_mean']:.3f}, BLEU={metrics['bleu_mean']:.3f}")

    best_prompt = sorted_prompts[0][0]
    print(f"\n🥇 MEJOR PROMPT: {best_prompt}")

📊 Evaluando prompts con métricas objetivo

🔍 Evaluando: directo
   ROUGE-2: 0.137
   ROUGE-L: 0.176
   BLEU:    0.030

🔍 Evaluando: instruccional
   ROUGE-2: 0.139
   ROUGE-L: 0.183
   BLEU:    0.030

🔍 Evaluando: metricas
   ROUGE-2: 0.140
   ROUGE-L: 0.179
   BLEU:    0.039

🔍 Evaluando: conversacional
   ROUGE-2: 0.127
   ROUGE-L: 0.175
   BLEU:    0.027

🔍 Evaluando: sistematico
   ROUGE-2: 0.150
   ROUGE-L: 0.196
   BLEU:    0.050

🏆 RANKING DE PROMPTS:
------------------------------
1. sistematico: ROUGE-2=0.150, ROUGE-L=0.196, BLEU=0.050
2. metricas: ROUGE-2=0.140, ROUGE-L=0.179, BLEU=0.039
3. instruccional: ROUGE-2=0.139, ROUGE-L=0.183, BLEU=0.030
4. directo: ROUGE-2=0.137, ROUGE-L=0.176, BLEU=0.030
5. conversacional: ROUGE-2=0.127, ROUGE-L=0.175, BLEU=0.027

🥇 MEJOR PROMPT: sistematico


## Análisis Comparativo Detallado

Comparación lado a lado de resúmenes generados vs resúmenes de entrenamiento para identificar fortalezas y patrones de cada prompt.

In [None]:
# EXPORT SUMMARIES FOR MANUAL ANALYSIS
def export_summaries_for_analysis(num_cases=3):
    """
    Print reference and generated summaries in a structured layout for manual review.

    Reads the notebook globals ``df_piloto`` (columns 'Summary' and 'ID') and
    ``results_all_prompts`` (prompt name -> list of generated summaries).

    Args:
        num_cases (int): Maximum number of pilot cases to print; capped at the
            number of rows in ``df_piloto``.

    Returns:
        None. All output is written with print().
    """
    print("📝 RESÚMENES PARA ANÁLISIS MANUAL")
    print("=" * 60)

    reference_summaries = df_piloto['Summary'].tolist()
    case_ids = df_piloto['ID'].tolist()
    # Number of cases actually shown; drives both the loop and the spacing between cases
    total_cases = min(num_cases, len(case_ids))

    for i in range(total_cases):
        print(f"\n🔍 CASO {i+1}: {case_ids[i]}")
        print("=" * 50)

        # Reference summary
        ref_summary = reference_summaries[i]
        ref_words = len(ref_summary.split())

        print(f"\n🎯 RESUMEN DE ENTRENAMIENTO ({ref_words} palabras):")
        print("-" * 40)
        print(ref_summary)

        # Summaries generated by each prompt
        for prompt_name, summaries in results_all_prompts.items():
            if i < len(summaries) and summaries[i] is not None:
                gen_summary = summaries[i]
                gen_words = len(gen_summary.split())
                # Generated length as a percentage of the reference length (0 for an empty reference)
                compression = (gen_words / ref_words * 100) if ref_words > 0 else 0

                print(f"\n🤖 RESUMEN GENERADO - {prompt_name.upper()} ({gen_words} palabras, {compression:.1f}% vs referencia):")
                print("-" * 40)
                print(gen_summary)
            else:
                print(f"\n❌ RESUMEN GENERADO - {prompt_name.upper()}: Error en generación")
                print("-" * 40)

        print("\n" + "="*60)
        # Extra blank line between cases only; compare against the cases shown,
        # not the requested count, so the last case is never followed by one
        if i < total_cases - 1:
            print()

# Export of cases selected by position in the pilot set
def export_specific_cases(case_indices=None):
    """
    Print the reference and every generated summary for the given case indices.

    Args:
        case_indices (list | None): Positions in ``df_piloto``; defaults to the
            first three ([0, 1, 2]). Indices at or beyond the number of cases
            are reported and skipped.
    """
    selected = [0, 1, 2] if case_indices is None else case_indices

    print("📋 CASOS ESPECÍFICOS SELECCIONADOS")
    print("=" * 50)

    references = df_piloto['Summary'].tolist()
    ids = df_piloto['ID'].tolist()
    n_cases = len(ids)

    for idx in selected:
        if idx >= n_cases:
            print(f"⚠️ Índice {idx} fuera de rango (máximo: {n_cases-1})")
            continue

        print(f"\n📋 CASO ÍNDICE {idx}: {ids[idx]}")
        print("=" * 40)

        # Reference summary
        reference = references[idx]
        print("\n🎯 REFERENCIA:")
        print(reference)

        # Generated summaries, one section per prompt
        print("\n🤖 GENERADOS:")
        for prompt_name, generated in results_all_prompts.items():
            tag = f"[{prompt_name.upper()}]"
            available = idx < len(generated) and generated[idx] is not None
            if not available:
                print(f"\n{tag} - ERROR")
                continue
            print(f"\n{tag}")
            print(generated[idx])

        print("\n" + "-"*50)

# Run the export for the first three pilot cases
print("Ejecutando exportación de resúmenes...")
export_summaries_for_analysis(num_cases=3)

In [None]:
# COMPACT EXPORT FOR COPY/PASTE
def export_compact_summaries():
    """
    Print the first three pilot cases in a compact, quote-delimited layout.

    For each case the reference is printed first, then every prompt's summary
    that exists for that case; missing summaries are skipped silently.
    """
    print("📋 FORMATO COMPACTO PARA ANÁLISIS")
    print("=" * 50)

    references = df_piloto['Summary'].tolist()
    ids = df_piloto['ID'].tolist()

    for i, case_id in enumerate(ids[:3]):
        print(f"\n--- CASO {i+1}: {case_id} ---")

        # Reference
        print("\nREFERENCIA:")
        print(f'"{references[i]}"')

        # Generated summaries
        for prompt_name, generated in results_all_prompts.items():
            if i >= len(generated) or generated[i] is None:
                continue
            print(f"\n{prompt_name.upper()}:")
            print(f'"{generated[i]}"')

        print("\n" + "-"*60)

# Export of the best-scoring prompt against the reference only
def export_best_vs_reference():
    """
    Print up to five pilot cases comparing the best prompt with the reference.

    The best prompt is the one with the highest 'rouge2_f_mean' in the global
    ``evaluation_results``; if that global is missing or empty a message is
    printed and nothing else happens.
    """
    if 'evaluation_results' not in globals() or not evaluation_results:
        print("❌ Ejecuta primero la evaluación con métricas")
        return

    # Highest ROUGE-2 wins
    best_prompt = max(
        evaluation_results,
        key=lambda prompt: evaluation_results[prompt]['rouge2_f_mean'],
    )

    print(f"🏆 ANÁLISIS DEL MEJOR PROMPT: {best_prompt.upper()}")
    print("=" * 50)

    references = df_piloto['Summary'].tolist()
    ids = df_piloto['ID'].tolist()
    winners = results_all_prompts[best_prompt]

    for i in range(min(5, len(ids))):
        print(f"\n📋 CASO {i+1}: {ids[i]}")
        print("-" * 30)

        reference = references[i]
        print(f"\n🎯 REFERENCIA ({len(reference.split())} palabras):")
        print(reference)

        generated = winners[i] if i < len(winners) else None
        if generated is None:
            print("\n❌ Error en generación")
        else:
            print(f"\n🥇 {best_prompt.upper()} ({len(generated.split())} palabras):")
            print(generated)

        print("\n" + "="*50)

✅ Funciones de exportación listas

Opciones disponibles:
• export_summaries_for_analysis() - Exportación detallada
• export_compact_summaries() - Formato compacto
• export_best_vs_reference() - Solo mejor prompt vs referencia
• export_specific_cases([0,1,2]) - Casos específicos


In [22]:
# TARGETED COMPARISON: SISTEMATICO vs METRICAS vs REFERENCE
def compare_sistematico_metricas(num_cases=5):
    """
    Compare only the 'sistematico' and 'metricas' prompts against the reference.

    Reads the notebook globals ``df_piloto`` (columns 'Summary' and 'ID') and
    ``results_all_prompts`` (prompt name -> list of generated summaries).

    Args:
        num_cases (int): Maximum number of pilot cases to print (default 5);
            capped at the number of rows in ``df_piloto``.

    Returns:
        None. All output is written with print().
    """
    print("🔍 COMPARACIÓN: SISTEMÁTICO vs MÉTRICAS vs REFERENCIA")
    print("=" * 60)

    reference_summaries = df_piloto['Summary'].tolist()
    case_ids = df_piloto['ID'].tolist()

    # Only the prompts of interest
    prompts_target = ['sistematico', 'metricas']

    # Number of cases actually shown; drives both the loop and the spacing between cases
    total_cases = min(num_cases, len(case_ids))

    for i in range(total_cases):
        print(f"\n📋 CASO {i+1}: {case_ids[i]}")
        print("=" * 50)

        # Reference summary
        ref_summary = reference_summaries[i]
        ref_words = len(ref_summary.split())

        print(f"\n🎯 REFERENCIA ({ref_words} palabras):")
        print("-" * 30)
        print(ref_summary)

        # Compare only the two target prompts
        for prompt_name in prompts_target:
            if prompt_name in results_all_prompts:
                summaries = results_all_prompts[prompt_name]
                if i < len(summaries) and summaries[i] is not None:
                    gen_summary = summaries[i]
                    gen_words = len(gen_summary.split())
                    # Generated length as a percentage of the reference length (0 for an empty reference)
                    compression = (gen_words / ref_words * 100) if ref_words > 0 else 0

                    print(f"\n🤖 {prompt_name.upper()} ({gen_words} palabras, {compression:.1f}% vs ref):")
                    print("-" * 30)
                    print(gen_summary)
                else:
                    print(f"\n❌ {prompt_name.upper()}: Error en generación")
            else:
                print(f"\n⚠️ {prompt_name.upper()}: No encontrado en resultados")

        print("\n" + "="*60)
        # Blank line between cases except after the last one shown (previously a
        # hard-coded 4, which misfired when fewer than five cases were available)
        if i < total_cases - 1:
            print()

# Compact export restricted to the 'sistematico' and 'metricas' prompts
def export_compact_sistemático_metricas():
    """
    Print the first four pilot cases in compact form for two prompts only.

    For each case the reference is printed, followed by the 'sistematico' and
    'metricas' summaries. A prompt absent from ``results_all_prompts`` is
    skipped; a missing summary for a present prompt is printed as ERROR.
    """
    print("📋 FORMATO COMPACTO: SISTEMÁTICO vs MÉTRICAS")
    print("=" * 50)

    references = df_piloto['Summary'].tolist()
    ids = df_piloto['ID'].tolist()

    for i, case_id in enumerate(ids[:4]):
        print(f"\n--- CASO {i+1}: {case_id} ---")

        # Reference
        print("\nREFERENCIA:")
        print(f'"{references[i]}"')

        # Only the two target prompts
        for prompt_name in ('sistematico', 'metricas'):
            if prompt_name not in results_all_prompts:
                continue
            generated = results_all_prompts[prompt_name]
            if i >= len(generated) or generated[i] is None:
                print(f"\n{prompt_name.upper()}: ERROR")
                continue
            print(f"\n{prompt_name.upper()}:")
            print(f'"{generated[i]}"')

        print("\n" + "-"*50)

# Run the targeted comparison
print("🚀 Ejecutando comparación específica...")
compare_sistematico_metricas()

🚀 Ejecutando comparación específica...
🔍 COMPARACIÓN: SISTEMÁTICO vs MÉTRICAS vs REFERENCIA

📋 CASO 1: id_111

🎯 REFERENCIA (559 palabras):
------------------------------
The Allahabad High Court recently directed the State Director General of Police (DGP) to dispose of an application moved by a police constable seeking permission for Sex Reassignment Surgery (SRS) [Neha Singh v. State Of UP & 2 Ors].
Justice Ajit Kumar stated in the order that a person has the Constitutional right to change his/ her gender through surgical intervention. 
"One should not have any doubt that if a person suffers from gender dysphoria and except for physical structure, her feeling and also the traits of opposite sex so much so that such a person takes a complete misalignment of her personality with physical body, such a person does possess a constitutionally recognized right to get his/her gender changed though surgical intervention."
The Court went on to hold that failure to recognize an individual's inh

In [27]:
# OPTIMIZED HYBRID PROMPT: COMBINATION OF "METRICAS" + "SISTEMATICO"
# Template with a single {text} placeholder; unlike prompt_v1..v5 it ends with a "LEGAL SUMMARY:" cue after the text.
prompt_hibrido = """Legal document summarizer optimized for maximum ROUGE-2, ROUGE-L, and BLEU scores with natural structure preservation.

OPTIMIZATION TARGETS (derived from 1,200 legal case corpus):
• Compression ratio: 26% (proven optimal for legal text)
• Sentence construction: 27-32 words (increases ROUGE-L overlap)
• Lexical complexity: 31% complex words (balanced comprehension)
• Legal term preservation: 2.6% density (maintains domain accuracy)

PROCESSING STRATEGY:
→ EXTRACT core elements: parties, facts, arguments, decision, reasoning
→ CONSOLIDATE multiple short sentences into coherent longer statements  
→ PRESERVE legal terminology, proper names, citations exactly as written
→ ELIMINATE procedural metadata, page references, excessive repetition
→ STRUCTURE with clear logical flow and natural transitions

CONTENT PRIORITIES FOR METRIC OPTIMIZATION:
1. LITERAL PRESERVATION: Exact names, legal citations, statutory sections, court names
2. STRUCTURAL SYNTHESIS: Combine multiple short factual statements into comprehensive sentences
3. KEY ELEMENT RETENTION: Case parties, central facts, legal reasoning, final ruling, relevant procedural details
4. STRATEGIC ELIMINATION: Only eliminate redundant phrasing and formatting artifacts

QUALITY TARGETS:
✓ Graduate-level reading appropriate for legal professionals
✓ Maintain judicial objectivity and precision
✓ Optimize for maximum lexical overlap with reference summaries
✓ Ensure completeness of essential case information including relevant procedural aspects
✓ Use natural narrative flow rather than rigid sectioned structure
✓ Preserve chronological progression and legal reasoning chains

Create a comprehensive legal summary that flows naturally while preserving all essential legal elements and procedural details that contribute to case understanding.

LEGAL JUDGMENT TEXT:
{text}

LEGAL SUMMARY:"""

# Register the hybrid prompt in the shared dictionary.
# This mutates `prompts`, so any later loop over it also includes 'hibrido'.
prompts['hibrido'] = prompt_hibrido

In [28]:
# TESTING AND EVALUATION OF THE HYBRID PROMPT
def test_hibrido():
    """
    Generate summaries for the pilot judgments with the 'hibrido' prompt.

    The result is stored in ``results_all_prompts['hibrido']`` and returned.
    """
    print("🧪 TESTING DEL PROMPT HÍBRIDO")
    print("=" * 50)

    # Summarize every pilot judgment and record the result under 'hibrido'
    results_all_prompts['hibrido'] = hybrid_summaries = batch_summarize(
        df_piloto['Judgment'].tolist(),
        prompt_name='hibrido',
        show_progress=True,
    )

    print("\n✅ Prompt híbrido ejecutado y agregado a results_all_prompts")

    return hybrid_summaries

def evaluate_hibrido():
    """
    Score the 'hibrido' summaries against the pilot references.

    Returns:
        dict | None: Aggregated metrics from evaluate_multiple_summaries, or
        None when no 'hibrido' summaries exist or none of them is valid. When a
        global ``evaluation_results`` exists, the metrics are also stored there
        under 'hibrido'.
    """
    if 'hibrido' not in results_all_prompts:
        print("❌ Ejecuta primero test_hibrido() para generar los resúmenes")
        return None

    print("📊 EVALUACIÓN DEL PROMPT HÍBRIDO")
    print("=" * 45)

    references = df_piloto['Summary'].tolist()
    candidates = results_all_prompts['hibrido']

    # Keep only the cases where generation succeeded, preserving pairing
    kept_generated, kept_reference = [], []
    for generated, reference in zip(candidates, references):
        if generated is not None:
            kept_generated.append(generated)
            kept_reference.append(reference)

    if not kept_generated:
        print("❌ No hay resúmenes válidos para evaluar")
        return None

    avg_results, _ = evaluate_multiple_summaries(kept_generated, kept_reference)

    print("\n🔄 HÍBRIDO:")
    print(f"   ROUGE-2: {avg_results['rouge2_f_mean']:.3f}")
    print(f"   ROUGE-L: {avg_results['rougeL_f_mean']:.3f}")
    print(f"   BLEU:    {avg_results['bleu_mean']:.3f}")

    # Publish into the shared results table when the evaluation cell has been run
    if 'evaluation_results' in globals():
        evaluation_results['hibrido'] = avg_results
        print("\n✅ Resultados agregados a evaluation_results")

    return avg_results

In [29]:
# RUN TESTING AND EVALUATION OF THE HYBRID PROMPT
print("🚀 EJECUTANDO PROMPT HÍBRIDO")
print("=" * 50)

# 1. Generate summaries with the hybrid prompt
print("\n1️⃣ GENERANDO RESÚMENES...")
summaries_hibrido = test_hibrido()

# 2. Compute metrics
print("\n2️⃣ CALCULANDO MÉTRICAS...")
results_hibrido = evaluate_hibrido()

# 3. Compare against the other prompts (only when more than one has been evaluated)
if 'evaluation_results' in globals() and len(evaluation_results) > 1:
    print("\n3️⃣ COMPARACIÓN CON OTROS PROMPTS:")
    print("-" * 40)

    # All results, highest ROUGE-2 first
    sorted_prompts = sorted(
        evaluation_results.items(),
        key=lambda entry: entry[1]['rouge2_f_mean'],
        reverse=True,
    )

    for i, (prompt_name, metrics) in enumerate(sorted_prompts, 1):
        # Medal for the top three, generic marker for the rest
        emoji = {1: "🥇", 2: "🥈", 3: "🥉"}.get(i, "📊")
        print(f"{emoji} {i}. {prompt_name.upper()}:")
        print(f"     ROUGE-2: {metrics['rouge2_f_mean']:.3f}")
        print(f"     ROUGE-L: {metrics['rougeL_f_mean']:.3f}")
        print(f"     BLEU:    {metrics['bleu_mean']:.3f}")

    # 1-based rank of the hybrid prompt, or None when it was not evaluated
    hibrido_position = next(
        (rank for rank, entry in enumerate(sorted_prompts, 1) if entry[0] == 'hibrido'),
        None,
    )
    if hibrido_position is not None:
        print(f"\n🔄 PROMPT HÍBRIDO: Posición #{hibrido_position} de {len(sorted_prompts)}")

        if hibrido_position == 1:
            print("   🎉 ¡MEJOR PROMPT! El híbrido logró el mejor rendimiento")
        elif hibrido_position == 2:
            print("   🎯 Excelente rendimiento, entre los top 2")
        elif hibrido_position == 3:
            print("   ✅ Buen rendimiento, entre los top 3")
        else:
            print("   📈 Rendimiento medio, hay margen de mejora")

print("\n✅ EJECUCIÓN COMPLETADA")
print("📊 Datos disponibles en:")
print("   • results_all_prompts['hibrido'] - Resúmenes generados")
print("   • evaluation_results['hibrido'] - Métricas de evaluación")

🚀 EJECUTANDO PROMPT HÍBRIDO

1️⃣ GENERANDO RESÚMENES...
🧪 TESTING DEL PROMPT HÍBRIDO
Generando resúmenes con prompt 'hibrido'...
Procesando 8 documentos...
   Procesando 1/8... (531 palabras)
   Procesando 2/8... (531 palabras)
   Procesando 2/8... (365 palabras)
   Procesando 3/8... (365 palabras)
   Procesando 3/8... (321 palabras)
   Procesando 4/8... (321 palabras)
   Procesando 4/8... (531 palabras)
   Procesando 5/8... (531 palabras)
   Procesando 5/8... (493 palabras)
   Procesando 6/8... (493 palabras)
   Procesando 6/8... (396 palabras)
   Procesando 7/8... (396 palabras)
   Procesando 7/8... (326 palabras)
   Procesando 8/8... (326 palabras)
   Procesando 8/8... (433 palabras)

Completado: 8/8 resúmenes generados exitosamente

✅ Prompt híbrido ejecutado y agregado a results_all_prompts

2️⃣ CALCULANDO MÉTRICAS...
📊 EVALUACIÓN DEL PROMPT HÍBRIDO
(433 palabras)

Completado: 8/8 resúmenes generados exitosamente

✅ Prompt híbrido ejecutado y agregado a results_all_prompts

2️⃣ CA