In [None]:
# instalar dependencias
# librerias
# cargar train de workshop datasets de huggingface
# cargar validation workshop datasets de huggingface
# filtro extractivo aplicado a los datos de train usando embeddings preentrenados de sentence-transformers/all-MiniLM-L6-v2 (sin entrenamiento adicional)
# modelo abstractivo con Domain adaptation. Finetuning modelo google/pegasus
# - tokenizacion con chunking
# inferencia

In [None]:
# ======================================
# Instalación de dependencias
# ======================================
!pip install -q transformers datasets accelerate sentence-transformers sacrebleu rouge-score

# ======================================
# Librerías
# ======================================
import json
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer, util
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each non-empty line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing newline at the end of the
    file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        list: One parsed object per non-empty line, in file order.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines rather than failing the whole load.
                continue
            records.append(json.loads(line))
    return records

def save_jsonl(path,data):
    """Write `data` to `path` as JSON Lines.

    Every item is serialized on its own line, UTF-8 encoded, with non-ASCII
    characters kept verbatim (``ensure_ascii=False``). An existing file is
    overwritten.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
# ======================================
# Hybrid preprocessing (extractive + abstractive)
# ======================================

# Load the training judgments and their reference summaries.
train_judg = load_jsonl("/content/train_judg.jsonl")
train_ref = load_jsonl("/content/train_ref_summ.jsonl")

# Index the reference summaries by ID, then keep only the judgments that
# actually have a matching summary.
ref_map = dict((rec["ID"], rec["Summary"]) for rec in train_ref)
train_pairs = [
    dict(Judgment=doc["Judgment"], Summary=ref_map[doc["ID"]])
    for doc in train_judg
    if doc["ID"] in ref_map
]

# Sentence-embedding model used by the extractive step
# (original note: use legal-domain embeddings if possible).
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def extractive_filter(text, top_k=15):
    """Keep the `top_k` most central sentences of `text`, in document order.

    The text is split on ``". "``. Each sentence is embedded with the
    module-level `embedder` and scored by cosine similarity against the mean
    of all sentence embeddings. The `top_k` best-scoring sentences are then
    re-joined in the order they appear in the original text, so the reduced
    document still reads coherently for the downstream summarizer.

    Args:
        text: Full input document.
        top_k: Maximum number of sentences to keep.

    Returns:
        str: `text` unchanged when it has at most `top_k` sentences,
        otherwise the selected sentences joined with ``". "``.
    """
    sentences = text.split(". ")
    if len(sentences) <= top_k:
        return text

    embeddings = embedder.encode(sentences, convert_to_tensor=True)
    centroid = embeddings.mean(dim=0)
    # cos_sim returns one score per sentence; flatten to plain floats so the
    # ranking compares numbers rather than one-element tensors.
    scores = util.cos_sim(embeddings, centroid).reshape(-1).tolist()

    ranked = sorted(range(len(sentences)), key=scores.__getitem__, reverse=True)
    # Restore document order among the selected sentences.
    keep = sorted(ranked[:top_k])
    return ". ".join(sentences[i] for i in keep)

# Apply the extractive filter to every judgment; summaries are kept as-is.
processed_pairs = [
    {"Judgment": extractive_filter(pair["Judgment"]), "Summary": pair["Summary"]}
    for pair in train_pairs
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# ======================================
# Tokenization
# ======================================
# Checkpoint that is fine-tuned below; its tokenizer is shared by the
# preprocessing and training steps.
MODEL_NAME = "google/pegasus-large"  # better suited for summarization
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(ex, max_source_length=1024, max_target_length=512):
    """Tokenize one judgment/summary pair for seq2seq fine-tuning.

    The judgment is tokenized once and truncated to `max_source_length`
    tokens (only the head of the document is kept). The summary is truncated
    to `max_target_length` tokens and stored as ``labels``.

    The input is never tokenized without truncation: chunks beyond the first
    were discarded by the final truncation anyway, so splitting, decoding and
    re-joining the text only cost time and emitted an over-length warning.

    Args:
        ex: Mapping with ``"Judgment"`` and ``"Summary"`` strings.
        max_source_length: Token budget for the encoder input.
        max_target_length: Token budget for the labels.

    Returns:
        The tokenizer output for the judgment, with an added ``"labels"``
        entry holding the summary token ids.
    """
    model_inputs = tokenizer(
        ex["Judgment"], truncation=True, max_length=max_source_length
    )
    target = tokenizer(
        ex["Summary"], truncation=True, max_length=max_target_length
    )
    model_inputs["labels"] = target["input_ids"]
    return model_inputs

# Build a Dataset from the filtered pairs and tokenize it one example at a
# time; the raw text columns are dropped so only the tokenizer outputs remain.
dataset = Dataset.from_list(processed_pairs)
tokenized = dataset.map(preprocess, batched=False, remove_columns=dataset.column_names)


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1346 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# ======================================
# Training
# ======================================
# Collator that pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)

# Batch size 1 with 8 accumulation steps gives an effective batch of 8
# sequences per optimizer step; mixed precision is enabled only on GPU.
training_args = Seq2SeqTrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    fp16=torch.cuda.is_available(),
    save_strategy="epoch",
    logging_steps=50,
    remove_unused_columns=True,
)

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# `processing_class` replaces the deprecated `tokenizer` keyword of the
# Trainer API (the old keyword emits a FutureWarning and is removed in newer
# transformers releases).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()
# Persist the fine-tuned weights and the tokenizer side by side so the
# inference cell can reload both from "model".
trainer.save_model("model")
tokenizer.save_pretrained("model")


pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mandresmosquera[0m ([33mandresmosqueraw[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,3.2257
100,2.7197
150,2.5993
200,2.5252
250,2.5309
300,2.5122




('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/spiece.model',
 'model/added_tokens.json',
 'model/tokenizer.json')

In [None]:
# Install NLTK and download the "punkt_tab" resource; `sent_tokenize` is
# imported from NLTK in the inference cell to split judgments into sentences.
!pip install nltk
import nltk
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Run the garbage collector and release cached CUDA memory before the model
# is loaded again for inference.
# NOTE(review): `model` and `trainer` from the training cell are still
# referenced at this point, so their GPU memory is presumably not released;
# consider `del trainer, model` first — confirm.
import gc, torch
gc.collect()
torch.cuda.empty_cache()

In [None]:
# ======================================
# Robust inference with chunks + checkpoints + auto-resume
# ======================================
import os
from tqdm import tqdm
from nltk.tokenize import sent_tokenize

# Load the validation set and the fine-tuned model saved under "model".
val_data = load_jsonl("/content/val_judg.jsonl")
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSeq2SeqLM.from_pretrained("model").to(device)
tokenizer = AutoTokenizer.from_pretrained("model")
model.eval()

# Safe limits: take the tokenizer's declared maximum (falling back to 1024
# when it is missing or falsy), leave room for special tokens, and never
# exceed 1024 input tokens.
tokenizer_max = getattr(tokenizer, "model_max_length", 1024) or 1024
max_input_len = min(tokenizer_max - 2, 1024)  # margin for special tokens
if max_input_len <= 0:
    max_input_len = 1024

# Generation budgets: per-chunk partial summaries, the final merged summary,
# beam width, and how many finished examples between checkpoint writes.
max_new_tokens_chunk = 150
max_new_tokens_final = 300
beams = 4
checkpoint_every = 50

# Auto-resume: when a partial output file exists, reload it and continue from
# the first example that is not yet in it (this relies on the partial file
# holding results for val_data[0:start_idx] in order).
if os.path.exists("answer_partial.jsonl"):
    outputs = load_jsonl("answer_partial.jsonl")
    start_idx = len(outputs)
    print(f"Se reanuda desde checkpoint: {start_idx} ejemplos ya procesados.")
else:
    outputs = []
    start_idx = 0

# -------------------------
# Robust chunking function
# -------------------------
def split_into_chunks_by_sentences(text, max_tokens=max_input_len, tokenizer=tokenizer):
    """Split `text` into sentence-aligned chunks of at most `max_tokens` tokens.

    Sentences (from NLTK's ``sent_tokenize``) are accumulated greedily into a
    buffer. When the next sentence would push the buffer over `max_tokens`,
    the buffer is flushed as one chunk. A sentence that is longer than
    `max_tokens` on its own is truncated to `max_tokens` and emitted as its
    own chunk, whether or not the buffer was empty when it arrived, so no
    returned chunk is built from an over-long sentence left untruncated.

    Args:
        text: Document to split.
        max_tokens: Token budget per chunk, special tokens included.
        tokenizer: Tokenizer used to measure and truncate.

    Returns:
        list[str]: Chunks in document order; empty when `text` contains no
        sentences.
    """
    sents = sent_tokenize(text)
    chunks = []
    cur = []

    for s in sents:
        s_ids = tokenizer(s, add_special_tokens=True, truncation=False)["input_ids"]
        cur_ids = tokenizer(" ".join(cur), add_special_tokens=True, truncation=False)["input_ids"] if cur else []
        # Estimate the merged length without counting special tokens twice.
        # NOTE(review): for a tokenizer that adds no leading special token
        # this undercounts by one; the margin built into the default
        # `max_tokens` is presumed to absorb it — confirm.
        cand_ids = cur_ids[:-1] + s_ids[1:] if cur_ids else s_ids

        if len(cand_ids) <= max_tokens:
            cur.append(s)
            continue

        # The sentence does not fit with the buffer: flush what we have.
        if cur:
            chunks.append(" ".join(cur).strip())
            cur = []

        if len(s_ids) <= max_tokens:
            # Fits on its own: start the next chunk with it.
            cur = [s]
        else:
            # Too long even alone: truncate and emit immediately.
            trunc = tokenizer(s, truncation=True, max_length=max_tokens)
            chunks.append(tokenizer.decode(trunc["input_ids"], skip_special_tokens=True))

    if cur:
        chunks.append(" ".join(cur).strip())

    return chunks

# -------------------------
# Inference loop
# -------------------------
def _generate_summary(text, max_new_tokens):
    """Summarize `text` with one beam-search pass and return the decoded string.

    The tokenizer truncates the input to `max_input_len` tokens, so the ids
    are passed to `generate` as they are.
    """
    enc = tokenizer(text, return_tensors="pt", add_special_tokens=True,
                    truncation=True, max_length=max_input_len).to(device)
    with torch.no_grad():
        out_ids = model.generate(
            enc["input_ids"],
            max_new_tokens=max_new_tokens,
            num_beams=beams,
            length_penalty=1.0,
            early_stopping=True,
            no_repeat_ngram_size=3
        )
    return tokenizer.decode(out_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)


for idx in tqdm(range(start_idx, len(val_data)), desc="Generating summaries"):
    ex = val_data[idx]
    text = ex["Judgment"]

    # 1) Apply the extractive filter when it is available; otherwise fall back
    #    to the full text, but report it so a train/inference mismatch does
    #    not go unnoticed.
    try:
        reduced = extractive_filter(text)
    except Exception as err:
        tqdm.write(f"[Aviso] extractive_filter falló para ID={ex['ID']} ({err!r}); se usa el texto completo.")
        reduced = text

    # 2) Split into chunks that fit the model input.
    chunk_texts = split_into_chunks_by_sentences(reduced, max_tokens=max_input_len, tokenizer=tokenizer)

    # 3) Generate one partial summary per chunk.
    partials = [_generate_summary(chunk, max_new_tokens_chunk) for chunk in chunk_texts]

    # 4) Merge the partial summaries and summarize them once more.
    concatenated = " ".join(partials)
    final_summary = _generate_summary(concatenated, max_new_tokens_final)

    outputs.append({"ID": ex["ID"], "Summary": final_summary})

    # Checkpoint: rewrite the partial file every `checkpoint_every` results.
    if (len(outputs) % checkpoint_every) == 0:
        save_jsonl("answer_partial.jsonl", outputs)
        print(f"[Checkpoint] guardado answer_partial.jsonl ({len(outputs)}/{len(val_data)})")

# -------------------------
# Final save
# -------------------------
save_jsonl("answer.jsonl", outputs)
print("✅ Guardado final: answer.jsonl")

# Colab-only helper: trigger a browser download of the result file.
from google.colab import files
files.download("answer.jsonl")

Generating summaries:  25%|██▌       | 50/200 [05:50<18:46,  7.51s/it]

[Checkpoint] guardado answer_partial.jsonl (50/200)


Generating summaries:  50%|█████     | 100/200 [11:23<09:44,  5.84s/it]

[Checkpoint] guardado answer_partial.jsonl (100/200)


Generating summaries:  75%|███████▌  | 150/200 [16:46<05:51,  7.04s/it]

[Checkpoint] guardado answer_partial.jsonl (150/200)


Generating summaries: 100%|██████████| 200/200 [22:22<00:00,  6.71s/it]

[Checkpoint] guardado answer_partial.jsonl (200/200)
✅ Guardado final: answer.jsonl





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>