In [1]:
!pip install mlflow textstat

In [2]:
!pip install -q transformers datasets evaluate mlflow accelerate bert-score readability-metrics alignscore

In [3]:
pip install huggingface_hub[hf_xet]

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import json
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
import evaluate
import mlflow
import torch
from textstat import flesch_kincaid_grade


with open(r"C:\Users\sanch\Documents\proyecto_grado\data\data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

records = []

for _, articles in data.items():
    if isinstance(articles, dict):  # aseguramos que es un diccionario
        for _, content in articles.items():
            if isinstance(content, dict):  # solo seguimos si es un dict
                # Unir abstract si existe
                abstract = ""
                if "abstract" in content and isinstance(content["abstract"], dict):
                    abstract = " ".join(content["abstract"].values())

                # Unir adaptation si existe
                adaptation = ""
                if "adaptations" in content and isinstance(content["adaptations"], dict):
                    adaptation_dict = content["adaptations"].get("adaptation2", {})
                    if isinstance(adaptation_dict, dict):
                        adaptation = " ".join(adaptation_dict.values())

                if abstract and adaptation:  # solo guardamos pares vÃ¡lidos
                    records.append({"source": abstract, "target": adaptation})

# Convertimos a DataFrame
import pandas as pd
df = pd.DataFrame(records)
print(df.head())
total_ejemplos = len(df)
print(f"Total de ejemplos: {total_ejemplos}")
dataset = Dataset.from_pandas(df)


small_models = [
    "distilgpt2",
    "gpt2",
    "EleutherAI/gpt-neo-125M",
    "EleutherAI/pythia-70m"
]


def tokenize_function(examples, tokenizer):
    inputs = [f"Simplify: {src}" for src in examples["source"]]
    targets = examples["target"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

def train_model(model_name, dataset):
    print(f"\nðŸ”§ Entrenando {model_name}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    tokenized = dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)

    model = AutoModelForCausalLM.from_pretrained(model_name)

    if "pythia" in model_name:
        target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]
    elif "gpt2" in model_name or "ettin" in model_name:
         target_modules = ["c_attn","q_proj","v_proj"]
    else:
         target_modules = None 

    config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=target_modules,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, config)

    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=f"./results/{model_name.replace('/','_')}",
        eval_strategy="epoch",
        learning_rate=2e-4,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=3,   
        weight_decay=0.01,
        save_total_limit=1,
        fp16=True,
        push_to_hub=False,
        logging_dir="./logs",
        report_to="mlflow",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized.shuffle(seed=42).select(range(total_ejemplos)),
        eval_dataset=tokenized.shuffle(seed=42).select(range(100)),
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    return model, tokenizer


trained_models = {}
for m in small_models:
    model, tokenizer = train_model(m, dataset)
    trained_models[m] = (model, tokenizer)


bertscore = evaluate.load("bertscore")

def evaluate_model(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors="pt")
    inputs = inputs.to(model.device) 
    outputs = model.generate(**inputs, max_new_tokens=100)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)


    refs = [df["target"][0]]
    preds = [summary]
    bs = bertscore.compute(predictions=preds, references=refs, lang="en")


    fk = flesch_kincaid_grade(summary)

    return summary, bs["f1"][0], fk

sample_text = df["source"][0]
for m, (model, tok) in trained_models.items():
    summary, bert, fk = evaluate_model(model, tok, sample_text)
    print(f"\n Modelo: {m}")
    print(f" Resumen: {summary}")
    print(f" BERTScore F1: {bert:.3f} | Flesch-Kincaid: {fk:.2f}")

                                              source  \
0  Muscle cramps are a common problem characteriz...   
1  The dystonias are a group of disorders charact...   
2  Muscle cramps result in continuous, involuntar...   
3  Exercise-Associated Muscle Cramps (EAMC) are a...   
4  Muscular cramp is a common symptom in healthy ...   

                                              target  
0  Muscle cramps are a common problem represented...  
1  Dystonias are disorders with a lot of uncontro...  
2  Muscle cramps cause constant and unintended co...  
3  Exercise-Associated Muscle Cramps (EAMC) are a...  
4  Muscle cramps are common in healthy people, es...  
Total de ejemplos: 320

ðŸ”§ Entrenando distilgpt2...


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 320/320 [00:00<00:00, 1379.63 examples/s]
                                                 
 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 160/480 [04:13<07:21,  1.38s/it]

{'eval_loss': 3.4489240646362305, 'eval_runtime': 31.1532, 'eval_samples_per_second': 3.21, 'eval_steps_per_second': 1.605, 'epoch': 1.0}


                                                 
 67%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹   | 320/480 [07:57<02:59,  1.12s/it]

{'eval_loss': 3.4321718215942383, 'eval_runtime': 24.4517, 'eval_samples_per_second': 4.09, 'eval_steps_per_second': 2.045, 'epoch': 2.0}


                                                 
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 480/480 [11:28<00:00,  1.17s/it]

{'eval_loss': 3.4281585216522217, 'eval_runtime': 25.4826, 'eval_samples_per_second': 3.924, 'eval_steps_per_second': 1.962, 'epoch': 3.0}
{'train_runtime': 689.1656, 'train_samples_per_second': 1.393, 'train_steps_per_second': 0.696, 'train_loss': 3.554651896158854, 'epoch': 3.0}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 480/480 [11:29<00:00,  1.44s/it]



ðŸ”§ Entrenando gpt2...


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 320/320 [00:00<00:00, 1607.22 examples/s]
 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 160/480 [05:06<10:08,  1.90s/it]
 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 160/480 [05:45<10:08,  1.90s/it]

{'eval_loss': 3.0523221492767334, 'eval_runtime': 39.6362, 'eval_samples_per_second': 2.523, 'eval_steps_per_second': 1.261, 'epoch': 1.0}


 67%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹   | 320/480 [10:49<05:01,  1.88s/it]  
 67%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹   | 320/480 [11:29<05:01,  1.88s/it]

{'eval_loss': 3.028944969177246, 'eval_runtime': 39.6096, 'eval_samples_per_second': 2.525, 'eval_steps_per_second': 1.262, 'epoch': 2.0}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 480/480 [16:45<00:00,  2.16s/it]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 480/480 [17:25<00:00,  2.16s/it]

{'eval_loss': 3.0230748653411865, 'eval_runtime': 39.8785, 'eval_samples_per_second': 2.508, 'eval_steps_per_second': 1.254, 'epoch': 3.0}
{'train_runtime': 1045.7447, 'train_samples_per_second': 0.918, 'train_steps_per_second': 0.459, 'train_loss': 3.185089874267578, 'epoch': 3.0}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 480/480 [17:26<00:00,  2.18s/it]



ðŸ”§ Entrenando EleutherAI/gpt-neo-125M...


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 320/320 [00:00<00:00, 1604.16 examples/s]
 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 160/480 [05:28<12:00,  2.25s/it]
 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 160/480 [06:19<12:00,  2.25s/it]

{'eval_loss': 2.604733943939209, 'eval_runtime': 51.7532, 'eval_samples_per_second': 1.932, 'eval_steps_per_second': 0.966, 'epoch': 1.0}


 67%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹   | 320/480 [12:05<05:43,  2.15s/it]  
 67%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹   | 320/480 [12:53<05:43,  2.15s/it]

{'eval_loss': 2.5870566368103027, 'eval_runtime': 47.4234, 'eval_samples_per_second': 2.109, 'eval_steps_per_second': 1.054, 'epoch': 2.0}


                                                 
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 480/480 [19:54<00:00,  2.27s/it]

{'eval_loss': 2.5825839042663574, 'eval_runtime': 53.7483, 'eval_samples_per_second': 1.861, 'eval_steps_per_second': 0.93, 'epoch': 3.0}
{'train_runtime': 1194.8047, 'train_samples_per_second': 0.803, 'train_steps_per_second': 0.402, 'train_loss': 2.5756375630696615, 'epoch': 3.0}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 480/480 [19:55<00:00,  2.49s/it]



ðŸ”§ Entrenando EleutherAI/pythia-70m...


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 320/320 [00:00<00:00, 1650.12 examples/s]
                                                 
 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 160/480 [02:45<05:09,  1.04it/s]

{'eval_loss': 3.086937189102173, 'eval_runtime': 17.5217, 'eval_samples_per_second': 5.707, 'eval_steps_per_second': 2.854, 'epoch': 1.0}


                                                 
 67%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹   | 320/480 [05:44<02:39,  1.00it/s]

{'eval_loss': 3.017232894897461, 'eval_runtime': 18.0764, 'eval_samples_per_second': 5.532, 'eval_steps_per_second': 2.766, 'epoch': 2.0}


                                                 
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 480/480 [08:10<00:00,  1.19it/s]

{'eval_loss': 2.9888219833374023, 'eval_runtime': 18.574, 'eval_samples_per_second': 5.384, 'eval_steps_per_second': 2.692, 'epoch': 3.0}
{'train_runtime': 491.106, 'train_samples_per_second': 1.955, 'train_steps_per_second': 0.977, 'train_loss': 3.0613072713216147, 'epoch': 3.0}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 480/480 [08:11<00:00,  1.02s/it]
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



 Modelo: distilgpt2
 Resumen: Muscle cramps are a common problem characterized by a sudden, painful, involuntary contraction of muscle. These true cramps, which originate from peripheral nerves, may be distinguished from other muscle pain or spasm. Medical history, physical examination, and a limited laboratory screen help to determine the various causes of muscle cramps. Despite the "benign" nature of cramps, many patients find the symptom very uncomfortable. Treatment options are guided both by experience and by a limited number of therapeutic trials. Quinine sulfate is an effective medication, but the side-effect profile is worrisome, and other membrane-stabilizing drugs are probably just as effective. Patients will benefit from further studies to better define the pathophysiology of muscle cramps and to find more effective medications with fewer side-effects.




































































































 BERTScore F1: 0.934 | 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



 Modelo: gpt2
 Resumen: Muscle cramps are a common problem characterized by a sudden, painful, involuntary contraction of muscle. These true cramps, which originate from peripheral nerves, may be distinguished from other muscle pain or spasm. Medical history, physical examination, and a limited laboratory screen help to determine the various causes of muscle cramps. Despite the "benign" nature of cramps, many patients find the symptom very uncomfortable. Treatment options are guided both by experience and by a limited number of therapeutic trials. Quinine sulfate is an effective medication, but the side-effect profile is worrisome, and other membrane-stabilizing drugs are probably just as effective. Patients will benefit from further studies to better define the pathophysiology of muscle cramps and to find more effective medications with fewer side-effects.

The most common cause of muscle cramps is a sudden, painful, involuntary contraction of muscle. These true cramps, which origina

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



 Modelo: EleutherAI/gpt-neo-125M
 Resumen: Muscle cramps are a common problem characterized by a sudden, painful, involuntary contraction of muscle. These true cramps, which originate from peripheral nerves, may be distinguished from other muscle pain or spasm. Medical history, physical examination, and a limited laboratory screen help to determine the various causes of muscle cramps. Despite the "benign" nature of cramps, many patients find the symptom very uncomfortable. Treatment options are guided both by experience and by a limited number of therapeutic trials. Quinine sulfate is an effective medication, but the side-effect profile is worrisome, and other membrane-stabilizing drugs are probably just as effective. Patients will benefit from further studies to better define the pathophysiology of muscle cramps and to find more effective medications with fewer side-effects.
 BERTScore F1: 0.934 | Flesch-Kincaid: 13.60

 Modelo: EleutherAI/pythia-70m
 Resumen: Muscle cramps are a com