In [1]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")

In [2]:
dataset.column_names

['article', 'highlights', 'id']

In [3]:
# Convert to pandas
df = pd.DataFrame(dataset)
df = df[["article", "highlights"]]  # Keep only the columns we need

In [4]:
# Save to disk for DVC tracking
df.to_csv("data/cnn_dm_subset.csv", index=False)
print("Dataset saved to data/cnn_dm_subset.csv")

Dataset saved to data/cnn_dm_subset.csv


In [4]:
# 🔧 Step 2: Tokenize + Prepare for Training
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize(batch):
    inputs = tokenizer(
        ["summarize: " + text for text in batch["article"]],
        padding="max_length",
        truncation=True,
        max_length=512
    )
    targets = tokenizer(
        batch["highlights"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

dataset = Dataset.from_pandas(df) # Wrap your data in HuggingFace's Dataset object
tokenized = dataset.map(tokenize, batched=True) # Tokenize the entire dataset
tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) # Set format for PyTorch training

# preparing article-summary pairs into tokenized input/output tensors to fine-tune a t5-small model to summarize text

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/2871 [00:00<?, ? examples/s]

#### Load the model

In [5]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [1]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to=[]  # disables HF tracking
)



In [7]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized.shuffle(seed=42).select(range(500)),
    eval_dataset=tokenized.shuffle(seed=43).select(range(100)),
    tokenizer=tokenizer
)

  trainer = Trainer(


In [8]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,1.102345


TrainOutput(global_step=125, training_loss=2.647498291015625, metrics={'train_runtime': 159.816, 'train_samples_per_second': 3.129, 'train_steps_per_second': 0.782, 'total_flos': 67670900736000.0, 'train_loss': 2.647498291015625, 'epoch': 1.0})

In [9]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 1.1023451089859009, 'eval_runtime': 4.3147, 'eval_samples_per_second': 23.176, 'eval_steps_per_second': 5.794, 'epoch': 1.0}


In [6]:
from dagshub import init
init(repo_owner="achrafhoteit", repo_name="AIDE505-final-project", mlflow=True)

In [None]:
import mlflow
import mlflow.pytorch
from transformers import Trainer, TrainingArguments

def train_with_mlflow(model, tokenizer, train_dataset, eval_dataset, learning_rate=5e-5, batch_size=4, epochs=1):
    # Set experiment
    mlflow.set_experiment("text-summarizer")
    
    # Set DagsHub tracking URI for remote logging
    mlflow.set_tracking_uri("https://dagshub.com/achrafhoteit/AIDE505-final-project.mlflow")

    # Set local tracking URI for local UI
    # mlflow.set_tracking_uri("file:./mlruns") # this was working before connecting to DagsHub


    # Training args
    args = TrainingArguments(
        # output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        save_strategy="epoch",
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer
    )

    with mlflow.start_run():
        # Log parameters
        mlflow.log_param("model", model.config.name_or_path)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("epochs", epochs)

        # Train
        trainer.train()

        # Evaluate
        eval_result = trainer.evaluate()
        mlflow.log_metric("eval_loss", eval_result["eval_loss"])

        # Save model
        mlflow.pytorch.log_model(model, "summarizer_model")

        print("✅ Run completed and logged to MLflow!")

    return trainer


In [11]:
trainer = train_with_mlflow(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized.shuffle(seed=42).select(range(500)),
    eval_dataset=tokenized.shuffle(seed=43).select(range(100)),
    # output_dir="./results/run_01",
    learning_rate=5e-5,
    batch_size=4,
    epochs=2
)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.054396
2,No log,1.027778




MlflowException: When an mlflow-artifacts URI was supplied, the tracking URI must be a valid http or https URI, but it was currently set to file:///./mlruns. Perhaps you forgot to set the tracking URI to the running MLflow server. To set the tracking URI, use either of the following methods:
1. Set the MLFLOW_TRACKING_URI environment variable to the desired tracking URI. `export MLFLOW_TRACKING_URI=http://localhost:5000`
2. Set the tracking URI programmatically by calling `mlflow.set_tracking_uri`. `mlflow.set_tracking_uri('http://localhost:5000')`

In [16]:
trainer2 = train_with_mlflow(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized.shuffle(seed=42).select(range(2000)),
    eval_dataset=tokenized.shuffle(seed=43).select(range(500)),
    output_dir="./results/run_02",
    learning_rate=5e-5,
    batch_size=8,
    epochs=3
)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.895979
2,0.928200,0.889565
3,0.928200,0.883532




✅ Run completed and logged to MLflow!
🏃 View run stylish-fawn-811 at: https://dagshub.com/achrafhoteit/AIDE505-final-project.mlflow/#/experiments/0/runs/d7a51b9ad1154324a9476a47b3875a4d
🧪 View experiment at: https://dagshub.com/achrafhoteit/AIDE505-final-project.mlflow/#/experiments/0


#### This will only work on my local machine, cause i'm fetching a local model

In [17]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_path = "./results/run_02/checkpoint-750"  # Replace XXX with actual checkpoint number or just use "./results/run_002"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

In [18]:
def summarize(text, max_input_length=512, max_output_length=128):
    input_text = "summarize: " + text
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=max_input_length, truncation=True)

    summary_ids = model.generate(
        inputs,
        max_length=max_output_length,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [19]:
sample_text = """
The European Union announced today a new climate initiative that aims to reduce carbon emissions by 55% before 2030. 
This ambitious goal is part of the EU Green Deal and includes measures like carbon taxes, green energy investments, and stricter vehicle regulations.
"""

print(summarize(sample_text))


The European Union announced today a new climate initiative. This ambitious goal is part of the EU Green Deal. It includes measures like carbon taxes, green energy investments and stricter vehicle regulations.


#### i can even load model from mlflow and not locally

In [12]:
from transformers import T5Tokenizer

# Load tokenizer (must match model)
tokenizer = T5Tokenizer.from_pretrained("t5-small")  # or whatever model you fine-tuned

# Load model from MLflow run
import mlflow.pytorch
model = mlflow.pytorch.load_model("runs:/5d30ba2385b44f2887e00ca84247c0fd/summarizer_model")
model.eval()  # Important: put in eval mode

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [13]:
import torch

def summarize(text, max_input_length=512, min_output_length=30, max_output_length=60):
    input_text = "summarize: " + text
    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True
    ).to("cpu")  # force input to CPU

    model.to("cpu")  # force model to CPU
    model.eval()

    summary_ids = model.generate(
        inputs,
        min_length=min_output_length,
        max_length=max_output_length,
        num_beams=4,
        length_penalty=1.0,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [14]:
sample_text = """
OpenAI has released its latest large language model, claiming it can understand and generate human-like text at a new level of coherence and accuracy.
"""

print(summarize(sample_text))

OpenAI has released its latest large language model. It can understand and generate human-like text at a new level of accuracy.
