**Setup**

In [None]:
! pip install datasets evaluate transformers rouge-score nltk



In [None]:
from huggingface_hub import notebook_login

# Shows the Hugging Face login widget; presumably needed for the later
# trainer.push_to_hub() call — confirm if the upload step is removed.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
import transformers

# Print the installed transformers version so the run can be reproduced.
print(transformers.__version__)

4.45.2


**Importing Libraries**

In [None]:
import nltk  # imported here so that this cell also works on a freshly restarted kernel

nltk.download('punkt')  # Ensure NLTK is ready for sentence tokenization
# Newer NLTK releases look up the 'punkt_tab' resource for sent_tokenize.
# NOTE(review): on older releases this download is expected to just report that the
# package is unknown — confirm against the installed NLTK version.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Defining Variables**

In [None]:
# Name of the pretrained checkpoint; used to load both the tokenizer and the model,
# and to decide whether the "summarize: " prefix is applied.
model_checkpoint = "t5-small"

**Loading Dataset**

In [None]:
from datasets import load_dataset
from evaluate import load

# Load the "xsum" dataset (train / validation / test splits with
# 'document', 'summary' and 'id' columns).
raw_datasets = load_dataset("xsum")



# ROUGE evaluation module; used inside compute_metrics.
metric = load("rouge")

In [None]:
# Show the available splits with their columns and row counts.
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})


In [None]:
# Inspect the first training example.
raw_datasets['train'][0]

 'summary': 'Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.',
 'id': '35232142'}

**Data Preprocessing**

In [None]:
# Display the loaded ROUGE evaluation module, including its usage text.
metric

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint named in model_checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Sanity check: tokenize one sentence and look at the returned input_ids / attention_mask.
tokenizer("Hello, This is a project for summer internship ")

{'input_ids': [8774, 6, 100, 19, 3, 9, 516, 21, 1248, 13361, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# The listed T5 checkpoints get the "summarize: " task prefix prepended to every input;
# any other checkpoint gets no prefix. Every name must be spelled in full
# (e.g. "t5-large"), otherwise that checkpoint silently falls through to the empty prefix.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [None]:
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Every entry of ``examples["document"]`` is prepended with ``prefix`` and
    tokenized with truncation at ``max_input_length``. Every entry of
    ``examples["summary"]`` is tokenized as a target with truncation at
    ``max_target_length``.

    Returns:
        The document encoding, with the target token ids stored under ``"labels"``.
    """
    prefixed_documents = []
    for document in examples["document"]:
        prefixed_documents.append(prefix + document)

    encoded_batch = tokenizer(prefixed_documents, max_length=max_input_length, truncation=True)

    # Targets are passed through `text_target` so they are encoded as labels.
    encoded_targets = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    encoded_batch["labels"] = encoded_targets["input_ids"]
    return encoded_batch

In [None]:
# Run preprocess_function over every split; batched=True hands the function batches of examples.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)


Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

In [None]:
# Select subsets for training and evaluation
tokenized_datasets = {
    "train": tokenized_datasets["train"].select(range(2000)),
    "validation": tokenized_datasets["validation"].select(range(200)),
    "test": tokenized_datasets["test"].select(range(200)),
}


**Fine-Tuning the Model**

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model for the checkpoint named in model_checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Per-device batch size, shared by training and evaluation.
batch_size = 4
args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Evaluate at the end of every epoch. `eval_strategy` is the current name of this
    # argument; `evaluation_strategy` is its deprecated spelling.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Gradients are accumulated over 8 batches of `batch_size` before each optimizer step.
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    # Needed so that compute_metrics receives generated token ids to decode.
    predict_with_generate=True,
    # NOTE(review): fp16 presumably requires a GPU with half-precision support — confirm.
    fp16=True,
    save_total_limit=2,
    # No automatic upload during training; the notebook calls trainer.push_to_hub() explicitly.
    push_to_hub=False,
)




In [None]:
# Batch collator for sequence-to-sequence training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. ``predictions``
            may also be a tuple whose first element holds the generated ids.

    Returns:
        dict: The ROUGE scores scaled to percentages plus ``"gen_len"`` (the mean number
        of non-padding tokens per prediction), all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it has to be
    # replaced by the pad token id before decoding — for predictions as well as labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Add newlines for proper sentence tokenization (rougeLsum splits text on "\n")
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value * 100 for key, value in result.items()}
    result["gen_len"] = np.mean([np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions])

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire the model, training arguments, data subsets, collator, tokenizer and metric
# function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; returns a TrainOutput with the global step count, training loss and runtime metrics.
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,No log,2.908628,18.5368,3.0938,14.4773,14.7093,18.74




TrainOutput(global_step=62, training_loss=3.217930947580645, metrics={'train_runtime': 9246.2377, 'train_samples_per_second': 0.216, 'train_steps_per_second': 0.007, 'total_flos': 433364448313344.0, 'train_loss': 3.217930947580645, 'epoch': 0.992})

In [None]:
# NOTE(review): test_text is not referenced by any other visible cell — confirm whether it is still needed.
test_text = "The UK's inflation rate is expected to rise sharply next month."
# Run prediction on the test subset; the result is a PredictionOutput holding the
# generated token ids, the label ids and the test metrics (not a text summary).
summary = trainer.predict(tokenized_datasets["test"])
print(summary)


PredictionOutput(predictions=array([[    0, 27344,  7505, ...,     5,     8, 22982],
       [    0,  2095,   497, ...,   540,   130, 16599],
       [    0,     8,   662, ...,     9,  5341,    11],
       ...,
       [    0, 27874,    43, ...,   651,  6942,   776],
       [    0,  7222,  1948, ...,  1524,   313,    30],
       [    0,     3,     9, ...,   118,  5539,  6107]]), label_ids=array([[  290,    19,     3, ...,  -100,  -100,  -100],
       [   71,   388,    65, ...,  -100,  -100,  -100],
       [ 5933,   151, 11970, ...,  -100,  -100,  -100],
       ...,
       [27874,  3156,  2366, ...,  -100,  -100,  -100],
       [ 9938,   419,  5165, ...,  -100,  -100,  -100],
       [   71,   388,   113, ...,  -100,  -100,  -100]]), metrics={'test_loss': 3.045330762863159, 'test_rouge1': 19.3407, 'test_rouge2': 3.6578, 'test_rougeL': 15.4287, 'test_rougeLsum': 15.894, 'test_gen_len': 18.865, 'test_runtime': 616.105, 'test_samples_per_second': 0.325, 'test_steps_per_second': 0.081})


In [None]:
# Upload the trained model files to the Hugging Face Hub (requires the earlier login).
trainer.push_to_hub()

events.out.tfevents.1729695127.61f6f1996ea9.17593.0:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

events.out.tfevents.1729700016.61f6f1996ea9.17593.1:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/chinm1010r/results/commit/162b6609315501d15609e97754da6b44fa1af237', commit_message='End of training', commit_description='', oid='162b6609315501d15609e97754da6b44fa1af237', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Evaluate on the validation dataset
# (returns a dict with the loss, the ROUGE scores, the generation length and runtime statistics)
validation_results = trainer.evaluate(tokenized_datasets["validation"])
print(f"Validation Results: {validation_results}")




Validation Results: {'eval_loss': 2.908628463745117, 'eval_rouge1': 18.5368, 'eval_rouge2': 3.0938, 'eval_rougeL': 14.4773, 'eval_rougeLsum': 14.7093, 'eval_gen_len': 18.74, 'eval_runtime': 648.1701, 'eval_samples_per_second': 0.309, 'eval_steps_per_second': 0.077, 'epoch': 0.992}


**Summaries on the Test Set**

In [None]:
# Test with an example from the test set
test_sample = tokenized_datasets["test"][0]["document"]
print(f"Original Document: {test_sample}")

# Generate summary
# Prepare the input the same way as the training data: same task prefix and the same
# truncation limit. The tensors are moved to the model's device because the trainer
# may have placed the model on an accelerator.
model_inputs = tokenizer(
    prefix + test_sample,
    return_tensors="pt",
    max_length=max_input_length,
    truncation=True,
).to(model.device)
input_ids = model_inputs.input_ids
summary_ids = model.generate(
    input_ids,
    attention_mask=model_inputs.attention_mask,
    max_length=150,
    num_beams=4,
    early_stopping=True,
)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(f"Generated Summary: {summary}")


Token indices sequence length is longer than the specified maximum sequence length for this model (775 > 512). Running this sequence through the model will result in indexing errors


Original Document: Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.
Andrew Stevens, who works in Welsh prisons trying to secure housing for prison leavers, said the 

In [None]:
# Reload the fine-tuned model and tokenizer from the trainer's output directory
# instead of a placeholder path that does not exist.
# NOTE(review): assumes the trainer has already written the model and tokenizer files
# to args.output_dir (e.g. via the earlier trainer.push_to_hub() call) — confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(args.output_dir)
tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
