In [2]:
# Checkpoint name reused by the tokenizer, model and output-directory cells.
model_checkpoint = "t5-small"

In [3]:
from sklearn.datasets import load_breast_cancer
from datasets import load_dataset
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fetch the XSum splits and the ROUGE scorer used for evaluation.
raw_dataset = load_dataset(
    "xsum",
    trust_remote_code=True,
)
metric = evaluate.load("rouge")

In [5]:
# Show the available splits with their columns and row counts.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [6]:
# Peek at the first validation record (document, summary, id).
raw_dataset['validation'][0]

{'document': 'The ex-Reading defender denied fraudulent trading charges relating to the Sodje Sports Foundation - a charity to raise money for Nigerian sport.\nMr Sodje, 37, is jointly charged with elder brothers Efe, 44, Bright, 50 and Stephen, 42.\nAppearing at the Old Bailey earlier, all four denied the offence.\nThe charge relates to offences which allegedly took place between 2008 and 2014.\nSam, from Kent, Efe and Bright, of Greater Manchester, and Stephen, from Bexley, are due to stand trial in July.\nThey were all released on bail.',
 'summary': 'Former Premier League footballer Sam Sodje has appeared in court alongside three brothers accused of charity fraud.',
 'id': '38295789'}

In [7]:
# Display the loaded metric object; its repr includes the usage notes.
metric

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [8]:
from transformers import AutoTokenizer
# Tokenizer matching the chosen checkpoint; used for inputs, labels and decoding.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
# Text prepended to every document before tokenization; only the listed
# checkpoints get the "summarize: " task prefix, everything else gets none.
prefix = "summarize: " if model_checkpoint in ("t5-small", "t5-base") else ""

In [10]:
# Token-length caps handed to the tokenizer as max_length (with truncation)
# for the source documents and the target summaries respectively.
max_length_input, max_target_length = 1024, 128

def preprocess(text):
    """Tokenize a batch of examples into model inputs with summary labels.

    Args:
        text: A batch mapping with a 'document' list (source texts) and a
            'summary' list (target texts) of equal length.

    Returns:
        The tokenizer output for the prefixed documents, with an extra
        'labels' entry holding the token ids of the summaries.
    """
    inputs = [prefix + doc for doc in text['document']]
    model_inputs = tokenizer(inputs, max_length=max_length_input, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: it tokenizes the summaries as targets in a single call.
    labels = tokenizer(
        text_target=text['summary'],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [11]:
# Smoke-test preprocess on a two-example slice of the training split.
preprocess(raw_dataset['train'][:2])



{'input_ids': [[21603, 10, 37, 423, 583, 13, 1783, 16, 20126, 16496, 6, 80, 13, 8, 844, 6025, 4161, 6, 19, 341, 271, 14841, 5, 7057, 161, 19, 4912, 16, 1626, 5981, 11, 186, 7540, 16, 1276, 15, 2296, 7, 5718, 2367, 14621, 4161, 57, 4125, 387, 5, 15059, 7, 30, 8, 4653, 4939, 711, 747, 522, 17879, 788, 12, 1783, 44, 8, 15763, 6029, 1813, 9, 7472, 5, 1404, 1623, 11, 5699, 277, 130, 4161, 57, 18368, 16, 20126, 16496, 227, 8, 2473, 5895, 15, 147, 89, 22411, 139, 8, 1511, 5, 1485, 3271, 3, 21926, 9, 472, 19623, 5251, 8, 616, 12, 15614, 8, 1783, 5, 37, 13818, 10564, 15, 26, 3, 9, 3, 19513, 1481, 6, 18368, 186, 1328, 2605, 30, 7488, 1887, 3, 18, 8, 711, 2309, 9517, 89, 355, 5, 3966, 1954, 9233, 15, 6, 113, 293, 7, 8, 16548, 13363, 106, 14022, 84, 47, 14621, 4161, 6, 243, 255, 228, 59, 7828, 8, 1249, 18, 545, 11298, 1773, 728, 8, 8347, 1560, 5, 611, 6, 255, 243, 72, 1709, 1528, 161, 228, 43, 118, 4006, 91, 12, 766, 8, 3, 19513, 1481, 410, 59, 5124, 5, 96, 196, 17, 19, 1256, 68, 27, 103, 317, 132

In [12]:
# Apply preprocess to every split, one batch of examples at a time.
tokenized_dataset = raw_dataset.map(
    preprocess,
    batched=True,
)


In [13]:
from transformers import AutoModelForSeq2SeqLM , DataCollatorForSeq2Seq , Seq2SeqTrainingArguments,Seq2SeqTrainer
# Load the pretrained sequence-to-sequence weights for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [14]:
batchsize = 16
print(model_checkpoint)

# Training configuration: one epoch, evaluation at the end of each epoch,
# and generation-based predictions so text metrics can be computed.
# NOTE(review): fp16=True presumably needs a CUDA-capable device - confirm.
model_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_checkpoint}-finetuned-xsum",
    # schedule
    num_train_epochs=1,
    eval_strategy="epoch",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batchsize,
    per_device_eval_batch_size=batchsize,
    fp16=True,
    # evaluation / checkpointing
    predict_with_generate=True,
    save_total_limit=3,
    push_to_hub=False,
)

t5-small


In [15]:
# Pass the model object, not the checkpoint name: the collator only prepares
# decoder_input_ids from the labels when the given `model` exposes
# prepare_decoder_input_ids_from_labels, which a plain string never does.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) and mean generated length.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            Positions equal to -100 are treated as padding in both.

    Returns:
        dict mapping each key returned by ``metric.compute`` to its score
        multiplied by 100, plus ``"gen_len"`` (mean number of non-pad tokens
        per prediction); every value is rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Defensive: if a tuple is handed over, assume the generated ids come
    # first - TODO confirm against the trainer version in use.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id, so swap it for the pad id before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLsum expects sentences separated by newlines.
    # NOTE(review): nltk.sent_tokenize presumably needs the punkt data to be
    # downloaded beforehand - confirm on a fresh environment.
    decoded_preds = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ['\n'.join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # The `evaluate` ROUGE metric returns plain float scores, so there is no
    # `.mid.fmeasure` attribute to unwrap; just scale to percentages.
    result = {key: value * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


In [20]:
# Wire the model, data, collator and metric function into a single trainer.
# NOTE(review): newer transformers releases appear to prefer
# `processing_class=` over `tokenizer=` - confirm for the installed version.
trainer = Seq2SeqTrainer(
    # model and run configuration
    model=model,
    args=model_args,
    # data
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset['validation'],
    # batching, decoding and scoring
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [21]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()

  0%|          | 2/12753 [05:05<523:13:18, 147.72s/it]