# `Fine-Tuning mBART MLSUM`


In [1]:
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "lincoln/mbart-mlsum-automatic-summarization"

In [2]:
# %pip install peft

# %pip install -q -U bitsandbytes
# %pip install -q -U git+https://github.com/huggingface/transformers.git
# %pip install -q -U git+https://github.com/huggingface/peft.git
# %pip install -q -U git+https://github.com/huggingface/accelerate.git
# %pip install -q datasets

# %pip install rouge_score

In [3]:
import pandas as pd
import datasets
import torch

from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, SummarizationPipeline

## `1` Data preparation


In [4]:
# Folder that holds the training CSV; `path` is reused further down to locate
# the fine-tuned weights.
path = '../recup/'
train_csv = path + '2000_train_samples.csv'

df = pd.read_csv(train_csv)
print(f'Loaded {len(df)} samples')

# Preview the first rows (columns used below: `text` and `titles`).
df.head()

Loaded 2000 samples


Unnamed: 0,text,titles
0,"Les ""gilets jaunes"" et leurs colères font leur...",Quelques tensions ont éclaté place Wagram à Pa...
1,"Parmi ces blessés, trois ont été hospitalisés,...",Une dizaine de personnes ont été blessées vend...
2,"L'ancienne patronne d'EELV a, elle, raillé son...","Benoît Hamon et Cécile Duflot, tous les deux c..."
3,"« C'était comme un coup de canon », se souvien...","Selon les informations du Parisien, plusieurs ..."
4,"À 85 ans, Bernadette Chirac est obligée de se ...",L'ancienne Première dame et épouse de Jacques ...


- Tokenize text and titles


In [5]:
# Load the tokenizer that ships with the checkpoint and keep its special tokens
# as they are. The former `tokenizer.eos_token = tokenizer.pad_token` override
# made end-of-sequence and padding the same token, so encoded targets no longer
# carried a real EOS marker. It was also inconsistent with the evaluation cell,
# which reloads the tokenizer without that override.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [6]:
# Wrap every article in the instruction template; the matching title is the target.
prompt_template = "### Summarize: {text} \n### Summary:"
finetuning_dataset = {
    'prompt': [prompt_template.format(text=article) for article in df['text']],
    'target': [title for title in df['titles']],
}

print(f'Created {len(finetuning_dataset["prompt"])} samples for fine-tuning')

Created 2000 samples for fine-tuning


In [7]:
# tokenize function
def tokenize_function(examples):
    """Tokenize prompts and targets for seq2seq fine-tuning.

    Args:
        examples: mapping with a 'prompt' and a 'target' entry. Each entry is
            either a list of strings (batched `Dataset.map`) or a single string.

    Returns:
        The prompt encoding (padded/truncated to 512 tokens) with an extra
        'labels' entry: the target token ids (padded/truncated to 128 tokens)
        where every padded position is replaced by -100 so that the loss
        ignores it.
    """
    tokenizer_inputs = tokenizer(
        examples['prompt'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    target_encoding = tokenizer(
        examples['target'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )

    def _mask_padding(token_ids, attention_mask):
        # The attention mask (not the pad id) decides what is padding, so a
        # genuine token that happens to share the pad id is never masked.
        return [
            token_id if keep else -100
            for token_id, keep in zip(token_ids, attention_mask)
        ]

    if isinstance(examples['target'], str):
        # Un-batched call: the encoding holds one flat sequence.
        labels = _mask_padding(
            target_encoding['input_ids'], target_encoding['attention_mask']
        )
    else:
        labels = [
            _mask_padding(token_ids, attention_mask)
            for token_ids, attention_mask in zip(
                target_encoding['input_ids'], target_encoding['attention_mask']
            )
        ]

    tokenizer_inputs['labels'] = labels
    return tokenizer_inputs

In [8]:
# Turn the prompt/target lists into a Hugging Face dataset and tokenize it in
# batches of 4 examples.
tokenized_datasets = datasets.Dataset.from_dict(finetuning_dataset).map(
    tokenize_function,
    batched=True,
    batch_size=4,
    drop_last_batch=True,
)

# Hold out 30% of the samples for evaluation; the fixed seed keeps the shuffle
# reproducible.
split_dataset = tokenized_datasets.train_test_split(
    test_size=0.3,
    shuffle=True,
    seed=42,
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## `2` PEFT with LoRA


In [9]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [10]:
# Load the pretrained seq2seq checkpoint, then move it to the selected device.
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
base_model = base_model.to(device)

In [11]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings: rank-2 updates on the attention key/value/query
# projections, configured for training (not inference) on a seq2seq model.
peft_config = LoraConfig(
    r=2,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['k_proj', 'v_proj', 'q_proj'],
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
)

# Wrap the base model with the adapters and report how many parameters will train.
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 442,368 || all params: 611,321,856 || trainable%: 0.07236253630689755


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # where checkpoints are written
    output_dir="lora/mbart-mlsum-automatic-summarization",
    # optimisation
    num_train_epochs=1,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # evaluate and checkpoint once per epoch so the best checkpoint can be
    # restored when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Train on the 70% split and evaluate on the held-out 30%.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


- fine-tune the model


In [13]:
# trainer.train()

- save the model


In [14]:
# save the model
# trainer.save_model('lora/mbart-mlsum-2000')
# import shutil
# shutil.make_archive('lora/mbart-mlsum-2000', 'zip', 'lora/mbart-mlsum-2000')

## `3` Test the trained model


- load the model


In [15]:
# Load the fine-tuned weights from disk and a fresh tokenizer from the base
# checkpoint. Note that this rebinds `model` and `tokenizer` from the training cells.
model = AutoModelForSeq2SeqLM.from_pretrained(path + 'mbart-mlsum-2000').to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Create the pipeline on the same device as the model. Without an explicit
# `device`, where the pipeline places its inputs depends on the installed
# transformers version and may not match `.to(device)` above.
summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer, device=device)

- generate titles


In [16]:
# Generate titles for the first N texts of the held-out split.
generated_titles = []
batch_size = 8
N = 100

# Read the column once and keep at most N prompts. Slicing up front means the
# last batch cannot run past N (previously 104 titles were generated for
# N=100), and the column is no longer rebuilt on every iteration.
test_prompts = split_dataset['test']['prompt'][:N]

for i in tqdm(range(0, len(test_prompts), batch_size)):
    # Keep only the first 512 characters of each prompt (a character cut, not
    # a token cut); the 512 limit comes from FirstAnalysis.ipynb.
    batch = [text[:512] for text in test_prompts[i:i+batch_size]]
    titles = summarizer(batch, max_length=39, min_length=24, num_beams=4, length_penalty=2.0, early_stopping=True, no_repeat_ngram_size=3, top_k=50)
    generated_titles.extend([title['summary_text'] for title in titles])

100%|██████████| 13/13 [13:33<00:00, 62.55s/it]


- ROUGE-L score on the test data


In [17]:
from rouge_score import rouge_scorer

# Only the ROUGE-L metric is requested.
scorer = rouge_scorer.RougeScorer(rouge_types=['rougeL'])

In [18]:
# Compute the ROUGE-L F-measure between each reference title and its generated title.
# - `scorer.score` takes (target, prediction): the reference goes first.
# - The reference column is read once instead of on every iteration.
# - `zip` stops at the shorter sequence, so no IndexError is raised if fewer
#   than N titles are available.
reference_titles = split_dataset['test']['target'][:N]

mbart_mlsum_2000_rouge = [
    scorer.score(reference, generated)['rougeL'].fmeasure
    for reference, generated in zip(reference_titles, generated_titles)
]

avg_rouge_score_mbart_mlsum_2000 = sum(mbart_mlsum_2000_rouge) / len(mbart_mlsum_2000_rouge)
print("Average Rouge-L F-Score with mBART-MLSUM-2000:", avg_rouge_score_mbart_mlsum_2000)

Average Rouge-L F-Score with mBART-MLSUM-2000: 0.20661475710424512


<span style="color: red;"> Next step: train on more data and for more epochs. </span>
