# `Fine-Tuning mBART MLSUM`


In [1]:
MODEL_NAME = 'lincoln/mbart-mlsum-automatic-summarization'

In [2]:
%pip install peft

%pip install -q -U bitsandbytes
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install -q -U git+https://github.com/huggingface/peft.git
%pip install -q -U git+https://github.com/huggingface/accelerate.git
%pip install -q datasets

%pip install rouge_score

Defaulting to user installation because normal site-packages is not writeable
Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate, peft
Successfully installed accelerate-0.28.0 peft-0.9.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart 

In [2]:
import pandas as pd
import datasets
import torch

from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, SummarizationPipeline

  from .autonotebook import tqdm as notebook_tqdm
2024-03-16 06:37:32.665262: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-16 06:37:32.665300: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-16 06:37:32.665916: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-16 06:37:32.669528: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## `1` Data preparation


In [3]:
path = 'data/'
train_df = pd.read_csv(path + 'train.csv')
validation_df = pd.read_csv(path + 'validation.csv')
test_df = pd.read_csv(path + 'test_text.csv')

print(f'Loaded {len(train_df)} samples')
train_df.head()

Loaded 21401 samples


Unnamed: 0,text,titles
0,Thierry Mariani sur la liste du Rassemblement ...,L'information n'a pas été confirmée par l'inté...
1,C'est désormais officiel : Alain Juppé n'est p...,Le maire de Bordeaux ne fait plus partie des R...
2,La mesure est décriée par les avocats et les m...,"En 2020, les tribunaux d'instance fusionnent a..."
3,Dans une interview accordée au Figaro mercredi...,"Les médecins jugés ""gros prescripteurs d'arrêt..."
4,Le préjudice est estimé à 2 millions d'euros. ...,Il aura fallu mobiliser 90 gendarmes pour cett...


- Tokenize text and titles


In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.eos_token = tokenizer.pad_token

In [5]:
prompt_template = "### Summarize: {text} \n### Summary:"

# train
train_dataset = {'prompt': [], 'target': []}
for i in range(len(train_df)):
    train_dataset['prompt'].append(prompt_template.format(text=train_df.iloc[i]['text']))
    train_dataset['target'].append(train_df.iloc[i]['titles'])

print(f'Created {len(train_dataset["prompt"])} samples for train_dataset')

# validation
validation_dataset = {'prompt': [], 'target': []}
for i in range(len(validation_df)):
    validation_dataset['prompt'].append(prompt_template.format(text=validation_df.iloc[i]['text']))
    validation_dataset['target'].append(validation_df.iloc[i]['titles'])

print(f'Created {len(validation_dataset["prompt"])} samples for validation_dataset')

Created 21401 samples for train_dataset
Created 1500 samples for validation_dataset


In [6]:
# tokenize function
def tokenize_function(examples):
    tokenizer_inputs = tokenizer(
        examples['prompt'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    tokenizer_inputs['labels'] = tokenizer(
        examples['target'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )['input_ids']
    return tokenizer_inputs

In [7]:
# create dataset from the list train_dataset
tokenized_train = datasets.Dataset.from_dict(train_dataset)
tokenized_train = tokenized_train.map(
    tokenize_function,
    batched=True,
    batch_size=4,
    drop_last_batch=True,
)

# create dataset from the list validation_dataset
tokenized_validation = datasets.Dataset.from_dict(validation_dataset)
tokenized_validation = tokenized_validation.map(
    tokenize_function,
    batched=True,
    batch_size=4,
    drop_last_batch=True,
)

Map: 100%|██████████| 21400/21400 [00:19<00:00, 1091.85 examples/s]
Map: 100%|██████████| 1500/1500 [00:01<00:00, 1085.05 examples/s]


## `2` PEFT with LORA


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [9]:
# base model
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

  return self.fget.__get__(instance, owner)()


In [10]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    target_modules= ['k_proj', 'v_proj', 'q_proj'],
    r=2,
    lora_alpha=32,
    lora_dropout=0.1,
    )

model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


trainable params: 442,368 || all params: 611,321,856 || trainable%: 0.07236253630689755


In [11]:
# reduced_train = tokenized_train.select(range(10000))
# reduced_train

In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="lora/mbart-mlsum-automatic-summarization",
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


- rougeL on train and validation dataset before fine-tuning


In [15]:
# create the pipeline
summarizer = SummarizationPipeline(model=base_model, tokenizer=tokenizer)

from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeL'])

In [17]:
# generate titles for train
generated_train_titles = []
batch_size = 8
N = 200

for i in tqdm(range(0, min(N, len(tokenized_train['prompt'])), batch_size)):
    # max length of the input text is 512 based on FirstAnalysis.ipynb
    batch = [text[:512] for text in tokenized_train['prompt'][i:i+batch_size]]
    titles = summarizer(batch, max_length=39, min_length=24, num_beams=4, length_penalty=2.0, early_stopping=True, top_k=50)
    generated_train_titles.extend([title['summary_text'] for title in titles])

100%|██████████| 25/25 [09:47<00:00, 23.49s/it]


In [18]:
# ROUGE-L score for train
train_rouge = []
for i in range(N):
    rouge_score = scorer.score(generated_train_titles[i], tokenized_train['target'][i])['rougeL'][2]
    train_rouge.append(rouge_score)

avg_train_rouge = sum(train_rouge) / len(train_rouge)
print("Average Rouge-L F-Score with mBART-MLSUM-2000:", avg_train_rouge)

Average Rouge-L F-Score with mBART-MLSUM-2000: 0.1824078178390139


In [19]:
# generate titles for validation
generated_validation_titles = []
batch_size = 8
N = 200

for i in tqdm(range(0, min(N, len(tokenized_validation['prompt'])), batch_size)):
    # max length of the input text is 512 based on FirstAnalysis.ipynb
    batch = [text[:512] for text in tokenized_validation['prompt'][i:i+batch_size]]
    titles = summarizer(batch, max_length=39, min_length=24, num_beams=4, length_penalty=2.0, early_stopping=True, top_k=50)
    generated_validation_titles.extend([title['summary_text'] for title in titles])

100%|██████████| 25/25 [09:40<00:00, 23.24s/it]


In [20]:
# ROUGE-L score for validation
validation_rouge = []
for i in range(N):
    rouge_score = scorer.score(generated_validation_titles[i], tokenized_validation['target'][i])['rougeL'][2]
    validation_rouge.append(rouge_score)

avg_validation_rouge = sum(validation_rouge) / len(validation_rouge)
print("Average Rouge-L F-Score with mBART-MLSUM-2000:", avg_validation_rouge)

Average Rouge-L F-Score with mBART-MLSUM-2000: 0.18910661894374978


- fine-tune the model


In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7984,0.764578
2,0.7743,0.753759
3,0.7614,0.742824
4,0.7396,0.735815
5,0.7275,0.73385


TrainOutput(global_step=13375, training_loss=0.7693331584039136, metrics={'train_runtime': 5573.6408, 'train_samples_per_second': 19.198, 'train_steps_per_second': 2.4, 'total_flos': 1.16086850715648e+17, 'train_loss': 0.7693331584039136, 'epoch': 5.0})

- save the model


In [14]:
save_path = 'lora/mbart-mlsum-5-epochs'
# save the model

trainer.save_model(save_path)
import shutil
shutil.make_archive(save_path, 'zip', save_path)

'/users/eleves-a/2021/abasse.dabere/Desktop/Notebooks/lora/mbart-mlsum-5-epochs.zip'

## `3` Test the train_model


- load the model


In [15]:
# load the model
model = AutoModelForSeq2SeqLM.from_pretrained(save_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# create the pipeline
summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer)

- generate titles


In [16]:
# generate titles for train
generated_train_titles = []
batch_size = 8
N = len(tokenized_train['prompt'])

for i in tqdm(range(0, min(N, len(tokenized_train['prompt'])), batch_size)):
    # max length of the input text is 512 based on FirstAnalysis.ipynb
    batch = [text[:512] for text in tokenized_train['prompt'][i:i+batch_size]]
    titles = summarizer(batch, max_length=39, min_length=24, num_beams=4, length_penalty=2.0, early_stopping=True, top_k=50)
    generated_train_titles.extend([title['summary_text'] for title in titles])

 27%|██▋       | 726/2675 [4:49:53<12:58:15, 23.96s/it]


KeyboardInterrupt: 

In [20]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeL'])

In [21]:
# ROUGE-L score for train
N = len(generated_train_titles)
train_rouge = []
for i in range(N):
    rouge_score = scorer.score(generated_train_titles[i], tokenized_train['target'][i])['rougeL'][2]
    train_rouge.append(rouge_score)

avg_train_rouge = sum(train_rouge) / len(train_rouge)
print("Average Rouge-L F-Score with mBART-MLSUM-2000:", avg_train_rouge)

Average Rouge-L F-Score with mBART-MLSUM-2000: 0.2085607698985734


In [22]:
# generate titles for validation
generated_validation_titles = []
batch_size = 8
N = len(tokenized_validation['prompt'])

for i in tqdm(range(0, min(N, len(tokenized_validation['prompt'])), batch_size)):
    # max length of the input text is 512 based on FirstAnalysis.ipynb
    batch = [text[:512] for text in tokenized_validation['prompt'][i:i+batch_size]]
    titles = summarizer(batch, max_length=39, min_length=24, num_beams=4, length_penalty=2.0, early_stopping=True, top_k=50)
    generated_validation_titles.extend([title['summary_text'] for title in titles])

100%|██████████| 188/188 [1:14:27<00:00, 23.76s/it]


In [23]:
# ROUGE-L score for validation
validation_rouge = []
for i in range(N):
    rouge_score = scorer.score(generated_validation_titles[i], tokenized_validation['target'][i])['rougeL'][2]
    validation_rouge.append(rouge_score)

avg_validation_rouge = sum(validation_rouge) / len(validation_rouge)
print("Average Rouge-L F-Score with mBART-MLSUM-2000:", avg_validation_rouge)

Average Rouge-L F-Score with mBART-MLSUM-2000: 0.2086499917307663


In [27]:
# submission

def tokenize_function2(examples):
    tokenizer_inputs = tokenizer(
        examples['prompt'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    return tokenizer_inputs

test_dataset = {'prompt': []}
for i in range(len(test_df)):
    test_dataset['prompt'].append(prompt_template.format(text=test_df.iloc[i]['text']))

print(f'Created {len(test_dataset["prompt"])} samples for test_dataset')

# create dataset from the list test_dataset
tokenized_test = datasets.Dataset.from_dict(test_dataset)
tokenized_test = tokenized_test.map(
    tokenize_function2,
    batched=True,
    batch_size=4,
    drop_last_batch=True,
)

Created 1500 samples for test_dataset


Map: 100%|██████████| 1500/1500 [00:01<00:00, 893.27 examples/s]


In [28]:
tokenized_test

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask'],
    num_rows: 1500
})

In [29]:
# generate titles for test
generated_test_titles = []
batch_size = 8
N = len(tokenized_test['prompt'])

for i in tqdm(range(0, min(N, len(tokenized_test['prompt'])), batch_size)):
    # max length of the input text is 512 based on FirstAnalysis.ipynb
    batch = [text[:512] for text in tokenized_test['prompt'][i:i+batch_size]]
    titles = summarizer(batch, max_length=39, min_length=24, num_beams=4, length_penalty=2.0, early_stopping=True, top_k=50)
    generated_test_titles.extend([title['summary_text'] for title in titles])

100%|██████████| 188/188 [1:14:19<00:00, 23.72s/it]


In [31]:
generated_test_titles[:5]

["Le projet de loi bioéthique a été voté mercredi à l'Assemblée nationale dans le cadre de l'examen en première lecture du projet de loi bioéthique.",
 'Le président du MoDem dénonce un "manquement absolu" aux promesses initiales du chef de l\'Etat.',
 "Le quotidien régional La Dépêche révèle les dessous d'une famille qui s'est déchirée à cause d'un héritage.",
 'Le fondateur de Facebook a décidé de se mettre à la course à pied en apprenant que sa compagne attendait un bébé.',
 "Selon le Journal du Dimanche, un ténor de la droite a demandé son avis à l'économiste Alain Minc."]

In [37]:
# Store the generated summaries in the Kaggle-accepted format
submission_df = pd.DataFrame({
    'ID': test_df['ID'],
    'titles': generated_test_titles,
})

submission_df.head()

Unnamed: 0,ID,titles
0,0,Le projet de loi bioéthique a été voté mercred...
1,1,"Le président du MoDem dénonce un ""manquement a..."
2,2,Le quotidien régional La Dépêche révèle les de...
3,3,Le fondateur de Facebook a décidé de se mettre...
4,4,"Selon le Journal du Dimanche, un ténor de la d..."


In [39]:
submission_df.to_csv('results/lora-mbart_mlsum-5_epochs.csv', index=False)

<span style="color: red;"> Next ? Many many epochs, try another approach </span>
