# `Fine-Tuning t5-base-fr-sum-cnndm`


In [1]:
MODEL_NAME = 'plguillou/t5-base-fr-sum-cnndm'

In [2]:
%pip install peft

%pip install -q -U bitsandbytes
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install -q -U git+https://github.com/huggingface/peft.git
%pip install -q -U git+https://github.com/huggingface/accelerate.git
%pip install -q datasets

%pip install rouge_score

In [3]:
import pandas as pd
import numpy as np
import datasets
import torch

from tqdm import tqdm
from rouge_score import rouge_scorer

from transformers import T5Tokenizer, T5ForConditionalGeneration, SummarizationPipeline

  from .autonotebook import tqdm as notebook_tqdm
2024-03-23 16:13:17.136685: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-23 16:13:18.590834: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-23 16:13:18.590915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-23 16:13:18.845925: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-23 16:13:19.3

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# rouge score
scorer = rouge_scorer.RougeScorer(['rougeL'])

def calculate_rouge_score(reference, generated):
    rouge_score = scorer.score(generated, reference)['rougeL'][2]
    return rouge_score

# load data
print('########## Loading data ##########')
path = 'data/'
train_df = pd.read_csv(path + 'train.csv')
validation_df = pd.read_csv(path + 'validation.csv')
test_df = pd.read_csv(path + 'test_text.csv')

# data preparation
print('########## Data preparation ##########')
train_df['text'] = 'summarize: ' + train_df['text']
validation_df['text'] = 'summarize: ' + validation_df['text']
test_df['text'] = 'summarize: ' + test_df['text']

# convert to dataset
train_dataset = datasets.Dataset.from_pandas(train_df)
validation_dataset = datasets.Dataset.from_pandas(validation_df)
test_dataset = datasets.Dataset.from_pandas(test_df)

print(f'Train dataset size: {len(train_dataset["text"])}')
print(f'Validation dataset size: {len(validation_dataset["text"])}')
print(f'Test dataset size: {len(test_dataset["text"])}')

cuda
########## Loading data ##########
########## Data preparation ##########
Train dataset size: 21401
Validation dataset size: 1500
Test dataset size: 1500


### `MODEL AND TOKENIZED DATASET`


In [5]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()


In [None]:
max_text_length = 512
max_title_length = 68

In [6]:
def tokenize_function(examples):
    tokenizer_inputs = tokenizer(
        examples['text'],
        padding='max_length',
        max_length=max_text_length,
        truncation=True,
        return_tensors='pt'
    )
    if 'titles' not in examples:
        return {
            'input_ids': tokenizer_inputs['input_ids'].squeeze(),
            'attention_mask': tokenizer_inputs['attention_mask'].squeeze()
        }
    tokenizer_outputs = tokenizer(
        examples['titles'],
        padding='max_length',
        max_length=max_title_length,
        truncation=True,
        return_tensors='pt'
    )
    return {
        'input_ids': tokenizer_inputs['input_ids'].squeeze(),
        'attention_mask': tokenizer_inputs['attention_mask'].squeeze(),
        'labels': tokenizer_outputs['input_ids'].squeeze()
    }
        
# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 21401/21401 [00:20<00:00, 1046.45 examples/s]
Map: 100%|██████████| 1500/1500 [00:01<00:00, 1070.92 examples/s]
Map: 100%|██████████| 1500/1500 [00:01<00:00, 1153.85 examples/s]


### `GENERATE TITLES`


In [7]:
def generate_train_validation_titles(model, train_dataset, validation_dataset):
    model.to(device)
    batch_size = 8
    N = min(100, len(train_dataset))
    # generate titles for train with batch size 16
    generated_train_titles = []
    for i in tqdm(range(0, N, batch_size)):
        batch = train_dataset[i:i+batch_size]
        generated_titles = model.generate(
            input_ids=torch.tensor(batch['input_ids']).to(device),
            max_length=max_title_length,
            num_beams=4,
            early_stopping=True
        )
        generated_titles = tokenizer.batch_decode(generated_titles, skip_special_tokens=True)
        generated_train_titles.extend(generated_titles)

    # generate titles for validation with batch size 16
    generated_validation_titles = []
    N = min(100, len(validation_dataset))
    for i in tqdm(range(0, N, batch_size)):
        batch = validation_dataset[i:i+batch_size]
        generated_titles = model.generate(
            input_ids=torch.tensor(batch['input_ids']).to(device),
            max_length=max_title_length,
            num_beams=4,
            early_stopping=True
        )
        generated_titles = tokenizer.batch_decode(generated_titles, skip_special_tokens=True)
        generated_validation_titles.extend(generated_titles)

    return generated_train_titles, generated_validation_titles

def generate_test_titles(model, test_dataset):
    model.to(device)
    batch_size = 8
    N = len(test_dataset)
    # generate titles for test with batch size 16
    generated_test_titles = []
    for i in tqdm(range(0, N, batch_size)):
        batch = test_dataset[i:i+batch_size]
        generated_titles = model.generate(
            input_ids=torch.tensor(batch['input_ids']).to(device),
            max_length=max_title_length,
            num_beams=4,
            early_stopping=True
        )
        generated_test_titles.extend(generated_titles)

    return generated_test_titles

def calculate_rouge(generated, references):
    # rouge score for train
    rouge_scores = []
    N = len(generated)
    for i in tqdm(range(N)):
        rouge_score = calculate_rouge_score(references[i], generated[i])
        rouge_scores.append(rouge_score)
    avg_rouge_scores = sum(rouge_scores) / N
    return avg_rouge_scores

def create_submission_file(generated_test_titles, name='submission.csv'):
    # create submission file
    submission_df = pd.DataFrame({
        'ID': test_df['ID'],
        'titles': generated_test_titles,
    })
    submission_df.to_csv(f'results/{name}', index=False)

In [8]:
# generate titles for train and validation
print('########## Generating titles for train and validation ##########')
generated_train_titles, generated_validation_titles = generate_train_validation_titles(model, train_dataset, validation_dataset)
generated_test_titles = generate_test_titles(model, test_dataset)

########## Generating titles for train and validation ##########


100%|██████████| 13/13 [01:42<00:00,  7.85s/it]
100%|██████████| 13/13 [01:42<00:00,  7.85s/it]
100%|██████████| 188/188 [24:50<00:00,  7.93s/it]


In [9]:
# calculate rouge score
print('########## Calculating rouge score ##########')
train_rouge_score = calculate_rouge(generated_train_titles, train_df['titles'])
validation_rouge_score = calculate_rouge(generated_validation_titles, validation_df['titles'])

print(f'Train rouge score: {train_rouge_score}')
print(f'Validation rouge score: {validation_rouge_score}')

########## Calculating rouge score ##########


100%|██████████| 104/104 [00:00<00:00, 2291.70it/s]
100%|██████████| 104/104 [00:00<00:00, 2157.49it/s]

Train rouge score: 0.1942314133696225
Validation rouge score: 0.18356194110789598





### `PEFT with LORA`


In [8]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import TrainingArguments, Trainer

output_dir = f'lora/{MODEL_NAME}'

In [9]:
# lora config

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    target_modules= ['q', 'k', 'v', 'o'], # check the model summary to get the names of the modules
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    )

peft_model = get_peft_model(model, peft_config).to(device)
peft_model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 226,422,528 || trainable%: 1.5629822841656462


In [10]:
# Training arguments and trainer

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [11]:
# train model
# history = trainer.train()

Epoch,Training Loss,Validation Loss
1,1.6265,1.469039
2,1.555,1.436459
3,1.5181,1.433633
4,1.4826,1.409938
5,1.4655,1.400498
6,1.4518,1.397835
7,1.4151,1.38743
8,1.404,1.398021
9,1.382,1.392312
10,1.3778,1.393764


KeyboardInterrupt: 

### `TEST LORA_MODEL`


In [None]:
checkpoint = # TODO: checkpoint number
checkpoint = f'{output_dir}/checkpoint-{checkpoint}'
cpt_model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)

In [None]:
# generate titles for train and validation
print('########## Generating titles for train and validation ##########')
generated_train_titles, generated_validation_titles = generate_train_validation_titles(cpt_model, train_dataset, validation_dataset)
generated_test_titles = generate_test_titles(cpt_model, test_dataset)

# calculate rouge score
print('########## Calculating rouge score ##########')
train_rouge_score = calculate_rouge(generated_train_titles, train_df['titles'])
validation_rouge_score = calculate_rouge(generated_validation_titles, validation_df['titles'])

print(f'Train rouge score: {train_rouge_score}')
print(f'Validation rouge score: {validation_rouge_score}')