<center>
<h1><br/></h1>
<h1>INF 582: Introduction To Text Mining And NLP</h1>
<h2>Challenge: News Articles Title Generation</h2>
<h3>Notebook: Fine-Tuning the 1st model: mbart-mlsum-automatic-summarization</h3>
<h4>Student Names: <br>
<b>DABERE Abasse<br>
FUERTES PANIZO Cynthia Yacel</b> </h4>
<br>
</center>

# `Fine-Tuning mbart-mlsum-automatic-summarization`


In [None]:
# Identifier of the pretrained checkpoint; passed to from_pretrained for both the tokenizer and the base model.
MODEL_NAME = 'lincoln/mbart-mlsum-automatic-summarization'

In [None]:
%pip install -q -U bitsandbytes
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install -q -U git+https://github.com/huggingface/peft.git
%pip install -q -U git+https://github.com/huggingface/accelerate.git
%pip install -q datasets

In [None]:
# Libraries for data analysis and visualization
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

# Deep learning and natural language processing
import torch
torch.cuda.empty_cache()  # Clears CUDA cache
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, SummarizationPipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import TrainingArguments, Trainer
import datasets

# Utilities and text preprocessing
from tqdm import tqdm
import re

# Model evaluation and custom models
from rouge_score import rouge_scorer
from peft import LoraConfig, TaskType, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm
2024-03-30 19:47:40.322739: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-30 19:47:40.322777: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-30 19:47:40.323464: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-30 19:47:40.327717: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [None]:
# rouge score
# A single ROUGE-L scorer instance, reused by calculate_rouge_score.
scorer = rouge_scorer.RougeScorer(['rougeL'])

def calculate_rouge_score(reference, generated):
    """Return the ROUGE-L F-measure of a generated title against its reference.

    Args:
        reference: Ground-truth title text.
        generated: Model-produced title text.

    Returns:
        The `fmeasure` field of the ROUGE-L score.
    """
    # RougeScorer.score takes (target, prediction). Passing them the other way
    # round swaps precision and recall; the F-measure happens to be unaffected,
    # but the call now matches the API so the other fields are meaningful too.
    return scorer.score(reference, generated)['rougeL'].fmeasure

In [None]:
# tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The special tokens shipped with the checkpoint are kept untouched.
# Overwriting eos_token with pad_token would make end-of-sequence and padding
# indistinguishable to any code that consults tokenizer.eos_token_id.

## `1 DATA PREPARATION`


In [None]:
# Directory holding the challenge CSV files (relative to the working directory)
path = "data/"

# Read the three splits, each into its own DataFrame
train_df, validation_df, test_df = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'validation.csv', 'test_text.csv')
)

In [None]:
# Instruction-style wrapper placed around every article before tokenization.
prompt_template = "### Summarize: {text} \n### Summary:"

def create_dataset_dict(df):
    """Turn a DataFrame of articles into the column dictionary used to build a dataset.

    Args:
        df: DataFrame with a 'text' column and, optionally, a 'titles' column.

    Returns:
        A dict with a 'prompt' list (one formatted prompt per row) and, only
        when `df` has a 'titles' column, a 'target' list holding those titles.
    """
    # Work column-wise: indexing row by row with df.iloc builds a temporary
    # Series for every row, which is needlessly slow on large frames.
    dataset = {
        'prompt': [prompt_template.format(text=text) for text in df['text'].tolist()]
    }
    if 'titles' in df.columns:
        dataset['target'] = df['titles'].tolist()
    return dataset

In [None]:
# Token budgets for the encoder input (prompt) and the label sequence (title).
max_text_length = 512
max_title_length = 128

# tokenize function
def tokenize_function(examples):
    """Tokenize prompts and, when present, their target titles.

    Args:
        examples: Mapping with a 'prompt' entry and optionally a 'target'
            entry; each is either a single string or a list of strings.

    Returns:
        The tokenizer output for the prompts, padded/truncated to
        `max_text_length`. When 'target' is present, a 'labels' entry is added
        holding the title token ids padded/truncated to `max_title_length`,
        with every padding position replaced by -100.
    """
    tokenizer_inputs = tokenizer(
        examples['prompt'],
        padding='max_length',
        truncation=True,
        max_length=max_text_length,
    )
    if 'target' in examples:
        label_ids = tokenizer(
            examples['target'],
            padding='max_length',
            truncation=True,
            max_length=max_title_length,
        )['input_ids']

        pad_id = tokenizer.pad_token_id

        def mask_padding(sequence):
            # -100 is the label value the cross-entropy loss ignores; leaving
            # the pad id in place would train the model to predict padding.
            return [-100 if token_id == pad_id else token_id for token_id in sequence]

        if isinstance(examples['target'], str):
            # Un-batched call: a single flat list of ids.
            tokenizer_inputs['labels'] = mask_padding(label_ids)
        else:
            tokenizer_inputs['labels'] = [mask_padding(sequence) for sequence in label_ids]
    return tokenizer_inputs

In [None]:
def create_dataset(df):
    """Build a tokenized dataset from a DataFrame of articles.

    Args:
        df: DataFrame accepted by `create_dataset_dict`.

    Returns:
        A `datasets.Dataset` containing the prompt (and target, if any)
        columns plus the columns produced by `tokenize_function`.
    """
    dataset_dict = create_dataset_dict(df)
    dataset = datasets.Dataset.from_dict(dataset_dict)
    # drop_last_batch is left at its default (False). Dropping the incomplete
    # final batch would silently discard rows whenever len(df) is not a
    # multiple of batch_size, leaving the dataset shorter than its DataFrame.
    return dataset.map(
        tokenize_function,
        batched=True,
        batch_size=4,
    )

In [None]:
# Tokenize the training split
train_dataset = create_dataset(train_df)
print(f'Created {len(train_dataset)} samples for train_dataset')

# Tokenize the validation split
validation_dataset = create_dataset(validation_df)
print(f'Created {len(validation_dataset)} samples for validation_dataset')

# Tokenize the test split (no targets available)
test_dataset = create_dataset(test_df)
print(f'Created {len(test_dataset)} samples for test_dataset')

Map: 100%|██████████| 21400/21400 [00:24<00:00, 861.41 examples/s]


Created 21400 samples for train_dataset


Map: 100%|██████████| 1500/1500 [00:01<00:00, 1025.71 examples/s]


Created 1500 samples for validation_dataset


Map: 100%|██████████| 1500/1500 [00:01<00:00, 1228.24 examples/s]

Created 1500 samples for test_dataset





In [None]:
def _generate_batched(pipeline, dataset, limit, batch_size):
    """Generate one title per prompt for the first `limit` rows of `dataset`.

    Args:
        pipeline: Summarization pipeline used to produce the titles.
        dataset: Dataset exposing a 'prompt' column.
        limit: Maximum number of prompts to process; None processes every row.
        batch_size: Number of prompts handed to the pipeline per call.

    Returns:
        List of generated title strings, in dataset order.
    """
    # Read the prompt column once instead of re-reading it on every iteration.
    prompts = dataset[:limit]['prompt']
    generated_titles = []
    for start in tqdm(range(0, len(prompts), batch_size)):
        batch = prompts[start:start + batch_size]
        # truncation=True lets the pipeline's tokenizer shorten over-long
        # prompts by tokens. Slicing the string to max_text_length *characters*
        # cut the article far below the token budget and removed the trailing
        # "### Summary:" part of the prompt.
        titles = pipeline(
            batch,
            truncation=True,
            max_length=max_title_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
            top_k=50,
        )
        generated_titles.extend(title['summary_text'] for title in titles)
    return generated_titles


def generate_titles(model, N=200, batch_size = 8):
    """Generate titles for the first N train and validation prompts.

    Args:
        model: Seq2seq model wrapped in a summarization pipeline.
        N: Maximum number of prompts taken from each split.
        batch_size: Number of prompts handed to the pipeline per call.

    Returns:
        A tuple `(generated_train_titles, generated_validation_titles)`,
        train first, validation second. Each list holds at most N titles.
    """
    pipeline = SummarizationPipeline(model, tokenizer, device=device)
    generated_train_titles = _generate_batched(pipeline, train_dataset, N, batch_size)
    generated_validation_titles = _generate_batched(pipeline, validation_dataset, N, batch_size)
    return generated_train_titles, generated_validation_titles


def generate_test_titles(model, batch_size = 8):
    """Generate a title for every prompt of the test split.

    Args:
        model: Seq2seq model wrapped in a summarization pipeline.
        batch_size: Number of prompts handed to the pipeline per call.

    Returns:
        List of generated title strings, one per test row, in dataset order.
    """
    pipeline = SummarizationPipeline(model, tokenizer, device=device)
    return _generate_batched(pipeline, test_dataset, None, batch_size)

In [None]:
def _average_rouge(reference_titles, generated_titles):
    """Average the ROUGE-L score of generated titles against a reference column.

    Args:
        reference_titles: pandas Series of reference titles; the i-th generated
            title is compared with the i-th row (by position).
        generated_titles: Sequence of generated title strings.

    Returns:
        The mean score, or 0.0 when `generated_titles` is empty.
    """
    scores = []
    for position in tqdm(range(len(generated_titles))):
        # .iloc selects by position, so a non-default DataFrame index cannot
        # pair a generated title with the wrong reference.
        reference = reference_titles.iloc[position]
        scores.append(calculate_rouge_score(reference, generated_titles[position]))
    if not scores:
        # Nothing was generated: avoid dividing by zero.
        return 0.0
    return sum(scores) / len(scores)


def calculate_rouge(generated_train_titles, generated_validation_titles):
    """Compute the average ROUGE-L score on the train and validation splits.

    Args:
        generated_train_titles: Titles generated for the first rows of `train_df`.
        generated_validation_titles: Titles generated for the first rows of
            `validation_df`.

    Returns:
        A tuple `(avg_train_rouge_score, avg_validation_rouge_score)`.
    """
    avg_train_rouge_score = _average_rouge(train_df['titles'], generated_train_titles)
    avg_validation_rouge_score = _average_rouge(validation_df['titles'], generated_validation_titles)
    return avg_train_rouge_score, avg_validation_rouge_score

# Store the generated summaries in the Kaggle-accepted format
def generate_submission_df(generated_test_titles):
    """Pair each test ID with its generated title.

    Args:
        generated_test_titles: Generated titles, one per row of `test_df`.

    Returns:
        A DataFrame with the columns 'ID' (taken from `test_df`) and 'titles'.
    """
    submission_columns = {
        'ID': test_df['ID'],
        'titles': generated_test_titles,
    }
    return pd.DataFrame(submission_columns)

## `Baseline: Test the Base Model`


In [None]:
# Load the pretrained seq2seq checkpoint, then move it to the selected device
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
base_model = base_model.to(device)

  return self.fget.__get__(instance, owner)()


In [None]:
# generate titles for train and validation
# generate_titles returns (train titles, validation titles) in that order;
# unpacking them the other way round scores each split against the other
# split's reference titles.
generated_train_titles, generated_validation_titles = generate_titles(
    base_model,
    N=200,
    batch_size=8
)

  4%|▍         | 1/25 [00:03<01:32,  3.85s/it]Your max_length is set to 128, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 128, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
  8%|▊         | 2/25 [00:07<01:21,  3.53s/it]Your max_length is set to 128, but your input_length is only 127. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 128, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wante

In [None]:
# Average ROUGE-L of the generated titles on each split
avg_train_rouge_score, avg_validation_rouge_score = calculate_rouge(
    generated_train_titles=generated_train_titles,
    generated_validation_titles=generated_validation_titles,
)

print(f'Average rouge score for train: {avg_train_rouge_score}')
print(f'Average rouge score for validation: {avg_validation_rouge_score}')

100%|██████████| 200/200 [00:00<00:00, 2656.56it/s]
100%|██████████| 200/200 [00:00<00:00, 2851.87it/s]

Average rouge score for train: 0.10547439239641263
Average rouge score for validation: 0.10519317043300289





## `2 PEFT with LORA`


In [None]:
# --- LoRA adapter settings ---
task_type = TaskType.SEQ_2_SEQ_LM
target_modules = ['k_proj', 'v_proj', 'q_proj']
lora_rank = 2
lora_alpha = 32
lora_dropout = 0.1

# --- Optimisation settings ---
learning_rate = 1e-3
weight_decay = 0.01
num_epochs = 50
batch_size = 8

# Output directory, named after the two sequence lengths and the LoRA rank
output_dir = f'mbart-mlsum-automatic-summarization/mt{max_text_length}-ms{max_title_length}-lora{lora_rank}/'

In [None]:
# LoRA configuration built from the hyperparameters defined above
peft_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
    task_type=task_type,
    inference_mode=False,
)

# Wrap the base model with the adapters, move it to the device, and report
# how many parameters are trainable
peft_model = get_peft_model(base_model, peft_config)
peft_model = peft_model.to(device)
peft_model.print_trainable_parameters()

trainable params: 442,368 || all params: 611,321,856 || trainable%: 0.07236253630689755


In [None]:
# Evaluate and checkpoint once per epoch, and reload the best checkpoint when training ends.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer over the LoRA-wrapped model, using the tokenized train/validation datasets
trainer = Trainer(
    model=peft_model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

- fine-tune the model


In [None]:
# trainer.train()

In [None]:
# last_checkpoint = # TODO: Set the last checkpoint manually

# with open(f'{output_dir}checkpoint-{last_checkpoint}/trainer_state.json') as f:
#     training_state = json.load(f)

# log_history = training_state['log_history']
# # plot the training loss and validation loss over the epochs
# epochs_loss = [(log['epoch'], log['loss']) for log in log_history if 'loss' in log]
# epochs_val_loss = [(log['epoch'], log['eval_loss']) for log in log_history if 'eval_loss' in log]

# plt.plot(*zip(*epochs_loss), label='Training Loss')
# plt.plot(*zip(*epochs_val_loss), label='Validation Loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()

## `3 Test Fine-Tuned Model`


In [None]:
# checkpoint
# TODO: Choose a checkpoint manually
checkpoint = 21410
path_checkpoint = f'{output_dir}checkpoint-{checkpoint}'

# Load the chosen checkpoint directory, then move the model to the selected device
checkpoint_model = AutoModelForSeq2SeqLM.from_pretrained(path_checkpoint)
checkpoint_model = checkpoint_model.to(device)

In [None]:
# generate titles for train and validation
# generate_titles returns (train titles, validation titles) in that order;
# unpacking them the other way round scores each split against the other
# split's reference titles.
generated_train_titles, generated_validation_titles = generate_titles(
    checkpoint_model,
    N=200,
    batch_size=8
)

In [None]:
# Average ROUGE-L of the fine-tuned model's titles on each split
avg_train_rouge_score, avg_validation_rouge_score = calculate_rouge(
    generated_train_titles=generated_train_titles,
    generated_validation_titles=generated_validation_titles,
)

print(f'Average rouge score for train: {avg_train_rouge_score}')
print(f'Average rouge score for validation: {avg_validation_rouge_score}')