In [1]:
# Mount Google Drive at /content/drive; the CSV folder used by this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install rouge-score



In [3]:
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from rouge_score import rouge_scorer
import torch

In [4]:
# Read the train / validation / test CSV files from the challenge data folder on Drive.
path = '/content/drive/My Drive/Colab Notebooks/11_DataAI_INF582_TextMining_NLP/Challenge/data/'
train_df, validation_df, test_df = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'validation.csv', 'test_text.csv')
)

In [5]:
# Instantiate the tokenizer and the seq2seq model from the same pretrained checkpoint.
# NOTE(review): the sample rows displayed from train_df are French; confirm that this
# checkpoint and its tokenizer are an appropriate choice for French text.
tokenizer, model = (
    hf_class.from_pretrained('facebook/bart-large')
    for hf_class in (BartTokenizer, BartForConditionalGeneration)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Show the last five rows of the training frame (columns: text, titles).
train_df.tail(5)

Unnamed: 0,text,titles
21396,"Une ""main tendue"" que refuse de saisir l'eurod...",POLITIQUE. La présidente du Front national a s...
21397,"Venue présenter son dernier spectacle, Elektri...",Que serait un dimanche matin sans un passage e...
21398,Mauvaise nouvelle pour les amateurs de chasse ...,"La chasse à la glu sera interdite cette année,..."
21399,"""Ce n'est pas une question de simple voile, c'...","Bruno Le Maire, candidat à la primaire de la d..."
21400,C'est le montant total encaissé par l'État l'a...,"En 2019, les sommes recouvrées par l'État au t..."


In [7]:
# Pull the article bodies (model inputs) and their reference titles (targets) out of the frame.
train_texts, train_titles = (train_df[column].tolist() for column in ('text', 'titles'))

# Tokenize both lists with identical settings: truncation enabled, padding enabled,
# and PyTorch tensors as the return type.
train_encodings, train_labels = (
    tokenizer(strings, truncation=True, padding=True, return_tensors="pt")
    for strings in (train_texts, train_titles)
)

In [12]:
# Display the tokenized training inputs (input_ids and attention_mask tensors).
train_encodings

{'input_ids': tensor([[    0, 11329,   906,  ...,     1,     1,     1],
        [    0,   347,   108,  ...,     1,     1,     1],
        [    0, 10766, 10969,  ...,  5739,  8009,     2],
        ...,
        [    0,   448,  1180,  ...,     1,     1,     1],
        [    0,   113,   347,  ...,     1,     1,     1],
        [    0,   347,   108,  ...,  6534,   784,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [13]:
# Display the tokenized training titles (input_ids and attention_mask tensors).
train_labels

{'input_ids': tensor([[    0,   574,   108,  ...,     1,     1,     1],
        [    0, 10350,   475,  ...,     1,     1,     1],
        [    0, 16040,  2760,  ...,     1,     1,     1],
        ...,
        [    0, 10766,  1855,  ...,     1,     1,     1],
        [    0,   387,  2962,  ...,     1,     1,     1],
        [    0, 16040,   954,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [8]:
# Pull the validation article bodies and their reference titles out of the frame.
validation_texts, validation_titles = (
    validation_df[column].tolist() for column in ('text', 'titles')
)

# Tokenize both lists with identical settings: truncation enabled, padding enabled,
# and PyTorch tensors as the return type.
validation_encodings, validation_labels = (
    tokenizer(strings, truncation=True, padding=True, return_tensors="pt")
    for strings in (validation_texts, validation_titles)
)

In [16]:
# Display the tokenized validation inputs (input_ids and attention_mask tensors).
validation_encodings

{'input_ids': tensor([[    0, 27526,  7427,  ...,     1,     1,     1],
        [    0, 10766,   748,  ...,     1,     1,     1],
        [    0, 29774,   257,  ...,     1,     1,     1],
        ...,
        [    0,   113,   510,  ...,     1,     1,     1],
        [    0, 41872,  8025,  ...,  1140,  7085,     2],
        [    0,   347,   108,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [17]:
# Display the tokenized validation titles (input_ids and attention_mask tensors).
validation_labels

{'input_ids': tensor([[    0, 10350,   741,  ...,     1,     1,     1],
        [    0, 10350,  2242,  ...,     1,     1,     1],
        [    0,  3849,  7471,  ...,     1,     1,     1],
        ...,
        [    0, 45344,   811,  ...,     1,     1,     1],
        [    0, 13365,   979,  ...,     1,     1,     1],
        [    0,   347,   108,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [28]:
# !pip install "accelerate>=0.21.0"  # quoted so the shell does not treat ">" as an output redirect

In [None]:
# # Fine-tune model
# training_args = TrainingArguments(
#     output_dir='./output_dir',
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     logging_dir='./logs',
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_encodings,
#     eval_dataset=validation_encodings,
# )

# trainer.train()

In [13]:
# List the fields produced by the tokenizer for the training and validation inputs, one per line.
print(train_encodings.keys(), validation_encodings.keys(), sep="\n")

dict_keys(['input_ids', 'attention_mask'])
dict_keys(['input_ids', 'attention_mask'])


In [17]:
# Print the tokenized training titles, then the tokenized validation titles.
print(train_labels, validation_labels, sep="\n")

{'input_ids': tensor([[    0,   574,   108,  ...,     1,     1,     1],
        [    0, 10350,   475,  ...,     1,     1,     1],
        [    0, 16040,  2760,  ...,     1,     1,     1],
        ...,
        [    0, 10766,  1855,  ...,     1,     1,     1],
        [    0,   387,  2962,  ...,     1,     1,     1],
        [    0, 16040,   954,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
{'input_ids': tensor([[    0, 10350,   741,  ...,     1,     1,     1],
        [    0, 10350,  2242,  ...,     1,     1,     1],
        [    0,  3849,  7471,  ...,     1,     1,     1],
        ...,
        [    0, 45344,   811,  ...,     1,     1,     1],
        [    0, 13365,   979,  ...,     1,     1,     1],
        [    0,   347,   108,  ...,     1,     1,     1]]), 'attentio

In [21]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets

In [23]:
from datasets import Dataset


def _labels_ignoring_padding(label_ids, pad_token_id):
    """Return a copy of `label_ids` with every padding position replaced by -100.

    The tokenized titles are padded with the tokenizer's pad token. If those ids
    are passed to the model as labels unchanged, the training loss is also
    computed on the padding. The model's loss ignores positions equal to -100,
    so padding is rewritten to that value. The input tensor is not modified.
    """
    return label_ids.masked_fill(label_ids == pad_token_id, -100)


# Training set: encoder inputs + attention mask from the articles, labels from the titles.
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": _labels_ignoring_padding(train_labels["input_ids"], tokenizer.pad_token_id),
})

# Validation set, built the same way.
eval_dataset = Dataset.from_dict({
    "input_ids": validation_encodings["input_ids"],
    "attention_mask": validation_encodings["attention_mask"],
    "labels": _labels_ignoring_padding(validation_labels["input_ids"], tokenizer.pad_token_id),
})

In [None]:
from tqdm import tqdm  # no longer used in this cell; kept so other cells that expect `tqdm` in scope still work

# Fine-tuning configuration: 3 epochs, batch size 4 per device for both training and evaluation.
training_args = TrainingArguments(
    output_dir='./output_dir',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model with a single call. `trainer.train()` already runs all
# `num_train_epochs` epochs, so calling it inside a per-epoch loop would repeat
# the full schedule once per iteration (3 x 3 epochs, each restart with a fresh
# optimizer and learning-rate schedule).
trainer.train()

Training:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# from datasets import Dataset

# train_dataset = Dataset.from_dict({"input_ids": train_encodings["input_ids"],
#                                    "attention_mask": train_encodings["attention_mask"],
#                                    "labels": train_labels["input_ids"]})

# eval_dataset = Dataset.from_dict({"input_ids": validation_encodings["input_ids"],
#                                   "attention_mask": validation_encodings["attention_mask"],
#                                   "labels": validation_labels["input_ids"]})

# # Create Trainer instance
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
# )

# # Train model
# trainer.train()

In [None]:
# Generate predictions for test data
test_texts = test_df['text'].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")

# Generate in small batches on the model's own device. Sending the whole test set
# through a single `generate` call can exhaust memory, and the tokenizer returns CPU
# tensors while the model may have been moved to an accelerator during training.
generation_batch_size = 8
model.eval()  # disable dropout so generation does not run in training mode

outputs = []  # one tensor of generated token ids per batch
for batch_start in range(0, test_encodings.input_ids.shape[0], generation_batch_size):
    batch_stop = batch_start + generation_batch_size
    generated_ids = model.generate(
        input_ids=test_encodings.input_ids[batch_start:batch_stop].to(model.device),
        attention_mask=test_encodings.attention_mask[batch_start:batch_stop].to(model.device),
    )
    outputs.append(generated_ids.cpu())

# Decode batch by batch; the order of `predicted_titles` matches the order of `test_texts`.
predicted_titles = [
    title
    for batch_ids in outputs
    for title in tokenizer.batch_decode(batch_ids, skip_special_tokens=True)
]

In [None]:
# Save predictions
test_df['predicted_title'] = predicted_titles
test_df.to_csv(path + 'test_predicted.csv', index=False)

In [None]:
# Evaluate model using ROUGE metric
scorer = rouge_scorer.RougeScorer(['rougeL'])

# Generate validation titles in small batches on the model's own device: the
# tokenizer returned CPU tensors, and one `generate` call over the whole
# validation set can exhaust memory.
eval_generation_batch_size = 8
model.eval()  # disable dropout so generation does not run in training mode

validation_predictions = []  # one tensor of generated token ids per batch
predicted_validation_titles = []
for batch_start in range(0, validation_encodings.input_ids.shape[0], eval_generation_batch_size):
    batch_stop = batch_start + eval_generation_batch_size
    generated_ids = model.generate(
        input_ids=validation_encodings.input_ids[batch_start:batch_stop].to(model.device),
        attention_mask=validation_encodings.attention_mask[batch_start:batch_stop].to(model.device),
    )
    validation_predictions.append(generated_ids.cpu())
    predicted_validation_titles.extend(
        tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    )

# The reference column is named `titles` (not `title`). `RougeScorer.score` compares
# one target string with one prediction string, so score each pair and average.
reference_titles = validation_df['titles'].tolist()
pair_scores = [
    scorer.score(reference, prediction)['rougeL']
    for reference, prediction in zip(reference_titles, predicted_validation_titles)
]
if not pair_scores:
    raise ValueError("No validation examples to score.")

# Mean precision / recall / F-measure of ROUGE-L over the validation set.
scores = {
    'rougeL': {
        field: sum(getattr(pair_score, field) for pair_score in pair_scores) / len(pair_scores)
        for field in ('precision', 'recall', 'fmeasure')
    }
}

print("ROUGE scores:", scores)