In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import torch

import spacy
import re
import nltk
import transformers

from rouge_score import rouge_scorer
# Single ROUGE-L scorer instance reused by the evaluation cells of this notebook.
# NOTE(review): rouge_score's default tokenizer appears to keep only [a-z0-9],
# which would split accented French words — confirm, and consider passing a
# custom tokenizer if the installed version supports it.
scorer = rouge_scorer.RougeScorer(rouge_types=['rougeL'])

In [2]:
# Read the three raw splits in one pass; the order of the paths matches the
# order of the names on the left-hand side.
train_df, test_df, validation_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test_text.csv',
        '../data/validation.csv',
    )
)

# `Fine-Tuning mBART MLSUM`


## `1` Preprocessing


In [3]:
# !pip install transformers datasets spacy
# !python -m spacy download fr_core_news_sm

In [3]:
# French spaCy pipeline, used by preprocess() for lemmatization.
nlp = spacy.load('fr_core_news_sm')

# Make sure the NLTK stopword corpus is available locally, then keep the
# French entries in a set so membership checks in preprocess() stay cheap.
nltk.download('stopwords')
stop_words = {*nltk.corpus.stopwords.words('french')}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dabereabasse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Patterns are compiled once here rather than rebuilt on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_NUMBER_RE = re.compile(r'\d+')


def preprocess(text):
    """Normalize a raw French text into a space-separated string of lemmas.

    Steps, in order: lowercase, remove URLs, remove HTML tags, remove
    punctuation, remove digit runs, drop tokens present in the module-level
    ``stop_words`` set, and lemmatize what is left with the module-level
    spaCy pipeline ``nlp``.

    Args:
        text: The raw text. Any non-string value (for example the NaN that
            pandas produces for an empty CSV cell) is treated as missing.

    Returns:
        str: The lemmas joined by single spaces, or ``''`` for missing input.
    """
    # Missing cells come back from read_csv as float NaN; calling .lower() on
    # them would raise, so short-circuit to an empty document instead.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_RE.sub(' ', text)
    # Tags and punctuation are replaced by a space, not deleted: deleting them
    # glues neighbouring words together ("l'homme" -> "lhomme",
    # "peut-être" -> "peutêtre", "mot<br>mot" -> "motmot"), which hides elided
    # articles from the stopword filter and creates tokens the lemmatizer
    # cannot recognise. The split/join below collapses the extra whitespace.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _NUMBER_RE.sub('', text)

    # Stopword removal (split() also normalizes whitespace).
    text = ' '.join(word for word in text.split() if word not in stop_words)

    # Lemmatization with spaCy.
    return ' '.join(token.lemma_ for token in nlp(text))

# Running preprocess() over every row is slow, so it is not executed here.
# The recipe that fills the 'preprocessed_text' column is kept for reference:
## train_df['preprocessed_text'] = train_df['text'].apply(preprocess)
## validation_df['preprocessed_text'] = validation_df['text'].apply(preprocess)

# Instead, reload the already-preprocessed splits from disk. This replaces the
# train_df / validation_df frames read from ../data earlier in the notebook.
train_df, validation_df = map(
    pd.read_csv,
    ('../recup/train_preprocessed.csv', '../recup/validation_preprocessed.csv'),
)

## `2` Pretrained mBART MLSUM model


In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import SummarizationPipeline

# Checkpoint id on the Hugging Face Hub (per its name, an mBART model tuned on
# MLSUM for summarization — see the model card for details).
model_name = 'lincoln/mbart-mlsum-automatic-summarization'

# Model weights and the matching tokenizer are both resolved from that id.
loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap both in a summarization pipeline so the evaluation cells can pass
# batches of raw strings directly.
summarizer = SummarizationPipeline(model=loaded_model, tokenizer=loaded_tokenizer)

### Pretrained mBART on raw text


In [8]:
# Generate titles for the first N validation texts with the pretrained model,
# then score them against the reference titles with ROUGE-L.
batch_size = 8
N = 100

# Slice to the first N rows *before* batching. Batching over the whole column
# lets the last batch run past N (13 batches of 8 = 104 generations for
# N = 100), which wastes generation time on rows that are never scored.
eval_rows = validation_df.iloc[:N]
source_texts = eval_rows['text'].tolist()
reference_titles = eval_rows['titles'].tolist()

validation_texts_titles = []
for start in tqdm(range(0, len(source_texts), batch_size)):
    # max length of the input text is 512 based on FirstAnalysis.ipynb
    # NOTE(review): this slice keeps 512 *characters*, not tokens — confirm
    # that is the limit FirstAnalysis.ipynb refers to.
    batch = [text[:512] for text in source_texts[start:start + batch_size]]
    titles = summarizer(batch, max_length=39, min_length=24, num_beams=4, length_penalty=2.0, early_stopping=True, no_repeat_ngram_size=3, top_k=50)
    validation_texts_titles.extend(title['summary_text'] for title in titles)

# ROUGE-L F-measure per example. References and generations are paired by
# position with zip(), so the result does not depend on the DataFrame index
# being 0..N-1. RougeScorer.score expects (target, prediction); F-measure is
# symmetric in the two, so the reported value is unaffected by the order.
text_mbart_rouge = [
    scorer.score(reference, generated)['rougeL'].fmeasure
    for reference, generated in zip(reference_titles, validation_texts_titles)
]

# Guard the mean against an empty validation set.
avg_rouge_score_text_mbart = (
    sum(text_mbart_rouge) / len(text_mbart_rouge) if text_mbart_rouge else float('nan')
)
print("Average Rouge-L F-Score with basic mBART:", avg_rouge_score_text_mbart)

100%|██████████| 13/13 [11:26<00:00, 52.83s/it]

Average Rouge-L F-Score with basic mBART: 0.19894247120905928





### Pretrained mBART on `preprocessed_text`


In [9]:
# Generate titles for the first N *preprocessed* validation texts with the
# pretrained model, then score them against the reference titles with ROUGE-L.
batch_size = 8
N = 100

# Slice to the first N rows *before* batching. Batching over the whole column
# lets the last batch run past N (13 batches of 8 = 104 generations for
# N = 100), which wastes generation time on rows that are never scored.
eval_rows = validation_df.iloc[:N]
source_texts = eval_rows['preprocessed_text'].tolist()
reference_titles = eval_rows['titles'].tolist()

validation_preprocessed_texts_titles = []
for start in tqdm(range(0, len(source_texts), batch_size)):
    # max length of the input text is 512 based on FirstAnalysis.ipynb
    # NOTE(review): this slice keeps 512 *characters*, not tokens — confirm
    # that is the limit FirstAnalysis.ipynb refers to.
    batch = [text[:512] for text in source_texts[start:start + batch_size]]
    titles = summarizer(batch, max_length=39, min_length=24, num_beams=4, length_penalty=2.0, early_stopping=True, no_repeat_ngram_size=3, top_k=50)
    validation_preprocessed_texts_titles.extend(title['summary_text'] for title in titles)

# ROUGE-L F-measure per example. References and generations are paired by
# position with zip(), so the result does not depend on the DataFrame index
# being 0..N-1. RougeScorer.score expects (target, prediction); F-measure is
# symmetric in the two, so the reported value is unaffected by the order.
preprocessed_mbart_rouge = [
    scorer.score(reference, generated)['rougeL'].fmeasure
    for reference, generated in zip(reference_titles, validation_preprocessed_texts_titles)
]

# Guard the mean against an empty validation set.
avg_rouge_score_preprocessed_mbart = (
    sum(preprocessed_mbart_rouge) / len(preprocessed_mbart_rouge)
    if preprocessed_mbart_rouge
    else float('nan')
)
print("Average Rouge-L F-Score with preprocessed mBART:", avg_rouge_score_preprocessed_mbart)

100%|██████████| 13/13 [10:42<00:00, 49.40s/it]

Average Rouge-L F-Score with preprocessed mBART: 0.12606814101652428



