Afin de procéder au test de ce code, il convient préalablement de se munir d'un jeu de données composé de phrases FALC et complexes, formaté en xlsx, et d'adapter l'ensemble des chemins de fichier employés dans ce code en fonction de l'environnement et de l'emplacement du nouveau fichier dataset.xlsx.





In [None]:
!pip install --upgrade torch transformers
!pip install datasets
!pip install sentencepiece


In [None]:
import pandas as pd

# Read the xlsx file
data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/dataset.xlsx', usecols=['A', 'B'])
data = data[data['B'].str.count('\n') == 0]
# Save simple and complex sentences as separate text files
data['A'].to_csv('/content/drive/MyDrive/Colab Notebooks/model/simple_sentences.txt', index=False, header=None)
data['B'].to_csv('/content/drive/MyDrive/Colab Notebooks/model/complex_sentences.txt', index=False, header=None)

with open('/content/drive/MyDrive/Colab Notebooks/model/complex_sentences.txt', 'r') as f:
    complex_sentences = f.read().splitlines()

with open('/content/drive/MyDrive/Colab Notebooks/model/complex_sentences.txt', 'r') as file:
    text = file.read()

text = text.replace('"', '')

with open('/content/drive/MyDrive/Colab Notebooks/model/complex_sentences.txt', 'w') as file:
    file.write(text)


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")


In [None]:
from datasets import Dataset

def read_sentences(file_path):
    with open(file_path, "r") as f:
        sentences = [line.strip() for line in f.readlines()]
    return sentences

simple_sentences = read_sentences("/content/drive/MyDrive/Colab Notebooks/model/simple_sentences.txt")
complex_sentences = read_sentences("/content/drive/MyDrive/Colab Notebooks/model/complex_sentences.txt")

data_dict = {'simple': simple_sentences, 'complex': complex_sentences}
dataset = Dataset.from_dict(data_dict)

dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset['train']
val_dataset = dataset['test']


In [None]:
def preprocess(examples):
    sources = examples['complex']
    targets = examples['simple']

    source_tokenized = tokenizer(["simplifier: " + src for src in sources], max_length=128, padding="max_length", truncation=True, return_tensors="pt")
    target_tokenized = tokenizer(targets, max_length=128, padding="max_length", truncation=True, return_tensors="pt")

    return {
        "input_ids": source_tokenized["input_ids"],
        "attention_mask": source_tokenized["attention_mask"],
        "labels": target_tokenized["input_ids"]
    }

train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=["simple", "complex"])
val_dataset = val_dataset.map(preprocess, batched=True, remove_columns=["simple", "complex"])


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=12,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    logging_steps=100,
    learning_rate= 5e-5,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
trainer.train()
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/model/fine_tuned_t5_small")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/model/fine_tuned_t5_small")


Epoch,Training Loss,Validation Loss
1,0.1456,0.116753
2,0.1254,0.108213
3,0.1186,0.10321
4,0.111,0.100984
5,0.1045,0.099048
6,0.1025,0.097438
7,0.0992,0.096148
8,0.0964,0.095683
9,0.0917,0.095139
10,0.0899,0.094895


('/content/drive/MyDrive/Colab Notebooks/model/fine_tuned_t5_small/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/model/fine_tuned_t5_small/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/model/fine_tuned_t5_small/spiece.model',
 '/content/drive/MyDrive/Colab Notebooks/model/fine_tuned_t5_small/added_tokens.json')

**TEST**

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/Colab Notebooks/model/fine_tuned_t5_small")
tokenizer = T5Tokenizer.from_pretrained("/content/drive/MyDrive/Colab Notebooks/model/fine_tuned_t5_small")

input_text = "Le nouveau logiciel de gestion de projet que nous avons acheté est très complexe et difficile à utiliser, ce qui a entraîné de nombreux retards dans la réalisation de nos tâches."

input_ids = tokenizer.encode(input_text, return_tensors='pt')

summary_ids = model.generate(input_ids=input_ids,
                              num_beams=4,
                              max_length=60,
                              repetition_penalty=2.5,
                              length_penalty=1.0)

summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary_text)


Le nouveau logiciel de gestion du projet nous retarde beaucoup.
