In [1]:
pip install torch transformers datasets sentencepiece


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [47]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Charger et parser training.txt
def load_dataset_txt(path):
    inputs, targets = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            if '=' in line:
                inp, out = line.strip().split('=', 1)
                if inp.strip() == out.strip() or inp.strip().startswith("Gare"):
                    continue
                inputs.append(f"abbréviation de: {inp.strip()}")
                targets.append(out.strip())
    return Dataset.from_dict({"input": inputs, "output": targets})

# Tokenizer et modèle
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

dataset = load_dataset_txt("training.txt")

print(f"Nombre d'exemples: {len(dataset)}")

# Prétraitement
def preprocess(example):
    model_inputs = tokenizer(example["input"], max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(example["output"], max_length=32, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess)

# Entraînement
training_args = TrainingArguments(
    output_dir="./t5_abbr_model",
    per_device_train_batch_size=4,
    num_train_epochs=4,           # Réduit de 20 → 5
    logging_steps=10,
    save_steps=100,               # Sauvegarde plus fréquente
    save_total_limit=2,           # Ne garde que les 2 dernières sauvegardes
    fp16=torch.cuda.is_available(),
    learning_rate=0.0005
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()

# Sauvegarde
model.save_pretrained("./t5_abbr_model")
tokenizer.save_pretrained("./t5_abbr_model")

Nombre d'exemples: 377


Map: 100%|██████████| 377/377 [00:00<00:00, 3099.87 examples/s]
  trainer = Trainer(


Step,Training Loss
10,3.4223
20,0.8197
30,0.4333
40,0.2928
50,0.2273
60,0.2503
70,0.2075
80,0.1815
90,0.2079
100,0.1774




('./t5_abbr_model/tokenizer_config.json',
 './t5_abbr_model/special_tokens_map.json',
 './t5_abbr_model/spiece.model',
 './t5_abbr_model/added_tokens.json')

In [52]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained("./t5_abbr_model")
tokenizer = T5Tokenizer.from_pretrained("./t5_abbr_model")

def abbreviate(input_text: str) -> str:
    prompt = f"abbréviation de: {input_text}"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    output_ids = model.generate(
        input_ids,
        max_length=32,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Exemple :
print(abbreviate("Gare Montparnasse"))
print(abbreviate("Gare de Lyon"))
print(abbreviate("Gare de l'est"))
print(abbreviate("Gare Saint-Lazare"))
print(abbreviate("Gare du Nord"))
print(abbreviate("Villejuif-Paul Vaillant Couturier (Hôpital Paul Brousse)"))
print(abbreviate("Saint-Quentin - Gare Routière des Prés (Quai A)"))
print(abbreviate("Rue du Puits / Le Plessis Chenet"))
print(abbreviate("Bibliothèque François Mitterrand"))
print(abbreviate("Neuilly-Porte-Maillot"))
print(abbreviate("Aéroport Charles de Gaulle 2 - (Terminal 2)"))
print(abbreviate("Neuilly-Plaisance Université"))
print(abbreviate("Boulevard des grands prés"))

Gare Montp.
Gare de Lyon
Gare de l'est
Gare St-Lazare
Gare du Nord
Villejuif-Paul V. Couturier (H. Paul Brousse)
St-Quentin - Gare Routière des Prés (Quai A)
R. du Puits / Le Plessis Chenet
Bibliothèque François Mitterrand
Neuilly-Pte-Maillot
C. de Gaulle 2 - (T2)
Neuilly-Pl.
B. des grands prés
