<a href="https://colab.research.google.com/github/artin-k/avatCo/blob/main/SFT_Fine_Tuning_FLAN_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)
from datasets import load_dataset
import torch

# --- 1. بارگذاری مدل و مجموعه داده اصلی ---
# مدل درخواستی در تمرین
model_id = "google/flan-t5-small"
dataset_id = "knkarthick/dialogsum"

# بارگذاری توکنایزر
print(f"Loading tokenizer: {model_id}")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# بارگذاری مدل پایه
print(f"Loading model: {model_id}")
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# بارگذاری مجموعه داده DialogSum
print(f"Loading dataset: {dataset_id}")
dialogue_dataset = load_dataset(dataset_id)

# --- 2. تابع پیش‌پردازش و توکنایز کردن ---
prefix = "summarize: "
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 128

def preprocess_function(examples):
    # ساختن پرامپت با اضافه کردن پیشوند دستوری
    inputs = [prefix + dialogue for dialogue in examples["dialogue"]]

    # توکنایز کردن ورودی‌ها
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # توکنایز کردن خروجی‌های مطلوب (labels)
    # توجه: T5 از 'labels' برای خروجی دیکودر استفاده می‌کند
    labels = tokenizer(text_target=examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# اعمال تابع پیش‌پردازش بر روی کل مجموعه داده
print("Tokenizing the datasets...")
tokenized_datasets = dialogue_dataset.map(preprocess_function, batched=True)

# --- 3. آماده‌سازی داده‌ها برای آموزش ---
# تعریف Data Collator برای Seq2Seq (جایگزین کردن Padding در Labels با -100)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# انتخاب زیرمجموعه‌های کوچک برای تسریع تمرین (مطابق با درخواست اولیه)
train_subset = tokenized_datasets["train"].select(range(5000))
eval_subset = tokenized_datasets["validation"].select(range(500))

# --- 4. تنظیم آرگومان‌ها و Trainer ---
training_args = Seq2SeqTrainingArguments(
    output_dir="./sft_dialogue_summary_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir='./sft_dialogue_summary_logs',
    logging_steps=500,
    eval_strategy="epoch",  # ارزیابی در پایان هر اپوک
    save_strategy="epoch",        # ذخیره مدل در پایان هر اپوک
    load_best_model_at_end=True,  # بارگذاری بهترین مدل
    fp16=torch.cuda.is_available(), # استفاده از FP16 در صورت وجود GPU
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,     # استفاده از زیرمجموعه اعتبارسنجی
    processing_class=tokenizer,   # Updated: Use processing_class instead of tokenizer
    data_collator=data_collator,
)

# --- 5. اجرای آموزش ---
print("\n" + "="*50)
print("Starting SFT training (5000 samples for 3 epochs)...")
print("="*50 + "\n")
trainer.train()
print("\nTraining finished.")

# ذخیره مدل نهایی
trainer.save_model("./final_sft_model")

# --- 6. تست مدل (Inference) ---
print("\n" + "="*50)
print("Testing Fine-Tuned Model (Inference)")
print("="*50)

# انتخاب یک نمونه از مجموعه داده تست
test_sample = dialogue_dataset["test"][10]
dialogue_to_summarize = test_sample["dialogue"]
true_summary = test_sample["summary"]

# آماده‌سازی ورودی به فرمت دستوری
input_text = f"summarize: {dialogue_to_summarize}"

# مدیریت دستگاه (GPU/CPU)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# توکنایز کردن ورودی و انتقال به دستگاه
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
model.to(device)
input_ids = input_ids.to(device)

# تولید خلاصه
outputs = model.generate(
    input_ids,
    max_length=150,
    num_beams=4,
    early_stopping=True
)

# دیکود کردن خروجی
generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# چاپ نتایج
print(f"**دستگاه مورد استفاده:** {device.upper()}")
print(f"**گفتگو:**\n{dialogue_to_summarize}")
print("-" * 20)
print(f"**خلاصه تولید شده (SFT):**\n{generated_summary}")
print(f"**خلاصه واقعی (Ground Truth):**\n{true_summary}")
print("-" * 20)
print("The Fine-Tuned model is saved in './final_sft_model'")

Loading tokenizer: google/flan-t5-small


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Loading model: google/flan-t5-small


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loading dataset: knkarthick/dialogsum


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Tokenizing the datasets...


Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]


Starting SFT training (5000 samples for 3 epochs)...



Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,



Training finished.

Testing Fine-Tuned Model (Inference)
**دستگاه مورد استفاده:** CUDA
**گفتگو:**
#Person1#: Happy Birthday, this is for you, Brian.
#Person2#: I'm so happy you remember, please come in and enjoy the party. Everyone's here, I'm sure you have a good time.
#Person1#: Brian, may I have a pleasure to have a dance with you?
#Person2#: Ok.
#Person1#: This is really wonderful party.
#Person2#: Yes, you are always popular with everyone. and you look very pretty today.
#Person1#: Thanks, that's very kind of you to say. I hope my necklace goes with my dress, and they both make me look good I feel.
#Person2#: You look great, you are absolutely glowing.
#Person1#: Thanks, this is a fine party. We should have a drink together to celebrate your birthday
--------------------
**خلاصه تولید شده (SFT):**
#Person1#: Happy Birthday, Brian.
**خلاصه واقعی (Ground Truth):**
#Person1# attends Brian's birthday party. Brian thinks #Person1# looks great and charming.
--------------------
The Fin

In [None]:
!pip install sacrebleu
!pip install evaluate



Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [None]:
# Install rouge_score to fix the previous ImportError related to the ROUGE metric
#!pip install rouge_score
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [None]:
#i load the folder down here
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("./final_sft_model")
tokenizer = AutoTokenizer.from_pretrained("./final_sft_model")

#now i calculate the rouge
from datasets import load_dataset
import evaluate

# load the dataset test
dataset = load_dataset("knkarthick/dialogsum")
test_data = dataset["test"].select(range(100))  #get the refrences

# load the ROUGE metrics
rouge = evaluate.load("rouge")

# make the predications and comapre them with the refrences
predictions = []
references = []

for sample in test_data:
    dialogue = sample["dialogue"]
    ref_summary = sample["summary"]

    # tokenize the input
    inputs = tokenizer(dialogue, return_tensors="pt", truncation=True, max_length=512)

     #make the summary
    output_ids = model.generate(**inputs, max_length=64)
    pred_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    predictions.append(pred_summary)
    references.append(ref_summary)

# compute ROUGE
results = rouge.compute(predictions=predictions, references=references)
print("ROUGE results:", results)


ROUGE results: {'rouge1': np.float64(0.11712578306378237), 'rouge2': np.float64(0.01031833833302295), 'rougeL': np.float64(0.10415009369560158), 'rougeLsum': np.float64(0.10397536234982818)}


In [23]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import torch
import evaluate

# Load model
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# English input sentences
inputs_eng = [
    "The website loads faster after the update.",
    "Natural Language Processing is a fascinating field."
]

# True Persian translations
references_per = [
    ["پس از به‌روزرسانی، وب‌سایت سریع‌تر بارگذاری می‌شود."],
    ["پردازش زبان طبیعی یک حوزه شگفت‌انگیز است."]
]

predictions = []

for sentence in inputs_eng:
    tokenizer.src_lang = "en"
    encoded = tokenizer(sentence, return_tensors="pt").to(device)

    generated = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.get_lang_id("fa"),
        num_beams=5,
        no_repeat_ngram_size=3,
        max_length=128
    )

    translated = tokenizer.decode(generated[0], skip_special_tokens=True)
    predictions.append(translated)

print("Model Predictions:", predictions)

# BLEU
bleu = evaluate.load("sacrebleu")
result = bleu.compute(predictions=predictions, references=references_per)

print("BLEU Score:", result["score"])


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

Model Predictions: ['وبسایت بعد از به روز رسانی سریع تر می شود.', 'پردازش زبان طبیعی یک زمینه جذاب است.']
BLEU Score: 17.854007384848675


In [None]:
import os
os.listdir()


['.config', 'final_sft_model', 'sft_dialogue_summary_results', 'sample_data']