## **Summarization**

In [1]:
!pip install evaluate rouge_score

In [None]:
import evaluate
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

In [None]:
# 1. Dataset
# Loads the "3.0.0" configuration of cnn_dailymail; later cells read its
# 'train', 'validation' and 'test' splits.
dataset = load_dataset("cnn_dailymail", "3.0.0")

# 2. Model and tokenizer
# Both are loaded from the same checkpoint name so they stay in sync.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
# Keep only the first N rows of each split (1000 train / 250 validation / 300 test).
train_dataset = dataset['train'].select(range(1000))
# Fixed: the original read `datset[...]`, which raises NameError on a fresh kernel.
val_dataset = dataset['validation'].select(range(250))
test_dataset = dataset['test'].select(range(300))

In [None]:
# 3. Data preprocessing
# Token limits used together with truncation=True inside preprocess().
max_input_length = 512   # cap for the tokenized articles (model inputs)
max_target_length = 128  # cap for the tokenized highlights (labels)

In [None]:
def preprocess(examples):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        examples: A batch providing the "article" and "highlights" columns
            (the function is applied with ``batched=True``).

    Returns:
        The tokenizer output for the articles, with the token ids of the
        summaries attached under the "labels" key.
    """
    articles, summaries = examples["article"], examples["highlights"]

    # Source side: articles truncated to the input limit.
    encoded = tokenizer(
        articles,
        max_length=max_input_length,
        truncation=True,
    )

    # Target side: summaries truncated to the (shorter) target limit.
    summary_encoding = tokenizer(
        summaries,
        max_length=max_target_length,
        truncation=True,
    )

    encoded["labels"] = summary_encoding["input_ids"]
    return encoded

In [None]:
# Both splits are tokenized the same way: batched calls to preprocess(), with
# the raw text/id columns dropped so only the tokenizer features remain.
_map_options = dict(
    batched=True,
    remove_columns=["article", "highlights", "id"],
)

# NOTE: the `tarin` spelling is kept because later cells refer to this name.
tokenized_tarin_data = train_dataset.map(preprocess, **_map_options)

tokenized_val_data = val_dataset.map(preprocess, **_map_options)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Sanity check: display the tokenized training set (its features and row count).
tokenized_tarin_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [None]:
# 4. Data collator
# Built from the tokenizer and the model; it is handed to the trainer below as
# `data_collator` (see the DataCollatorForSeq2Seq docs for its batching rules).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# 5. Metric (ROUGE)
# Loaded once here; compute_metrics() calls rouge.compute() on the decoded texts.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode generated token ids and reference labels, then score them with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id sequences.

    Returns:
        dict: Every score returned by ``rouge.compute``, rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some setups return a tuple whose first element holds the token ids;
    # only those can be decoded.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    def _restore_padding(sequences):
        # -100 is the "ignore" marker used for labels (and the Trainer can also
        # use it to pad gathered predictions). It is not a real vocabulary id,
        # so it has to be swapped for the pad token before decoding.
        return [[(tok if tok != -100 else pad_id) for tok in seq] for seq in sequences]

    decoded_preds = tokenizer.batch_decode(
        _restore_padding(predictions), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _restore_padding(labels), skip_special_tokens=True
    )

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# 6. Training configuration
training_args = Seq2SeqTrainingArguments(
    output_dir="./summarization-model",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=1,
    # Important for Seq2Seq: compute_metrics() decodes generated token ids.
    predict_with_generate=True,
    # Mixed precision is faster on GPU, but requesting it without CUDA raises
    # an error, so enable it only when a CUDA device is actually available.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    logging_steps=100,
    report_to="tensorboard",
)

In [None]:
# 7. Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tarin_data,
    eval_dataset=tokenized_val_data,
    # `tokenizer=` is deprecated on the Trainer (it triggers a FutureWarning);
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [None]:
# 8. Start training
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.482,1.86236,0.3354,0.1478,0.2426,0.3122




TrainOutput(global_step=500, training_loss=1.6106011047363282, metrics={'train_runtime': 431.4384, 'train_samples_per_second': 2.318, 'train_steps_per_second': 1.159, 'total_flos': 1068958206001152.0, 'train_loss': 1.6106011047363282, 'epoch': 1.0})

#### **Rouge**

این معیار برای ارزیابی کیفیت خلاصه‌سازی استفاده میشه.
این معیار چند نسخه داره:

ROUGE-1: مقایسهٔ unigramها (تک‌کلمه‌ای‌ها)

ROUGE-2: مقایسهٔ bigramها (دوکلمه‌ای‌ها)

ROUGE-L: طولانی‌ترین زیر‌دنباله مشترک (LCS)

__این معیار فقط شباهت سطحی یا واژه ای رو می‌سنجه، نه کیفیت معنایی__

▶ مقادیر معمول برای معیارهای بالا در مقالات پژوهشی (به درصد؛ مثلاً 40 معادل 0.40 در خروجی آموزش بالا):

ROUGE-1 : بین 40 تا 45

ROUGE-2 : بین 15 تا 22

ROUGE-L : بین 35 تا 40

__برای اینکه نتایج حاصل از فاین تیون بهتر شود داده آموزش و تعداد ایپاک آموزشی را بیشتر کنید__


In [None]:
# 9. Test on a single example
sample = test_dataset[0]["article"]

# Make sure dropout is disabled while generating.
model.eval()

# Tokenize with the same input limit used for training and move the tensors to
# the model's device: after training the model may live on the GPU, and
# CPU inputs would then fail with a device-mismatch error.
inputs = tokenizer(
    sample,
    return_tensors="pt",
    truncation=True,
    max_length=max_input_length,
).to(model.device)

# Unpacking `inputs` forwards the attention mask together with the input ids.
summary_ids = model.generate(**inputs, max_length=max_target_length, num_beams=4)
print("Predicted summary:", tokenizer.decode(summary_ids[0], skip_special_tokens=True))