### **BART-Model**

Define a small BART model from a custom configuration using the Hugging Face Transformers library.

In [1]:
from transformers import BartConfig, BartForConditionalGeneration, AutoTokenizer

In [2]:
# Load the tokenizer published with the "facebook/bart-base" checkpoint; only the
# tokenizer is fetched here, the model itself is built from a custom config below.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Configuration of a deliberately small BART. The vocabulary size and the
# special-token ids are read from the tokenizer so the two stay consistent.
config = BartConfig(
    # --- vocabulary / special tokens ---
    vocab_size=tokenizer.vocab_size,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # --- width and positions (256 was noted as an alternative width) ---
    d_model=128,
    max_position_embeddings=512,
    scale_embedding=True,
    # --- encoder stack ---
    encoder_layers=2,
    encoder_attention_heads=4,
    encoder_ffn_dim=1024,
    # --- decoder stack ---
    decoder_layers=2,
    decoder_attention_heads=4,
    decoder_ffn_dim=1024,
    # --- regularisation, activation, decoding cache ---
    dropout=0.1,
    activation_function="gelu",
    use_cache=True,
)

In [6]:
# Build the model directly from the config object; no pretrained weights are
# loaded here (from_pretrained is not used for the model).
model = BartForConditionalGeneration(config)

### **بررسی مدل**

In [7]:
# Rebind `config` to the configuration object carried by the model instance,
# so the summary below reports what the model actually holds.
config = model.config

In [8]:
# Print a short summary of the model configuration.
print(f"مدل: {config.model_type}")
print(f"تعداد لایه های انکودر: {config.encoder_layers}")
print(f"تعداد لایه های دیکودر: {config.decoder_layers}")
print(f"ابعاد مخفی ( Hidden Size ): {config.d_model}")
print(f"تعداد توجه ( Attention Heads ): {config.encoder_attention_heads}")
print(f"اندازه واژگان ( Vocab Size ): {config.vocab_size}")
print(f"حداکثر طول توکن ( Max Position Embedding ): {config.max_position_embeddings}")
# The value printed here is `use_cache`, so the label now says so; it was
# previously labelled as "Cross Attention", which this flag does not describe.
print(f"استفاده از کش ( Use Cache ): {config.use_cache}")

مدل: bart
تعداد لایه های انکودر: 2
تعداد لایه های دیکودر: 2
ابعاد مخفی ( Hidden Size ): 128
تعداد توجه ( Attention Heads ): 4
اندازه واژگان ( Vocab Size ): 50265
حداکثر طول توکن ( Max Position Embedding ): 512
استفاده از توجه متقاطع (Cross Attention): True


### **train**

In [1]:
!pip install evaluate rouge_score

In [10]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
import evaluate

In [11]:
# Load the CNN/DailyMail dataset, configuration "3.0.0". The result is indexed
# below by split name ("train", "validation", "test").
dataset = load_dataset("cnn_dailymail", "3.0.0")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [28]:
# Keep only the first N examples of each split so the whole run stays short.
N_TRAIN, N_VAL, N_TEST = 8000, 2700, 3000

train_dataset = dataset["train"].select(range(N_TRAIN))
val_dataset = dataset["validation"].select(range(N_VAL))
test_dataset = dataset["test"].select(range(N_TEST))

In [29]:
# 3. Data preprocessing
# Token-length caps used by preprocess(): the first applies to the articles,
# the second to the highlight summaries.
max_input_length = 512
max_target_length = 128

def preprocess(examples):
    """Tokenize a batch of articles and attach the tokenized highlights as labels.

    Args:
        examples: Batch mapping with an "article" and a "highlights" entry.

    Returns:
        The tokenizer output for the articles (truncated to
        ``max_input_length``) with an extra "labels" entry holding the
        ``input_ids`` of the highlights (truncated to ``max_target_length``).
    """
    batch = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True,
    )

    # The summaries go through the same tokenizer; only their ids are kept.
    summary_encoding = tokenizer(
        examples["highlights"],
        max_length=max_target_length,
        truncation=True,
    )
    batch["labels"] = summary_encoding["input_ids"]
    return batch

# Raw text columns are dropped after tokenization so that only the tokenizer
# output and the labels remain in the mapped datasets.
RAW_COLUMNS = ["article", "highlights", "id"]

# NOTE: the spelling "tokenized_tarin_data" (sic) is kept because the trainer
# cell refers to this exact name.
tokenized_tarin_data = train_dataset.map(preprocess, batched=True, remove_columns=RAW_COLUMNS)

tokenized_val_data = val_dataset.map(preprocess, batched=True, remove_columns=RAW_COLUMNS)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [30]:
# 4. Data collator
# Builds the batches handed to the trainer from the tokenized examples; both the
# tokenizer and the model are passed in.
# NOTE(review): presumably this pads inputs/labels per batch — confirm against the library docs.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [31]:
# 5. Metric (ROUGE)
# Loaded once through the `evaluate` library and reused by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores for generated summaries against reference summaries.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is itself a tuple, its first element is used.

    Returns:
        dict: Metric name -> score rounded to 4 decimal places, taken from
        ``rouge.compute``.
    """
    import numpy as np  # local import: no other cell of this notebook imports numpy

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a padding/ignore marker rather than a vocabulary id, so it must be
    # swapped for the pad token before decoding. This is done for predictions as
    # well as labels, since padded prediction arrays can carry it too.
    predictions = np.asarray(predictions)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: round(v, 4) for k, v in result.items()}

In [32]:
# 6. Training configuration
import torch  # imported in this cell only to detect whether a CUDA GPU is present

training_args = Seq2SeqTrainingArguments(
    output_dir="./bart-model",
    eval_strategy="epoch",            # evaluate once at the end of every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=1,
    predict_with_generate=True,       # important for Seq2Seq: metrics are computed on generated text
    # Mixed precision is faster on a GPU, but requesting it without one is
    # rejected by the library, so it is enabled only when CUDA is available.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    logging_steps=100,
    report_to="tensorboard",
)

In [33]:
# 7. Trainer
# Ties together the model, the training arguments, the tokenized train and
# validation sets, the collator and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tarin_data,
    eval_dataset=tokenized_val_data,
    # `tokenizer=` is deprecated on recent Trainer versions (the call emitted a
    # warning); `processing_class=` is the replacement keyword.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [34]:
# Run training with the arguments configured above (one epoch, evaluation at
# the end of the epoch).
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,6.9184,7.207514,0.003,0.0,0.003,0.003


TrainOutput(global_step=4000, training_loss=7.046169708251953, metrics={'train_runtime': 228.1996, 'train_samples_per_second': 35.057, 'train_steps_per_second': 17.529, 'total_flos': 35198458730496.0, 'train_loss': 7.046169708251953, 'epoch': 1.0})

In [35]:
# Quick qualitative check: generate a summary for one hand-written sentence.
text = "BART is a denoising autoencoder for sequence-to-sequence pretraining."
enc = tokenizer([text], return_tensors='pt', padding=True, truncation=True, max_length=128)

# Move input tensors to the same device as the model
device = model.device
enc = {k: v.to(device) for k, v in enc.items()}

# Switch to inference mode explicitly so dropout is off regardless of the mode
# the model was left in by earlier cells; otherwise the output could vary.
model.eval()

out_ids = model.generate(**enc, max_length=40, num_beams=4)
print(tokenizer.decode(out_ids[0], skip_special_tokens=True))

NEWNEWNEW::::::::::::::::::::::::::::::::::


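نتایج ارزیابی روی داده‌های اعتبارسنجی در حین آموزش و خروجی تست بالا نشان می‌دهد که مدل عملاً چیز مفیدی یاد نگرفته است. دلیل آن، دادهٔ بسیار کم آموزش است: مدل از صفر (بدون وزن‌های از پیش آموزش‌دیده) و فقط روی ۸۰۰۰ نمونه در یک دوره (epoch) آموزش دیده است. می‌توانید حجم دادهٔ آموزش را به دلخواه افزایش دهید.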

### **evaluate the model**

In [36]:
# Tokenize the test subset with the same preprocess() used for the training
# and validation subsets, dropping the raw text columns afterwards.
test_columns_to_drop = ["article", "highlights", "id"]
tokenized_test_data = test_dataset.map(
    preprocess, batched=True, remove_columns=test_columns_to_drop
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [37]:
# 8. Model evaluation
# Evaluate the trained model on the tokenized test subset; the returned dict
# (loss and ROUGE scores) is displayed as the cell output.
trainer.evaluate(tokenized_test_data)

{'eval_loss': 7.223298072814941,
 'eval_rouge1': 0.0035,
 'eval_rouge2': 0.0,
 'eval_rougeL': 0.0035,
 'eval_rougeLsum': 0.0035,
 'eval_runtime': 183.9858,
 'eval_samples_per_second': 16.306,
 'eval_steps_per_second': 8.153,
 'epoch': 1.0}

این ارزیابی نشان می‌دهد که مدل به‌خوبی آموزش ندیده است و به داده‌های بسیار بیشتری نیاز دارد.