In [None]:
# Install/upgrade the notebook's dependencies into the running kernel's environment
# (%pip targets the kernel, unlike a bare shell pip). rouge-score is needed by the
# `from rouge_score import rouge_scorer` import in the evaluation cell.
%pip install -q --upgrade datasets fsspec rouge-score

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver 

In [None]:
# Wipe the local Hugging Face datasets cache so the next load_dataset call
# starts from a clean cache directory.
!rm -rf ~/.cache/huggingface/datasets

In [None]:
import torch
from datasets import load_dataset
from transformers import BartTokenizer


# Pull CNN/DailyMail (config "3.0.0") and keep only a small leading slice of each
# split so the experiment stays quick: 1000 train / 200 validation / 100 test rows.
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")
train_dataset, val_dataset, test_dataset = (
    dataset[split_name].select(range(split_size))
    for split_name, split_size in (("train", 1000), ("validation", 200), ("test", 100))
)

# Pretrained BART tokenizer; it provides the sub-word encoding for articles and summaries.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

def preprocess_data(example):
    """Tokenize articles and their reference summaries to fixed-length id lists.

    Works both for a single example (``example['article']`` is a string) and for
    a batch as passed by ``Dataset.map(..., batched=True)`` (the values are lists
    of strings).

    Args:
        example: Mapping with an ``'article'`` and a ``'highlights'`` entry.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` (articles truncated/padded
        to 512 tokens) and ``labels`` (summaries truncated/padded to 150 tokens).
        Values are plain Python lists with one row per example when batched.
    """
    # Return the tokenizer's native lists instead of tensors: the previous
    # `return_tensors="pt"` + `.squeeze()` collapsed the batch axis whenever a
    # batch held exactly one example, corrupting the row layout.
    article_encoding = tokenizer(
        example["article"], max_length=512, truncation=True, padding="max_length"
    )
    summary_encoding = tokenizer(
        example["highlights"], max_length=150, truncation=True, padding="max_length"
    )
    return {
        "input_ids": article_encoding["input_ids"],
        "attention_mask": article_encoding["attention_mask"],
        "labels": summary_encoding["input_ids"],
    }


# Tokenize every split in batched mode, then expose only the model-facing
# columns as torch tensors.
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    """Embedding + (multi-layer) LSTM encoder over the source token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: LSTM hidden dimension.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout=0.1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # nn.LSTM only applies dropout between layers, so pass 0 for a single layer.
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Encode a batch of (right-padded) token id sequences.

        Args:
            input_ids: Long tensor of shape [batch_size, seq_len].
            attention_mask: Tensor of shape [batch_size, seq_len] with non-zero
                entries on real tokens and 0 on padding, or ``None`` to run the
                LSTM over every position. Padding is assumed to be at the end
                of each sequence.

        Returns:
            outputs: [batch_size, seq_len, hidden_size]; zero at padded positions
                when a mask is given.
            hidden, cell: [num_layers, batch_size, hidden_size] final LSTM state,
                taken at the last real token of each sequence when a mask is given.
        """
        embedded = self.dropout(self.embedding(input_ids))
        if attention_mask is None:
            outputs, (hidden, cell) = self.lstm(embedded)
            return outputs, hidden, cell

        # Pack by true length so the recurrent state stops at the last real token
        # instead of being pushed through all the padding positions.
        # Lengths must live on the CPU; clamp so an all-padding row does not raise.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to("cpu", dtype=torch.int64)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        packed_outputs, (hidden, cell) = self.lstm(packed)
        # total_length keeps the time axis aligned with attention_mask for the decoder.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outputs, batch_first=True, total_length=input_ids.size(1)
        )
        return outputs, hidden, cell

class Attention(nn.Module):
    """Additive (concat-style) attention over the encoder outputs.

    Each source position is scored as ``v . tanh(W [hidden; encoder_output])``;
    positions whose mask entry is 0 are suppressed before the softmax.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs, attention_mask):
        """Compute the context vector for one decoding step.

        Args:
            hidden: [batch_size, hidden_size] decoder state used as the query.
            encoder_outputs: [batch_size, seq_len, hidden_size].
            attention_mask: [batch_size, seq_len]; entries equal to 0 are masked out.

        Returns:
            context: [batch_size, hidden_size] weighted sum of encoder outputs.
            attn_weights: [batch_size, seq_len, 1] softmax weights over seq_len.
        """
        src_len = encoder_outputs.size(1)
        # Broadcast the query along the source axis so it can be paired with every position.
        query = hidden.unsqueeze(1).expand(-1, src_len, -1)
        features = torch.cat((query, encoder_outputs), dim=2)  # [batch_size, seq_len, 2 * hidden_size]
        scores = torch.matmul(torch.tanh(self.attn(features)), self.v)  # [batch_size, seq_len]
        # A very negative score drives the softmax weight of masked positions to zero.
        scores = scores.masked_fill(attention_mask == 0, -1e10)
        attn_weights = F.softmax(scores, dim=1).unsqueeze(2)
        context = (attn_weights * encoder_outputs).sum(dim=1)
        return context, attn_weights

class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    At every step the embedded input token is concatenated with an attention
    context vector, pushed through the LSTM, and projected to vocabulary logits.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        # nn.LSTM only applies dropout between stacked layers.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(
            embed_size + hidden_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_size)

    def forward(self, input_ids, hidden, cell, encoder_outputs, attention_mask):
        """Run one decoding step.

        Args:
            input_ids: [batch_size] ids of the tokens fed at this step.
            hidden, cell: [num_layers, batch_size, hidden_size] LSTM state.
            encoder_outputs: [batch_size, seq_len, hidden_size].
            attention_mask: [batch_size, seq_len] source mask forwarded to the attention.

        Returns:
            (logits [batch_size, vocab_size], hidden, cell, attn_weights)
        """
        token_embedding = self.dropout(self.embedding(input_ids.unsqueeze(1)))  # [batch_size, 1, embed_size]
        # The top LSTM layer's state serves as the attention query.
        query = hidden[-1]
        context, attn_weights = self.attention(query, encoder_outputs, attention_mask)
        step_input = torch.cat((token_embedding, context.unsqueeze(1)), dim=2)
        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        logits = self.fc(step_output.squeeze(1))
        return logits, hidden, cell, attn_weights

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper trained with teacher forcing."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_ids, input_mask, target_ids):
        """Produce logits for every target position after the first.

        The decoder is always fed the ground-truth previous token
        (``target_ids[:, t - 1]``) when predicting position ``t``.

        Args:
            input_ids: Source token ids, passed to the encoder.
            input_mask: Source mask, passed to the encoder and to each decoder step.
            target_ids: [batch_size, tgt_len] target token ids.

        Returns:
            Tensor of shape [batch_size, tgt_len - 1, vocab_size].
        """
        memory, state_h, state_c = self.encoder(input_ids, input_mask)

        step_logits = []
        for step in range(target_ids.size(1) - 1):
            prev_token = target_ids[:, step]
            logits, state_h, state_c, _ = self.decoder(
                prev_token, state_h, state_c, memory, input_mask
            )
            step_logits.append(logits)

        return torch.stack(step_logits, dim=1)

In [None]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm


# --- Hyperparameters -------------------------------------------------------
vocab_size = tokenizer.vocab_size
embed_size, hidden_size = 256, 512
num_layers, dropout = 2, 0.3

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Model -----------------------------------------------------------------
encoder = Encoder(vocab_size, embed_size, hidden_size, num_layers=num_layers, dropout=dropout)
decoder = Decoder(vocab_size, embed_size, hidden_size, num_layers=num_layers, dropout=dropout)
model = Seq2Seq(encoder, decoder).to(device)

# --- Optimisation ----------------------------------------------------------
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Padded target positions contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# --- Data loaders ----------------------------------------------------------
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)


# Teacher-forced training: the model consumes labels[:, :-1] token by token and
# its logits are scored against labels[:, 1:].
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")

    # Count steps explicitly: tqdm only syncs its `n` attribute when it redraws
    # the bar, so it is not a reliable divisor for the running average.
    for step, batch in enumerate(progress_bar, start=1):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        output = model(input_ids, attention_mask, labels)
        # Flatten to [batch * (tgt_len - 1), vocab] vs. [batch * (tgt_len - 1)].
        output = output.view(-1, vocab_size)
        labels = labels[:, 1:].contiguous().view(-1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call forces a device sync.
        batch_loss = loss.item()
        epoch_loss += batch_loss
        progress_bar.set_postfix({"batch_loss": batch_loss, "avg_loss": epoch_loss / step})

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_epoch_loss = epoch_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{num_epochs} completed, Average Loss: {avg_epoch_loss:.4f}")

Epoch 1/10:  17%|█▋        | 131/750 [02:27<11:24,  1.11s/batch, batch_loss=7.73, avg_loss=8.14]

In [None]:

import torch
import numpy as np
from rouge_score import rouge_scorer

def generate_summary(model, tokenizer, input_ids, attention_mask, max_length=150):
    """Greedily decode a summary for a single encoded article.

    Args:
        model: Module exposing ``encoder`` and ``decoder`` sub-modules.
        tokenizer: Provides ``bos_token_id``, ``eos_token_id`` and ``decode``.
        input_ids: Source token ids with a leading batch axis of size 1.
        attention_mask: Source mask matching ``input_ids``.
        max_length: Maximum number of tokens to generate after the BOS token.

    Returns:
        The decoded summary string with special tokens removed.
    """
    model.eval()

    # Run on whatever device the model lives on rather than relying on a
    # notebook-level global; fall back to the inputs' device for parameter-free models.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else input_ids.device
    input_ids = input_ids.to(run_device)
    attention_mask = attention_mask.to(run_device)

    generated = [tokenizer.bos_token_id]
    # The encoder pass is inference too: keep it inside no_grad so no autograd
    # graph is built over the whole source sequence.
    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(input_ids, attention_mask)
        for _ in range(max_length):
            input_token = torch.tensor([generated[-1]], dtype=torch.long, device=run_device)
            output, hidden, cell, _ = model.decoder(input_token, hidden, cell, encoder_outputs, attention_mask)
            next_token = output.argmax(dim=-1).item()
            generated.append(next_token)
            if next_token == tokenizer.eos_token_id:
                break

    return tokenizer.decode(generated, skip_special_tokens=True)
def evaluate_model(model, tokenizer, dataset):
    """Score generated summaries against the references with ROUGE.

    For every example a summary is generated with ``generate_summary`` and the
    reference is obtained by decoding ``example["labels"]``.

    Args:
        model: Seq2Seq model; moved to the notebook's ``device`` before use.
        tokenizer: Tokenizer used for generation and for decoding the references.
        dataset: Iterable of examples with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors (no batch axis).

    Returns:
        dict with the mean F-measure for ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    model = model.to(device)

    # Collect (reference, prediction) pairs first, then score them.
    pairs = []
    for sample in dataset:
        source_ids = sample["input_ids"].unsqueeze(0).to(device)
        source_mask = sample["attention_mask"].unsqueeze(0).to(device)
        prediction = generate_summary(model, tokenizer, source_ids, source_mask)
        reference = tokenizer.decode(sample["labels"], skip_special_tokens=True)
        pairs.append((reference, prediction))

    per_example = [scorer.score(reference, prediction) for reference, prediction in pairs]
    return {
        name: np.mean([result[name].fmeasure for result in per_example])
        for name in metric_names
    }

# Evaluation: compute mean ROUGE F-measures over the test subset.
rouge_scores = evaluate_model(model, tokenizer, test_dataset)
print(rouge_scores)

In [None]:
import os
# Create the folder the model will be saved into.
save_dir = "./bart-summarizer-final"
os.makedirs(save_dir, exist_ok=True)

# Save the model and the tokenizer.
# Only the state_dict of `model` is written (to model.pt); the tokenizer files
# go next to it in the same directory.
torch.save(model.state_dict(), os.path.join(save_dir, "model.pt"))
tokenizer.save_pretrained(save_dir)

{'id': '0054d6d30dbcad772e20b22771153a2a9cbeaf62',
 'article': '(CNN) -- An American woman died aboard a cruise ship that docked at Rio de Janeiro on Tuesday, the same ship on which 86 passengers previously fell ill, according to the state-run Brazilian news agency, Agencia Brasil. The American tourist died aboard the MS Veendam, owned by cruise operator Holland America. Federal Police told Agencia Brasil that forensic doctors were investigating her death. The ship's doctors told police that the woman was elderly and suffered from diabetes and hypertension, according the agency. The other passengers came down with diarrhea prior to her death during an earlier part of the trip, the ship's doctors said. The Veendam left New York 36 days ago for a South America tour.'
 'highlights': 'The elderly woman suffered from diabetes and hypertension, ship's doctors say .\nPreviously, 86 passengers had fallen ill on the ship, Agencia Brasil says .'}

In [None]:
def summarize_text(text, model, tokenizer, max_length=150):
    """Summarize raw article text with the trained model.

    The text is tokenized (truncated/padded to 512 tokens), moved to the
    notebook's ``device`` and handed to ``generate_summary``.

    Args:
        text: Article text to summarize.
        model: Trained Seq2Seq model.
        tokenizer: Tokenizer used for encoding the text and decoding the summary.
        max_length: Maximum number of tokens to generate.

    Returns:
        The generated summary string.
    """
    encoded = tokenizer(text, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    return generate_summary(
        model,
        tokenizer,
        encoded["input_ids"].to(device),
        encoded["attention_mask"].to(device),
        max_length=max_length,
    )

# Demo: summarize one hard-coded news article with the trained model and print the result.
article = "(CNN) -- An American woman died aboard a cruise ship that docked at Rio de Janeiro on Tuesday, the same ship on which 86 passengers previously fell ill, according to the state-run Brazilian news agency, Agencia Brasil. The American tourist died aboard the MS Veendam, owned by cruise operator Holland America. Federal Police told Agencia Brasil that forensic doctors were investigating her death. The ship's doctors told police that the woman was elderly and suffered from diabetes and hypertension, according the agency. The other passengers came down with diarrhea prior to her death during an earlier part of the trip, the ship's doctors said. The Veendam left New York 36 days ago for a South America tour."
summary = summarize_text(article, model, tokenizer)
print("Summary:", summary)