In [1]:
!pip install transformers datasets torch accelerate
!pip install transformers[torch] --upgrade
!pip install accelerate -U
!pip show accelerate

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━

In [2]:
!pip install transformers datasets torch

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset



In [3]:
dataset = load_dataset('cnn_dailymail', '3.0.0')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [4]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')  # Load the T5 model for conditional generation (e.g., summarization) from the pre-trained 't5-base' model
tokenizer = T5Tokenizer.from_pretrained('t5-base')  # Load the T5 tokenizer that corresponds to the 't5-base' model

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def preprocess_data(batch):
    inputs = ["summarize: " + article for article in batch['article']]  # Prefix each article with 'summarize: ' to prepare data for summarization
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")  # Tokenize inputs with a maximum length of 512, adding padding and converting to PyTorch tensors

    # Prepare the labels
    with tokenizer.as_target_tokenizer():  # Use tokenizer's target mode to properly handle summarization targets
        labels = tokenizer(batch['highlights'], max_length=150, truncation=True, padding="max_length", return_tensors="pt")  # Tokenize the summary labels with a max length of 150

    # Adjust labels to ignore padding in the loss calculation by replacing pad token id with -100
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels["input_ids"]
    ]

    model_inputs["labels"] = torch.tensor(labels["input_ids"])  # Add labels to model inputs and convert them to a PyTorch tensor
    return model_inputs

In [6]:
train_dataset = dataset['train'].select(range(10000))  # Using only 10,000 samples for training
train_dataset = train_dataset.map(preprocess_data, batched=True, batch_size=16)  # Adjust batch_size as per your GPU memory


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



In [7]:
from transformers import Trainer, TrainingArguments

# Initialize TrainingArguments with specified training parameters
training_args = TrainingArguments(
    output_dir='./results',  # Directory where the model outputs will be saved
    num_train_epochs=3,  # Total number of training epochs
    per_device_train_batch_size=8,  # Batch size per device during training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Weight decay to prevent overfitting
    logging_dir='./logs',  # Directory where logs will be saved
    logging_steps=10,  # Log training loss every 10 steps
)

# Initialize the Trainer with the model, training arguments, and datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None  # Optional: Include evaluation dataset if available
)

# Begin the training process
trainer.train()


Step,Training Loss
10,1.9758
20,1.919
30,2.0684
40,2.0418
50,1.9208
60,2.0302
70,1.9416
80,1.9593
90,1.8957
100,1.8002


Step,Training Loss
10,1.9758
20,1.919
30,2.0684
40,2.0418
50,1.9208
60,2.0302
70,1.9416
80,1.9593
90,1.8957
100,1.8002


TrainOutput(global_step=3750, training_loss=1.5269989247639975, metrics={'train_runtime': 5150.3547, 'train_samples_per_second': 5.825, 'train_steps_per_second': 0.728, 'total_flos': 1.82687367168e+16, 'train_loss': 1.5269989247639975, 'epoch': 3.0})

In [16]:
input_text = """summarize: " + "In an interview with The Associated Press on April 24, a senior leader within Hamas' Politburo made remarks that were intriguing to some but enraging to others. Khalil al-Hayya stated that Hamas would, in fact, be willing to lay down its arms and become a strictly political group if the Palestinians were able to establish an independent state on the 1967 borders in the West Bank and the Gaza Strip, in addition to allowing for the right of return of Palestinian refugees and their descendants. The latter point is a contentious one because Israel will never agree to be flooded with millions of Palestinian refugees and their descendants within its territory. However, the former point is significant in that this was the first time the Islamist group explicitly addressed the issue of disarmament and the end of its guerrilla-style militancy against Israel.

This statement comes six months after Hamas initiated a horrendous attack on Israel on Oct. 7 and, in turn, ignited a deadly and destructive war on Gaza that rages on. Israeli officials have insisted that the war would only end with Hamas returning the hostages and surrendering, though no clear parameters have actually been articulated to describe what exactly such a surrender would entail to be acceptable to Israel.

For years, Hamas has given mixed signals on its stance regarding a two-state solution, especially after the group revised its Charter in 2017, giving the impression of a more pragmatic willingness to accept Israel's existence. Recently, however, another senior political leader, Khaled Mashal, made a contradictory statement in which he rejected the two-state solution and signaled an unwillingness to accept Israel's presence in any part of historic Palestine.

In Gaza
A child sits in a small trolley cart with a jerrycan as people collect water from a tanker in Deir el-Balah in the central Gaza Strip on April 30. -/AFP VIA GETTY IMAGES
Hamas's incoherent and inconsistent political positions and stances are nothing new for the Islamist group, which has regularly had inner conflict between intransigent ideologues and a smaller number of relatively moderate figures who understood the limits of what the Palestinians could ever achieve. This goes back to 30 years ago, during the golden years of the Oslo Peace Process, an imperfect yet viable framework that provided a pathway and opportunity for the Palestinians to obtain an independent state on the 1967 borders. Hamas viciously and relentlessly attacked Yasser Arafat for "giving up" 78 percent of historic Palestine and accepting a state only on the territories of the West Bank and the Gaza Strip.

Hamas claimed that its armed resistance project would achieve what Arafat and the Palestine Liberation Organization (PLO) could not through political negotiations. However, the Islamist group never articulated how its terror attacks, including suicide bombings against Israeli bus stops, restaurants, and wedding halls, would actually reverse Israel's occupation of the Palestinian Territories or could totally defeat an advanced nuclear-armed nation with state-of-the-art armed forces. The only coherent constant in Hamas's strategy is that ongoing chaos and instability would somehow be adequate to perpetuate the conflict until a satisfactory resolution eventually emerges."""

# Check for GPU availability and set the device accordingly
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the specified device (GPU or CPU)
model = model.to(device)

# Tokenize the input text and ensure the tensor is on the correct device
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate summary IDs from the model
summary_ids = model.generate(input_ids)

# Decode the generated IDs to text and remove special tokens
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

Hamas has repeatedly attacked Arafat for giving up 78 percent of Palestine.
