In [None]:
!pip install transformers datasets rouge_score

import inspect

import numpy as np
from datasets import load_dataset
from rouge_score import rouge_scorer
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Fetch the first 1% of the CNN/DailyMail (config 3.0.0) training split.
# A ValueError is reported with a short hint and then re-raised unchanged,
# so the failure still stops the notebook cell.
try:
    dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
except ValueError as load_error:
    print(f"Error loading dataset: {load_error}")
    print("Check dataset name, version, and Hugging Face Hub availability.")
    raise


# Tokenizer and model are both loaded from the same pretrained checkpoint.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocess Data
def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for training.

    Args:
        examples: Batch mapping with an "article" and a "highlights" entry,
            each a sequence of strings.

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles
        (padded/truncated to 256 tokens) with an added "labels" entry holding
        the tokenized highlights (padded/truncated to 64 tokens), where
        padding positions are replaced by -100.
    """
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length", return_tensors="pt")

    labels = tokenizer(examples["highlights"], max_length=64, truncation=True, padding="max_length", return_tensors="pt")

    # Labels are padded to a fixed length; mark the padding with -100 (the
    # index the loss ignores) so the model is not trained to predict padding.
    label_ids = labels["input_ids"]
    label_ids[label_ids == tokenizer.pad_token_id] = -100

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize the whole subset in batches.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Training
# No evaluation strategy is passed: "no" is already the default, and the
# `evaluation_strategy` keyword has been renamed in newer transformers
# releases, so omitting it keeps this cell working across versions.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    save_steps=10000,
    logging_steps=100,
    push_to_hub=False,
    report_to="none",
)

# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate `tokenizer=`; pick whichever keyword the installed Trainer has.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

# Simplified Evaluation
def compute_metrics(eval_pred):
    """Compute average ROUGE-1/2/L F-measures for a set of predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be
            token ids, a tuple whose first element is the prediction array,
            or a 3-D array of per-token scores (reduced with argmax over the
            last axis). ``labels`` are token ids in which -100 marks ignored
            positions.

    Returns:
        Dict with "rouge1", "rouge2" and "rougeL" mean F-measures. When no
        pair could be scored, the three values are 0.0 and an "error" entry
        describes the situation.
    """
    predictions, labels = eval_pred

    # Some models return several outputs; the first one holds the predictions.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    if isinstance(predictions, np.ndarray):
        # A plain (non-generating) prediction step yields per-token scores
        # over the vocabulary rather than ids; take the most likely token.
        if predictions.ndim == 3:
            predictions = predictions.argmax(axis=-1)
        # -100 is not a valid token id and cannot be decoded.
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []

    for ref, pred in zip(decoded_labels, decoded_preds):
        # Best effort: a pair that cannot be scored is reported and skipped
        # so one bad example does not abort the whole evaluation.
        try:
            rouge_scores.append(scorer.score(ref, pred))
        except Exception as e:
            print(f"Error calculating ROUGE score: {e}")
            print(f"Reference: {ref}")
            print(f"Prediction: {pred}")

    # Averaging an empty list would yield NaN, so report zeros instead.
    if not rouge_scores:
        return {
            'rouge1': 0.0,
            'rouge2': 0.0,
            'rougeL': 0.0,
            'error': "No valid ROUGE scores calculated"
        }

    return {
        metric: np.mean([score[metric].fmeasure for score in rouge_scores])
        for metric in ('rouge1', 'rouge2', 'rougeL')
    }

class CustomTrainer(Trainer):
    """Trainer that masks label padding with -100 before each prediction step."""

    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys=None,
    ):
        """Run the parent prediction step with padding removed from the labels.

        Args:
            model: Model to evaluate.
            inputs: Batch mapping; may contain a "labels" tensor.
            prediction_loss_only: Forwarded unchanged to the parent class.
            ignore_keys: Forwarded unchanged to the parent class.

        Returns:
            Whatever the parent ``prediction_step`` returns.
        """
        # Batches without labels (e.g. pure prediction) are passed through
        # untouched instead of failing on the missing key.
        if "labels" in inputs and inputs["labels"] is not None:
            labels = inputs["labels"].clone()
            labels[labels == tokenizer.pad_token_id] = -100
            # Build a new mapping so the caller's batch is not modified.
            inputs = {**inputs, "labels": labels}
        return super().prediction_step(
            model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
        )




  trainer = Trainer(


Step,Training Loss
100,3.3814
200,2.6262
300,2.5081


In [None]:
# Sample news snippets to summarize with the fine-tuned model.
texts = ["The U.S. economy added 850,000 jobs in June, a sign of continued recovery as businesses reopen and consumers spend more. The unemployment rate, however, ticked up slightly to 5.9% from 5.8% in May.",
        "A massive wildfire in northern California has scorched over 150,000 acres, forcing thousands to evacuate. Firefighters are struggling to contain the blaze amid high temperatures and strong winds.",
        "Scientists have discovered a new species of dinosaur in Argentina. The creature, named 'Llukalkan aliocranianus,' lived approximately 80 million years ago and is believed to have been a formidable predator.",
        "The Tokyo 2020 Olympics, postponed due to the COVID-19 pandemic, are set to begin with strict health protocols in place. Athletes will undergo regular testing, and spectators will be limited to local residents.",
        "A recent study suggests that drinking coffee may reduce the risk of developing Alzheimer's disease. Researchers found that participants who consumed higher amounts of caffeine had a lower incidence of the neurodegenerative condition.",
        "The United Nations has called for an immediate ceasefire in the ongoing conflict in Yemen. The humanitarian crisis has worsened, with millions facing famine and limited access to medical supplies.",
        "Tech giant Apple has announced plans to invest $1 billion in building a new campus in North Carolina. The facility is expected to create thousands of jobs and bolster the state's economy."]

# Put the model in inference mode so dropout is disabled during generation,
# whatever mode an earlier training run left it in.
model.eval()

for text in texts:
    # Truncate to the same 256-token limit used when preprocessing the
    # training data, and move the ids to the device the model lives on.
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
    ).to(model.device)
    output = model.generate(
        input_ids,
        max_length=100,
        num_beams=4,
        early_stopping=True,
        do_sample=True,
        temperature=0.9,
        top_k=50,
        top_p=0.95,
    )
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    print(summary)

The U.S. economy added 850,000 jobs in June. The unemployment rate ticked slightly from 5.8% in May.
blaze in northern California has scorched over 150,000 acres. Firefighters are struggling to contain fire amid high temperatures and strong winds.
'Llukalkan aliocranianus' lived approximately 80 million years ago.
Tokyo 2020 is postponed due to the COVID-19 pandemic.
drinking coffee may reduce the risk of Alzheimer's.
the conflict has worsened, with millions facing famine and limited access to medical supplies.
Apple has announced plans to invest $1 billion in building a new campus in North Carolina. The facility will create thousands of jobs and bolster the state's economy.
