In [None]:
# Pretrained checkpoint shared by the tokenizer and the model.
model_name = "t5-base"

# Tokenizer and seq2seq model are both restored from the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Only the first 1% of the CNN/DailyMail (config 3.0.0) training split is
# loaded, which keeps the fine-tuning run short.
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="train[:1%]",
)

# Preprocess Data
def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for T5.

    Args:
        examples: A batch (as passed by ``dataset.map(..., batched=True)``)
            with an ``"article"`` list of source texts and a ``"highlights"``
            list of target summaries.

    Returns:
        The tokenizer output for the prefixed articles (truncated/padded to
        512 tokens) with an added ``"labels"`` entry: the target token ids
        (truncated/padded to 128 tokens) in which every padding id has been
        replaced by ``-100``.
    """
    # T5 is prompted with a task prefix; each article gets the same one.
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Mask padding positions with -100 so they are skipped by the loss
    # (the ignore index documented for Hugging Face seq2seq labels).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run the preprocessing over the whole subset, one batch of examples per call.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # No evaluation pass; log every 100 steps and cap the number of
    # checkpoints kept on disk.
    evaluation_strategy="no",
    logging_steps=100,
    save_total_limit=2,
    # Keep the run local: no experiment tracker, no Hub upload.
    report_to="none",
    push_to_hub=False,
)

# NOTE(review): the captured cell output shows a warning pointing at this
# call; presumably the `tokenizer=` argument is deprecated in the installed
# transformers version - confirm before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

# Fine-tune; as the last expression of the cell, the returned TrainOutput is
# displayed by the notebook.
trainer.train()


  trainer = Trainer(


Step,Training Loss
100,1.787
200,1.6516
300,1.5977
400,1.6156
500,1.5595
600,1.5807
700,1.5724
800,1.5644
900,1.5392
1000,1.5084


TrainOutput(global_step=1077, training_loss=1.5937534002872562, metrics={'train_runtime': 1515.6891, 'train_samples_per_second': 5.683, 'train_steps_per_second': 0.711, 'total_flos': 5244954311393280.0, 'train_loss': 1.5937534002872562, 'epoch': 3.0})

In [None]:
# Short news-style passages used to spot-check the fine-tuned model.
texts = [
    "The U.S. economy added 850,000 jobs in June, a sign of continued recovery as businesses reopen and consumers spend more. The unemployment rate, however, ticked up slightly to 5.9% from 5.8% in May.",
    "A massive wildfire in northern California has scorched over 150,000 acres, forcing thousands to evacuate. Firefighters are struggling to contain the blaze amid high temperatures and strong winds.",
    "Scientists have discovered a new species of dinosaur in Argentina. The creature, named 'Llukalkan aliocranianus,' lived approximately 80 million years ago and is believed to have been a formidable predator.",
    "The Tokyo 2020 Olympics, postponed due to the COVID-19 pandemic, are set to begin with strict health protocols in place. Athletes will undergo regular testing, and spectators will be limited to local residents.",
    "A recent study suggests that drinking coffee may reduce the risk of developing Alzheimer's disease. Researchers found that participants who consumed higher amounts of caffeine had a lower incidence of the neurodegenerative condition.",
    "The United Nations has called for an immediate ceasefire in the ongoing conflict in Yemen. The humanitarian crisis has worsened, with millions facing famine and limited access to medical supplies.",
    "Tech giant Apple has announced plans to invest $1 billion in building a new campus in North Carolina. The facility is expected to create thousands of jobs and bolster the state's economy.",
]

# The model is used here right after fine-tuning. generate() does not switch
# the module to evaluation mode on its own, so do it explicitly to make sure
# dropout is disabled during inference.
model.eval()

for text in texts:
    # Same task prefix and 512-token input limit as used in preprocessing.
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(model.device)

    # Beam search combined with sampling: output varies between runs because
    # do_sample=True draws tokens stochastically.
    output = model.generate(
        input_ids,
        max_length=100,
        num_beams=3,
        early_stopping=True,
        do_sample=True,
        temperature=0.9,
        top_k=50,
        top_p=0.95,
    )
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    print(summary)


U.S. economy added 850,000 jobs in June, sign of continued recovery . The unemployment rate ticked up slightly to 5.9% from 5.8% .
A massive wildfire in northern California has scorched over 150,000 acres . Firefighters are struggling to contain the blaze amid high temperatures and strong winds .
Scientists have discovered a new species of dinosaur in Argentina . The creature, named 'Lukalkan aliocranianus', lived approximately 80 million years ago .
Athletes will undergo regular testing, and spectators will be limited to local residents . The Tokyo 2020 Olympics are set to begin with strict health protocols in place .
Researchers found people who consumed more caffeine had a lower incidence of Alzheimer's disease .
The humanitarian crisis has worsened, with millions facing famine .
Tech giant Apple has announced plans to invest $1 billion in a new campus . The facility is expected to create thousands of jobs .
