In [19]:
!pip install transformers datasets rouge-score accelerate
!pip install sentencepiece
!pip install matplotlib
!pip install absl-py nltk rouge_score
!pip install bert-score
!pip install rouge_score
!pip install datasets

In [None]:
import os
import torch
import gc
import time
from datetime import datetime
from torch.utils.data import DataLoader, Dataset
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric
from torch.cuda.amp import GradScaler, autocast
from bert_score import score

# Set environment variable for PyTorch CUDA memory allocation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

In [None]:
class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])


In [None]:
def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
  """
  Prepare input data for model fine-tuning
  """
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  prepare_val = False if val_texts is None or val_labels is None else True
  prepare_test = False if test_texts is None or test_labels is None else True

  def tokenize_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True)
    decodings = tokenizer(labels, truncation=True, padding=True)
    dataset_tokenized = PegasusDataset(encodings, decodings)
    return dataset_tokenized

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset, tokenizer

In [None]:
def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, output_dir='./results'):
    """
    Prepare configurations and base model for fine-tuning
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    # Freeze all model parameters initially
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last 6 layers of the decoder
    num_decoder_layers = model.config.decoder_layers
    layers_to_unfreeze = num_decoder_layers - 6
    for layer in model.model.decoder.layers[layers_to_unfreeze:]:
        for param in layer.parameters():
            param.requires_grad = True

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,            # output directory
        num_train_epochs=6,               # number of training epochs
        per_device_train_batch_size=8,    # batch size per device during training
        per_device_eval_batch_size=8,     # batch size for evaluation
        save_strategy="epoch",                   # number of updates steps before checkpoint saves
        save_total_limit=5,               # limit the total amount of checkpoints
        evaluation_strategy='epoch',      # evaluation strategy to adopt during training
        eval_steps=100,                   # number of update steps before evaluation
        warmup_steps=500,                 # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                # strength of weight decay
        logging_dir=f'{output_dir}/logs', # directory for storing logs
        logging_steps=10,
        seed=42 ,
        load_best_model_at_end=True,
        metric_for_best_model='loss'
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset if val_dataset else None,  # evaluation dataset
        tokenizer=tokenizer                  # tokenizer for encoding the data
    )

    return trainer


In [18]:
if __name__ == '__main__':
    # Load the full XSum dataset
    dataset = load_dataset("xsum")

    # Sample a subset of the dataset for training, validation, and testing
    # Using 20% of training and 50% of validation and testing
    train_size = int(0.10 * len(dataset['train']))
    val_size = int(0.20 * len(dataset['validation']))
    test_size = int(0.20 * len(dataset['test']))

    # Randomly shuffle and select the subset for training, validation, and testing
    train_dataset = dataset['train'].shuffle(seed=42).select(range(train_size))
    val_dataset = dataset['validation'].shuffle(seed=42).select(range(val_size))
    test_dataset = dataset['test'].shuffle(seed=42).select(range(test_size))

    # Extract texts and labels from the subsets
    train_texts, train_labels = train_dataset['document'], train_dataset['summary']
    val_texts, val_labels = val_dataset['document'], val_dataset['summary'] if val_dataset else (None, None)
    test_texts, test_labels = test_dataset['document'], test_dataset['summary'] if test_dataset else (None, None)

    # Prepare data and tokenizer
    model_name = 'google/pegasus-large'
    train_dataset, val_dataset, test_dataset, tokenizer = prepare_data(
        model_name, train_texts, train_labels, val_texts, val_labels, test_texts, test_labels
    )
      # Prepare the trainer with fine-tuning setup
    trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset)
    trainer.train()
    print("Fine tuning finished successfully")

Syncing run ./results to Weights & Biases (docs)
Fine tuning finished successfully


In [None]:
model_name = "./results/checkpoint-15306"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [4]:
dataset = load_dataset("xsum")
test_size = int(0.05 * len(dataset['test']))
test_dataset = dataset['test'].shuffle(seed=42).select(range(test_size))
test_texts = test_dataset['document']


Downloading builder script:   100%|          | 2.05k/2.05k 
Downloading metadata:   100%|          | 954/954 
Using custom data configuration default
Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...
Downloading data files:   100%|          | 2/2 
Downloading data:   100%|          | 255M/255M 
Downloading data:   100%|          | 1.00/1.00M 
Generating train split:   100%|          | 204045/204045 
Generating validation split:   100%|          | 11332/11332 
Generating test split:   100%|          | 11334/11334 
Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.
  # 100%|          | 3/3 



In [None]:
test_encodings = tokenizer(test_texts, truncation=True, padding='longest', return_tensors="pt")

# Running inference on GPU, if available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

batch_size = 8
generated_summaries = []

for i in range(0, test_encodings.input_ids.size(0), batch_size):
    batch_input_ids = test_encodings.input_ids[i:i+batch_size].to(device)
    summary_ids = model.generate(batch_input_ids)
    generated_summaries += [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]




In [5]:
rouge = load_metric('rouge')
references = test_dataset['summary']
results = rouge.compute(predictions=generated_summaries, references=references, use_stemmer=True)

# Print out the results
for key in results.keys():
    print(key, results[key].mid.fmeasure)

rouge1 0.3538725498074857
rouge2 0.14308078644601482
rougeL 0.2797705677040414
rougeLsum 0.2802197843314166


In [None]:
path_to_checkpoint = "./results/checkpoint-15306"
model = PegasusForConditionalGeneration.from_pretrained(path_to_checkpoint)
tokenizer = PegasusTokenizer.from_pretrained(path_to_checkpoint)

original_text = """
Dogs can understand that certain words refer to specific objects, according to a recent study, suggesting that they may understand words in a similar way to humans.

It offers the first evidence of brain activity for this comprehension in a non-human animal, researchers said, though the study’s conclusion has faced scrutiny from other experts in the field.

 It has long been known that dogs can learn commands like sit, stay, or fetch and can respond to these words with learned behaviors, often with the help of a treat or two, but untangling their understanding of nouns has proven more difficult.
 """

inputs = tokenizer(original_text, return_tensors="pt", truncation=True, padding="longest", max_length=512)
summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)


Dogs can understand that certain words refer to specific objects, according to a recent study, suggesting that they may understand words in a similar way to humans.


In [None]:
# Path to the saved checkpoint
checkpoint_path = "./results/checkpoint-10204"

# Load the model and tokenizer from the checkpoint
model = PegasusForConditionalGeneration.from_pretrained(checkpoint_path)
tokenizer = PegasusTokenizer.from_pretrained(checkpoint_path)

In [6]:
dataset = load_dataset("xsum")

# Sample a subset of the dataset for training, validation, and testing
    # Using 10% of training and 20% of validation and testing
train_size = int(0.10 * len(dataset['train']))
val_size = int(0.20 * len(dataset['validation']))
test_size = int(0.20 * len(dataset['test']))

    # Randomly shuffle and select the subset for training, validation, and testing
train_dataset = dataset['train'].shuffle(seed=42).select(range(train_size))
val_dataset = dataset['validation'].shuffle(seed=42).select(range(val_size))
test_dataset = dataset['test'].shuffle(seed=42).select(range(test_size))

train_texts, train_labels = train_dataset['document'], train_dataset['summary']
val_texts, val_labels = val_dataset['document'], val_dataset['summary'] if val_dataset else (None, None)
test_texts, test_labels = test_dataset['document'], test_dataset['summary'] if test_dataset else (None, None)

# Tokenize the data
train_dataset, val_dataset, _, _ = prepare_data(
    'google/pegasus-large', train_texts, train_labels, val_texts, val_labels
)


Downloading builder script:   100%|          | 2.05k/2.05k 
Downloading metadata:   100%|          | 954/954 
Using custom data configuration default
Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...
Downloading data files:   100%|          | 2/2 
Downloading data:   100%|          | 255/255M 
Downloading data:   100%|          | 1.00/1.00M 
Generating train split:   100%|          | 204045/204045 
Generating validation split:   100%|          | 11332/11332 
Generating test split:   100%|          | 11332/11334 
Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.
  100%|          | 3/3 
Downloading spiece.model:   100%|          

In [None]:
for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last 6 layers of the decoder
num_decoder_layers = model.config.decoder_layers
layers_to_unfreeze = num_decoder_layers - 6
for layer in model.model.decoder.layers[layers_to_unfreeze:]:
    for param in layer.parameters():
        param.requires_grad = True

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",         # Directory to save the outputs
    num_train_epochs=10,            # Total epochs to reach including previous training
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_dir="./results/logs",   # Directory to save logs
    resume_from_checkpoint=checkpoint_path,  # Continue from last checkpoint
    seed=42,
    load_best_model_at_end=True,
    metric_for_best_model='loss'
)

In [17]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Resume training
trainer.train()

Syncing run ./results to Weights & Biases (docs)
Fine tuning finished successfully


In [None]:
model_name = "./results/checkpoint-10204"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [8]:
dataset = load_dataset("xsum")
test_size = int(0.05 * len(dataset['test']))
test_dataset = dataset['test'].shuffle(seed=42).select(range(test_size))
test_texts = test_dataset['document']
test_references = test_dataset['summary']

In [None]:
test_encodings = tokenizer(test_texts, truncation=True, padding='longest', return_tensors="pt")


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

batch_size = 8
generated_summaries = []

for i in range(0, test_encodings.input_ids.size(0), batch_size):
    batch_input_ids = test_encodings.input_ids[i:i+batch_size].to(device)
    summary_ids = model.generate(batch_input_ids)
    generated_summaries += [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]


In [15]:
rouge = load_metric('rouge')
references = test_dataset['summary']
results = rouge.compute(predictions=generated_summaries, references=references, use_stemmer=True)

# Print out the results
for key in results.keys():
    print(key, results[key].mid.fmeasure)

rouge1 0.3658887282212533
rouge2 0.1500332057209846
rougeL 0.29663345261467633
rougeLsum 0.29602682013087056


In [10]:
# Calculate BERTScore
P, R, F1 = score(generated_summaries, test_references, lang="en", rescale_with_baseline=True)

# Calculate the mean scores for BERTScore
P_mean = P.mean().item()
R_mean = R.mean().item()
F1_mean = F1.mean().item()

# Print the BERTScore results
print(f"BERTScore Precision (mean): {P_mean:.4f}")
print(f"BERTScore Recall (mean): {R_mean:.4f}")
print(f"BERTScore F1 (mean): {F1_mean:.4f}")

BERTScore Precision (mean): 0.4256
BERTScore Recall (mean): 0.3770
BERTScore F1 (mean): 0.4014


In [None]:
model_name = "./results/checkpoint-12755"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [11]:
dataset = load_dataset("xsum")
test_size = int(0.05 * len(dataset['test']))
test_dataset = dataset['test'].shuffle(seed=42).select(range(test_size))
test_texts = test_dataset['document']


Using custom data configuration default
Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...
Downloading data files:   100%|          | 2/2 
Generating train split:   100%|          | 204045/204045 
Generating validation split:   100%|          | 11332/11332 
Generating test split:   100%|          | 11334/11334 
Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.
  100%|          | 3/3 



In [None]:
test_encodings = tokenizer(test_texts, truncation=True, padding='longest', return_tensors="pt")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

batch_size = 8
generated_summaries = []

for i in range(0, test_encodings.input_ids.size(0), batch_size):
    batch_input_ids = test_encodings.input_ids[i:i+batch_size].to(device)
    summary_ids = model.generate(batch_input_ids)
    generated_summaries += [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]




In [13]:
rouge = load_metric('rouge')
references = test_dataset['summary']
results = rouge.compute(predictions=generated_summaries, references=references, use_stemmer=True)

# Print out the results
for key in results.keys():
    print(key, results[key].mid.fmeasure)

rouge1 0.3615360871166696
rouge2 0.14448061052666816
rougeL 0.28897143776885015
rougeLsum 0.28845520980130684


In [None]:
path_to_checkpoint = "./results/checkpoint-12755"
model = PegasusForConditionalGeneration.from_pretrained(path_to_checkpoint)
tokenizer = PegasusTokenizer.from_pretrained(path_to_checkpoint)

original_text = """
Dogs can understand that certain words refer to specific objects, according to a recent study, suggesting that they may understand words in a similar way to humans.

It offers the first evidence of brain activity for this comprehension in a non-human animal, researchers said, though the study’s conclusion has faced scrutiny from other experts in the field.

 It has long been known that dogs can learn commands like sit, stay, or fetch and can respond to these words with learned behaviors, often with the help of a treat or two, but untangling their understanding of nouns has proven more difficult.
 """

# Encode the text into tensor and run it through the model
inputs = tokenizer(original_text, return_tensors="pt", truncation=True, padding="longest", max_length=512)
summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

Dogs have been shown to be able to understand certain words, according to a new study in the field of animal psychology.


In [None]:
path_to_checkpoint = "./results/checkpoint-12755"
model = PegasusForConditionalGeneration.from_pretrained(path_to_checkpoint)
tokenizer = PegasusTokenizer.from_pretrained(path_to_checkpoint)

original_text = """
GizmoChina has spotted data about Google's smartwatch has been spotted in the US Federal Communications Commission's (FCC) certification website's database. The source believes it to be the Pixel Watch 2a, the budget flagship version of the Pixel Watch 2.

The FCC has tagged Google's new watch with the catalog number G4SKY and the device is believed to run on WearOS. It will also get Bluetooth, Wi-Fi and LTE support in one model.

In addition, the FCC has information about a UWB module that can be used to locate the watch if it is lost and unlock a synced smartphone more accurately.
 """

# Encode the text into tensor and run it through the model
inputs = tokenizer(original_text, return_tensors="pt", truncation=True, padding="longest", max_length=512)
summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

Google has released a new version of its smartwatch, the G4SKY.


In [None]:
path_to_checkpoint = "./results/checkpoint-12755"
model = PegasusForConditionalGeneration.from_pretrained(path_to_checkpoint)
tokenizer = PegasusTokenizer.from_pretrained(path_to_checkpoint)

original_text = """
Four almost simultaneous flares erupted from the Sun yesterday, which is considered a very rare phenomenon. As a result of the explosions, emissions can reach the Earth, causing a geomagnetic storm, reports Space.com.

Solar flares are explosions on the Sun's surface that release intense bursts of electromagnetic radiation. They occur when magnetic energy builds up in the Sun's atmosphere and is quickly released from it.

Three of the flares were caused by sunspots and the other by magnetic filaments hundreds of thousands of miles apart and connected by nearly invisible magnetic rings in the Sun's outer atmosphere known as the corona.
"""

# Encode the text into tensor and run it through the model
inputs = tokenizer(original_text, return_tensors="pt", truncation=True, padding="longest", max_length=512)
summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

Three of the brightest solar flares on the Sun's surface are caused by sunspots and the other by solar wind.


In [None]:
path_to_checkpoint = "./results/checkpoint-12755"
model = PegasusForConditionalGeneration.from_pretrained(path_to_checkpoint)
tokenizer = PegasusTokenizer.from_pretrained(path_to_checkpoint)

original_text = """
The current understanding of the origin of the Universe is based on the Big Bang theory, which is supported by a huge amount of observational data. According to this theory, the universe began approximately 13.8 billion years ago: all matter, energy, space and time were concentrated at one point, known as a singularity. The Universe then underwent a rapid and violent expansion known as the Big Bang, during which space itself began to expand and cool.
"""

# Encode the text into tensor and run it through the model
inputs = tokenizer(original_text, return_tensors="pt", truncation=True, padding="longest", max_length=512)
summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

The Universe has been expanding ever since the Big Bang theory was first proposed in 1859.


In [None]:
path_to_checkpoint = "./results/checkpoint-12755"
model = PegasusForConditionalGeneration.from_pretrained(path_to_checkpoint)
tokenizer = PegasusTokenizer.from_pretrained(path_to_checkpoint)

original_text = """
In the first quarter of 2024, the number of attacks on Android smartphone users in Russia increased 5.2 times compared to the same period last year. According to cybersecurity expert Dmitry Kalinin, more than 19 million Russian users became victims of hackers during the specified period.

Among the most dangerous threats were the Dwphon and Mamont viruses. Dwphon is capable of collecting personal data of the device owner, information about applications, and even downloading malicious software without the user's consent.

Mamont is aimed at extorting payment data and access to SMS on an infected device.

Some of the malware, including Dwphon, may come pre-installed on devices right out of the box. As Kalinin noted, this poses a serious threat to users, since attackers can compromise the device supply chain by introducing malware in the early stages of production without the knowledge of the manufacturer or other participants in this process.
Three of the flares were caused by sunspots and the other by magnetic filaments hundreds of thousands of miles apart and connected by nearly invisible magnetic rings in the Sun's outer atmosphere known as the corona.
"""

# Encode the text into tensor and run it through the model
inputs = tokenizer(original_text, return_tensors="pt", truncation=True, padding="longest", max_length=512)
summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

The number of attacks on Android devices in Russia has more than doubled in the past year, according to a report by Kaspersky Lab.


In [None]:
path_to_checkpoint = "./results/checkpoint-12755"
model = PegasusForConditionalGeneration.from_pretrained(path_to_checkpoint)
tokenizer = PegasusTokenizer.from_pretrained(path_to_checkpoint)

original_text = """
Computer games for children should not replace the whole world around them. Nikita Kocherzhenko, CEO of Uncom OS, a company that develops operating systems, told kp.ru how parents can understand that a child is becoming addicted to computer games and what needs to be done.

According to him, when obsession with games turns into addiction, the child's real problems and needs begin to be pushed into the background.

"For the sake of playing, the child begins to sacrifice his studies or simply forgets to eat. If going out of the game causes aggression, it gives reason to think," he said.

Nikita Kocherzhenko noted that if failures in the game cause aggression in the child, and it does not disappear even after turning off the game, this is also a very alarming signal.

"If the game is the only interest, and it is not possible to distract the child with something else, then that same cheap dopamine begins to subordinate itself to the child. And parents should not lose the moment and explain to the child that he is already too close to the dangerous point. It is necessary to stop," said the expert.

At the same time, he noted that computer games can also become an opportunity for a child's development and a source of new experience that cannot be obtained elsewhere.

Researchers at the University of York have found that puzzles and strategy games improve a child's memory and concentration.
"""

# Encode the text into tensor and run it through the model
inputs = tokenizer(original_text, return_tensors="pt", truncation=True, padding="longest", max_length=512)
summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

Parents should not lose the moment and explain to a child that he is already too close to the dangerous point, says an expert.


In [None]:
path_to_checkpoint = "./results/checkpoint-12755"
model = PegasusForConditionalGeneration.from_pretrained(path_to_checkpoint)
tokenizer = PegasusTokenizer.from_pretrained(path_to_checkpoint)

original_text = """
An autoimmune disease such as multiple sclerosis is thought to result in part from rare immune responses to common infections.

For this study, scientists analysed blood samples taken from 250 multiple sclerosis patients before and after diagnosis and compared them with blood samples from healthy people.

The researchers thought they would see a spike in antibody levels when multiple sclerosis patients showed the first symptoms of the disease.


The researchers thought they would see a spike in antibody levels when multiple sclerosis patients showed the first symptoms of the disease. Instead, they found that 10 per cent of multiple sclerosis patients had strikingly high levels of autoantibodies - antibodies that can attack the body itself - years before diagnosis.

About a dozen of the autoantibodies identified by the researchers had chemical compositions similar to those found in common viruses. These included the Epstein-Barr virus, which affects more than 85 per cent of all people and has been highlighted in earlier studies as a potential cause of multiple sclerosis.

In fact, these 10% of multiple sclerosis patients showed signs of an immune war raging in the brain years before diagnosis. These patients also had elevated levels of a protein that is released when neurons are destroyed.

To confirm their findings, the researchers analysed blood samples from patients participating in another study linked to neurological symptoms. Again, the same set of autoantibodies were detected in 10 per cent of patients diagnosed with multiple sclerosis.

Scientists hope that these antibodies will someday form the basis of a simple blood test to detect forms of multiple sclerosis.
"""

# Encode the text into tensor and run it through the model
inputs = tokenizer(original_text, return_tensors="pt", truncation=True, padding="longest", max_length=512)
summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

Scientists have found that 10 per cent of people with multiple sclerosis have high levels of antibodies that can attack the body, a finding they hope will help them diagnose the disease.
