In [None]:
!pip install transformers datasets evaluate transformers[torch] rouge rouge_metric rouge_score

In [1]:
# import dependencies
import torch
import pprint

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, load_metric

from huggingface_hub import notebook_login


# BART - SAMSUM

## Load Model & Tokenizer

In [None]:
# Load the pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


## Load Dataset

In [None]:
from datasets import load_dataset, load_metric

dataset = load_dataset('samsum')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

Generate a sample summary with base model

In [None]:
sample = dataset['test'][0]['dialogue']
label = dataset['test'][0]['summary']

def generate_summary(input, llm):
  input_prompt = f'''
                  Summarize the following conversation.

                  {input}

                  Summary:
                  '''

  input_ids = tokenizer(input_prompt, return_tensors='pt')
  tokenized_output = llm.generate(input_ids['input_ids'], min_length=30, max_length=200)
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

  return output

output = generate_summary(sample, llm=model)
print('Sample')
print(sample)
print('-'*20)
print('Model Generated Sum:')
print(output)
print('Correct Summary:')
print(label)

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him ðŸ™‚
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
--------------------
Model Generated Sum:
Hannah asks Amanda for Betty's number. Amanda tries to find the number but can't find it. She asks Hannah to text Larry, who is a friend of Betty's. Hannah says she'd rather text him.
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


Write a preprocess function for data

In [None]:
# Define a function to preprocess the dataset
def tokenize_inputs(example):
  start_prompt = 'Summarize the following conversation. \n\n'
  end_prompt = '\n\nSummary'
  # dialogue = example["dialogue"]
  targets =[summar for summar in example["summary"]]
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

  return{'input_ids': tokenizer(prompt,  max_length=1024, padding='max_length', truncation=True, return_tensors='pt').input_ids,
         'labels': tokenizer(targets,  max_length=128, padding='max_length', truncation=True, return_tensors='pt').input_ids}

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/819 [00:00<?, ? examples/s]

Filter:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

(148, 4)
(9, 4)
(9, 4)


## Finetune Model

In [None]:
# Authenticate hugging face
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./bart-cnn-samsum-finetuned",
    # hub_model_id='aweosmeGuss/bart-cnn-samsum-finetuned'
    num_train_epochs=1,
    auto_find_batch_size=True,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.135,1.125072


TrainOutput(global_step=19, training_loss=0.9957206876654374, metrics={'train_runtime': 67.8992, 'train_samples_per_second': 2.18, 'train_steps_per_second': 0.28, 'total_flos': 320731481112576.0, 'train_loss': 0.9957206876654374, 'epoch': 1.0})

### Evaluate Perfromance

In [None]:
# Load the ROUGE metric
rouge_metric = load_metric("rouge")

# Define a function to generate summaries
def generate_summary(batch):
    inputs = tokenizer(batch["dialogue"], padding="max_length", truncation=True, max_length=512, return_tensors="pt").to('cuda') # type: 'transformers.tokenization_utils_base.BatchEncoding'
    with torch.no_grad():
        model.to('cuda')
        outputs = model.generate(**inputs) # type: 'torch.Tensor'
    return {'pred_summary_ids': tokenizer.batch_decode(outputs, skip_special_tokens=True)
}
# Generate summaries for the validation set
generated_summaries = tokenized_datasets['test'].map(generate_summary, batched=True)

# Compute the ROUGE score
references = tokenized_datasets['test']["summary"]
rouge_score = rouge_metric.compute(predictions=generated_summaries, references=references)



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Map:   0%|          | 0/9 [00:00<?, ? examples/s]

<class 'torch.Tensor'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
ROUGE score: {'rouge1': AggregateScore(low=Score(precision=0.012816202569480524, recall=1.0, fmeasure=0.025251462380846797), mid=Score(precision=0.017023564133775447, recall=1.0, fmeasure=0.033374596137725816), high=Score(precision=0.02152335175726073, recall=1.0, fmeasure=0.0420571809752918)), 'rouge2': AggregateScore(low=Score(precision=0.012348493919827315, recall=1.0, fmeasure=0.0243316389759488), mid=Score(precision=0.01613511340512816, recall=1.0, fmeasure=0.03171769425597848), high=Score(precision=0.020288553505286515, recall=1.0, fmeasure=0.03967126273790696)), 'rougeL': AggregateScore(low=Score(precision=0.012926279187386968, recall=1.0, fmeasure=0.025462750348444002), mid=Score(precision=0.016830497366843222, recall=1.0, fmeasure=0.03305818623744286), high=Score(precision=0.021482596730482008, recall=1.0, fmeasure=0.04201288702321822)), 'rougeLsum': AggregateScore(low=Score(precision=0.0130916

In [None]:
# Pretty print the ROUGE scores
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(rouge_score)


{   'rouge1': AggregateScore(low=Score(precision=0.012816202569480524, recall=1.0, fmeasure=0.025251462380846797), mid=Score(precision=0.017023564133775447, recall=1.0, fmeasure=0.033374596137725816), high=Score(precision=0.02152335175726073, recall=1.0, fmeasure=0.0420571809752918)),
    'rouge2': AggregateScore(low=Score(precision=0.012348493919827315, recall=1.0, fmeasure=0.0243316389759488), mid=Score(precision=0.01613511340512816, recall=1.0, fmeasure=0.03171769425597848), high=Score(precision=0.020288553505286515, recall=1.0, fmeasure=0.03967126273790696)),
    'rougeL': AggregateScore(low=Score(precision=0.012926279187386968, recall=1.0, fmeasure=0.025462750348444002), mid=Score(precision=0.016830497366843222, recall=1.0, fmeasure=0.03305818623744286), high=Score(precision=0.021482596730482008, recall=1.0, fmeasure=0.04201288702321822)),
    'rougeLsum': AggregateScore(low=Score(precision=0.013091618812015164, recall=1.0, fmeasure=0.025780840783888023), mid=Score(precision=0.017

# BART - SCITLDR

## Load Model & Tokenizer

In [None]:
# Load the pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

## Load Dataset

In [None]:
dataset = load_dataset('allenai/scitldr')
dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['source', 'source_labels', 'rouge_scores', 'paper_id', 'target'],
        num_rows: 1992
    })
    test: Dataset({
        features: ['source', 'source_labels', 'rouge_scores', 'paper_id', 'target'],
        num_rows: 618
    })
    validation: Dataset({
        features: ['source', 'source_labels', 'rouge_scores', 'paper_id', 'target'],
        num_rows: 619
    })
})

Generate a sample summary with base model

In [None]:
sample = dataset['test'][0]['source']
label = dataset['test'][0]['target']

def generate_summary(input, llm):
  input_prompt = f'''
                  Summarize the following article.

                  {input}

                  Summary:
                  '''

  input_ids = tokenizer(input_prompt, return_tensors='pt')
  tokenized_output = llm.generate(input_ids['input_ids'], min_length=30, max_length=200).to('cuda')
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

  return output

output = generate_summary(sample, llm=model)
print('Sample')
print(sample)
print('-'*20)
print('Model Generated Sum:')
print(output)
print('Correct Summary:')
print(label)

Sample
['Incremental class learning involves sequentially learning classes in bursts of examples from the same class.', 'This violates the assumptions that underlie  methods for training standard deep neural networks, and will cause them to suffer from catastrophic forgetting.', 'Arguably, the best method for incremental class learning is iCaRL, but it requires storing  training examples for each class, making it challenging to scale.', 'Here, we propose FearNet for incremental class learning.', 'FearNet is a generative model that does not store previous examples, making it memory efficient.', 'FearNet uses a brain-inspired dual-memory system in which new memories are consolidated from a network for recent memories inspired by the mammalian hippocampal complex to a network for long-term storage inspired by medial prefrontal cortex.', 'Memory consolidation is inspired by mechanisms that occur during sleep.', 'FearNet also uses a module inspired by the basolateral amygdala for determinin

Preprocess Input

In [None]:
# Define a function to preprocess the dataset
def tokenize_inputs(example):

  article = [''.join(article) for article in example["source"]]
  tldr = [''.join(summary) for summary in example["target"]]

  # Tokenize the article and tldr
  inputs = tokenizer(article, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
  targets = tokenizer(tldr, return_tensors="pt", max_length=128, truncation=True, padding="max_length")

  return {
      "input_ids": inputs.input_ids,
      "labels": targets.input_ids
  }
tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['source_labels', 'rouge_scores', 'paper_id'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 20 == 0, with_indices=True)

Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

Map:   0%|          | 0/618 [00:00<?, ? examples/s]

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1992 [00:00<?, ? examples/s]

Filter:   0%|          | 0/618 [00:00<?, ? examples/s]

Filter:   0%|          | 0/619 [00:00<?, ? examples/s]

In [None]:
print(tokenized_datasets)
print()
print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'labels'],
        num_rows: 31
    })
    validation: Dataset({
        features: ['source', 'target', 'input_ids', 'labels'],
        num_rows: 31
    })
})

(100, 4)
(31, 4)
(31, 4)


## Finetune Model

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./bart-cnn-scitldr-finetuned",
    # hub_model_id='aweosmeGuss/bart-cnn-scitldr-finetuned'
    num_train_epochs=1,
    auto_find_batch_size=True,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)



In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.6174,2.929382


TrainOutput(global_step=13, training_loss=1.4836159486037035, metrics={'train_runtime': 12.4457, 'train_samples_per_second': 8.035, 'train_steps_per_second': 1.045, 'total_flos': 108355230105600.0, 'train_loss': 1.4836159486037035, 'epoch': 1.0})

### Evaluate Perfromance

In [None]:
# Load the ROUGE metric
rouge_metric = load_metric("rouge")

# Define a function to generate summaries
def generate_summary(batch):
    articles = [''.join(article) for article in batch["source"]]
    inputs = tokenizer(articles, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to('cuda') # type: 'transformers.tokenization_utils_base.BatchEncoding'
    with torch.no_grad():
        model.to('cuda')
        outputs = model.generate(**inputs) # type: 'torch.Tensor'
    return {'pred_summary_ids': tokenizer.batch_decode(outputs, skip_special_tokens=True)
}
# Generate summaries for the validation set
generated_summaries = tokenized_datasets['test'].map(generate_summary, batched=True)

# Compute the ROUGE score
references = tokenized_datasets['test']["target"]
rouge_score = rouge_metric.compute(predictions=generated_summaries, references=references)

  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Map:   0%|          | 0/31 [00:00<?, ? examples/s]

In [None]:
# Pretty print the ROUGE scores
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(rouge_score)

{   'rouge1': AggregateScore(low=Score(precision=0.06570118366259305, recall=1.0, fmeasure=0.12293569463112017), mid=Score(precision=0.07241607067392214, recall=1.0, fmeasure=0.1343565073490426), high=Score(precision=0.07944603875919955, recall=1.0, fmeasure=0.14660587781887943)),
    'rouge2': AggregateScore(low=Score(precision=0.0651056861643197, recall=1.0, fmeasure=0.1217802654237229), mid=Score(precision=0.07169745546107753, recall=1.0, fmeasure=0.13314007426479968), high=Score(precision=0.07911936616972805, recall=1.0, fmeasure=0.14573968240181762)),
    'rougeL': AggregateScore(low=Score(precision=0.06563280297632383, recall=1.0, fmeasure=0.12282251179705495), mid=Score(precision=0.07256287151874372, recall=1.0, fmeasure=0.13475266240595107), high=Score(precision=0.08012661051465983, recall=1.0, fmeasure=0.14778598254783396)),
    'rougeLsum': AggregateScore(low=Score(precision=0.06598223623789497, recall=1.0, fmeasure=0.1233521003706573), mid=Score(precision=0.0724119065585816,

# PEGASUS - SAMSUM

## Load Model & Tokenizer

In [4]:
# Load the pre-trained BART model and tokenizer
model_name = "google/pegasus-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load Dataset

In [None]:
from datasets import load_dataset, load_metric

dataset = load_dataset('samsum')
dataset

Downloading data:   0%|          | 0.00/6.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/335k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

Generate a sample summary with base model

In [None]:
sample = dataset['test'][0]['dialogue']
label = dataset['test'][0]['summary']

def generate_summary(input, llm):
  input_prompt = f'''
                  Summarize the following conversation.

                  {input}

                  Summary:
                  '''

  input_ids = tokenizer(input_prompt, return_tensors='pt').to('cuda')
  tokenized_output = llm.generate(input_ids['input_ids'], min_length=30, max_length=200).to('cuda')
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

  return output

output = generate_summary(sample, llm=model)
print('Sample')
print(sample)
print('-'*20)
print('Model Generated Sum:')
print(output)
print('Correct Summary:')
print(label)

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him ðŸ™‚
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
--------------------
Model Generated Sum:
Amanda: Ask Larry Amanda: He called her last time we were at the park together Hannah: I don't know him well Hannah: file_gif> Amanda: Don't be shy, he's very nice Hannah: If you say so..
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


Write a preprocess function for data

In [None]:
# Define a function to preprocess the dataset
def tokenize_inputs(example):
  start_prompt = 'Summarize the following conversation. \n\n'
  end_prompt = '\n\nSummary'
  # dialogue = example["dialogue"]
  targets =[summar for summar in example["summary"]]
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

  return{'input_ids': tokenizer(prompt,  max_length=1024, padding='max_length', truncation=True, return_tensors='pt').input_ids,
         'labels': tokenizer(targets,  max_length=128, padding='max_length', truncation=True, return_tensors='pt').input_ids}

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/819 [00:00<?, ? examples/s]

Filter:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

(148, 4)
(9, 4)
(9, 4)


## Finetune Model

In [None]:
# Authenticate hugging face
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./pegasus-samsum-finetuned",
    # hub_model_id='aweosmeGuss/pegasus-samsum-finetuned'
    num_train_epochs=1,
    auto_find_batch_size=True,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.7665,0.66268


TrainOutput(global_step=74, training_loss=0.6793983884759851, metrics={'train_runtime': 64.2653, 'train_samples_per_second': 2.303, 'train_steps_per_second': 1.151, 'total_flos': 439198590959616.0, 'train_loss': 0.6793983884759851, 'epoch': 1.0})

### Evaluate Perfromance

In [None]:
# Load the ROUGE metric
rouge_metric = load_metric("rouge")

# Define a function to generate summaries
def generate_summary(batch):
    inputs = tokenizer(batch["dialogue"], padding="max_length", truncation=True, max_length=512, return_tensors="pt").to('cuda') # type: 'transformers.tokenization_utils_base.BatchEncoding'
    with torch.no_grad():
        model.to('cuda')
        outputs = model.generate(**inputs) # type: 'torch.Tensor'
    return {'pred_summary_ids': tokenizer.batch_decode(outputs, skip_special_tokens=True)
}
# Generate summaries for the validation set
generated_summaries = tokenized_datasets['test'].map(generate_summary, batched=True)

# Compute the ROUGE score
references = tokenized_datasets['test']["summary"]
rouge_score = rouge_metric.compute(predictions=generated_summaries, references=references)



  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [None]:
# Pretty print the ROUGE scores
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(rouge_score)


{   'rouge1': AggregateScore(low=Score(precision=0.012969470414046457, recall=1.0, fmeasure=0.025547083541981784), mid=Score(precision=0.01713048109609827, recall=1.0, fmeasure=0.033650526940795324), high=Score(precision=0.02149354197008536, recall=1.0, fmeasure=0.04204313805295831)),
    'rouge2': AggregateScore(low=Score(precision=0.012508806219319067, recall=1.0, fmeasure=0.02465877618107822), mid=Score(precision=0.016250340010003212, recall=1.0, fmeasure=0.031909632975450804), high=Score(precision=0.020199961205328156, recall=1.0, fmeasure=0.03955429812117059)),
    'rougeL': AggregateScore(low=Score(precision=0.013108407243789559, recall=1.0, fmeasure=0.02581532518843877), mid=Score(precision=0.01692537771834149, recall=1.0, fmeasure=0.033248222298581515), high=Score(precision=0.02140978174487773, recall=1.0, fmeasure=0.04179107320653185)),
    'rougeLsum': AggregateScore(low=Score(precision=0.013250338946540567, recall=1.0, fmeasure=0.026087860519421274), mid=Score(precision=0.01

In [None]:
output = generate_summary(sample, model)
print('Sample')
print(sample)
print('-'*20)
print('Model Generated Sum:')
print(output)
print('Correct Summary:')
print(label)

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him ðŸ™‚
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
--------------------
Model Generated Sum:
Amanda: Ask Larry Amanda: He called her last time we were at the park together Hannah: I don't know him well Hannah: file_gif> Amanda: Don't be shy, he's very nice Hannah: If you say so..
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


# PEGASUS - SCITLDR

## Load Model & Tokenizer

In [2]:
# Load the pre-trained BART model and tokenizer
model_name = "google/pegasus-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load Dataset

In [3]:
dataset = load_dataset('allenai/scitldr')
dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['source', 'source_labels', 'rouge_scores', 'paper_id', 'target'],
        num_rows: 1992
    })
    test: Dataset({
        features: ['source', 'source_labels', 'rouge_scores', 'paper_id', 'target'],
        num_rows: 618
    })
    validation: Dataset({
        features: ['source', 'source_labels', 'rouge_scores', 'paper_id', 'target'],
        num_rows: 619
    })
})

Generate a sample summary with base model

In [4]:
sample = dataset['test'][0]['source']
label = dataset['test'][0]['target']

def summarize(input, llm):
  input_prompt = f'''
                  Summarize the following article.

                  {input}

                  Summary:
                  '''

  input_ids = tokenizer(input_prompt, return_tensors='pt')
  tokenized_output = llm.generate(input_ids['input_ids'], min_length=30, max_length=200).to('cuda')
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

  return output

output = summarize(sample, llm=model)
print('Sample')
print(sample)
print('-'*20)
print('Model Generated Sum:')
print(output)
print('Correct Summary:')
print(label)

Sample
['Incremental class learning involves sequentially learning classes in bursts of examples from the same class.', 'This violates the assumptions that underlie  methods for training standard deep neural networks, and will cause them to suffer from catastrophic forgetting.', 'Arguably, the best method for incremental class learning is iCaRL, but it requires storing  training examples for each class, making it challenging to scale.', 'Here, we propose FearNet for incremental class learning.', 'FearNet is a generative model that does not store previous examples, making it memory efficient.', 'FearNet uses a brain-inspired dual-memory system in which new memories are consolidated from a network for recent memories inspired by the mammalian hippocampal complex to a network for long-term storage inspired by medial prefrontal cortex.', 'Memory consolidation is inspired by mechanisms that occur during sleep.', 'FearNet also uses a module inspired by the basolateral amygdala for determinin

Preprocess Input

In [4]:
# Define a function to preprocess the dataset
def tokenize_inputs(example):

  article = [''.join(article) for article in example["source"]]
  tldr = [''.join(summary) for summary in example["target"]]

  # Tokenize the article and tldr
  inputs = tokenizer(article, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
  targets = tokenizer(tldr, return_tensors="pt", max_length=128, truncation=True, padding="max_length")

  return {
      "input_ids": inputs.input_ids,
      "labels": targets.input_ids
  }
tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['source_labels', 'rouge_scores', 'paper_id'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 20 == 0, with_indices=True)

In [None]:
print(tokenized_datasets)
print()
print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'labels'],
        num_rows: 31
    })
    validation: Dataset({
        features: ['source', 'target', 'input_ids', 'labels'],
        num_rows: 31
    })
})

(100, 4)
(31, 4)
(31, 4)


Record the ROUGE before finetuning

In [5]:
# Load the ROUGE metric
rouge_metric = load_metric("rouge")

# Select only 10 examples due to memory limits
tokenized_datasets['test'] = tokenized_datasets['test'].select(range(10))


# Define a function to generate summaries
def generate_summary(batch):
    articles = [''.join(article) for article in batch["source"]]
    inputs = tokenizer(articles, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to('cuda') # type: 'transformers.tokenization_utils_base.BatchEncoding'
    with torch.no_grad():
        model.to('cuda')
    outputs = model.generate(**inputs) # type: 'torch.Tensor'
    return {'pred_summary_ids': tokenizer.batch_decode(outputs, skip_special_tokens=True)
}
# Generate summaries for the validation set
generated_summaries = tokenized_datasets['test'].map(generate_summary, batched=True)

# Compute the ROUGE score
references = tokenized_datasets['test']["target"]
rouge_score = rouge_metric.compute(predictions=generated_summaries, references=references)

  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [6]:
# Pretty print the ROUGE scores
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(rouge_score)

{   'rouge1': AggregateScore(low=Score(precision=0.06114318965324479, recall=1.0, fmeasure=0.11499840677564652), mid=Score(precision=0.07126638062802162, recall=1.0, fmeasure=0.13238468399126507), high=Score(precision=0.08492032121804577, recall=1.0, fmeasure=0.1555506379968625)),
    'rouge2': AggregateScore(low=Score(precision=0.059774064687431266, recall=1.0, fmeasure=0.11262526513022436), mid=Score(precision=0.07003111838813572, recall=1.0, fmeasure=0.13039325955863856), high=Score(precision=0.08361391992025188, recall=1.0, fmeasure=0.15366653977719294)),
    'rougeL': AggregateScore(low=Score(precision=0.06043237995205009, recall=1.0, fmeasure=0.1137944064168069), mid=Score(precision=0.071219798025144, recall=1.0, fmeasure=0.13242353451563144), high=Score(precision=0.08501753965997473, recall=1.0, fmeasure=0.15579480489230585)),
    'rougeLsum': AggregateScore(low=Score(precision=0.06167213511697834, recall=1.0, fmeasure=0.11572871991420275), mid=Score(precision=0.0718124544132096

## Finetune Model

In [None]:
# Authenticate hugging face
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [20]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./pegasus-scitldr-finetuned",
    hub_model_id='aweosmeGuss/pegasus-scitldr-finetuned',
    num_train_epochs=1,
    auto_find_batch_size=False,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)



In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8042,2.004027


TrainOutput(global_step=25, training_loss=0.8830069351196289, metrics={'train_runtime': 15.1982, 'train_samples_per_second': 6.58, 'train_steps_per_second': 1.645, 'total_flos': 144473220710400.0, 'train_loss': 0.8830069351196289, 'epoch': 1.0})

### Evaluate Perfromance

In [26]:
# Load the ROUGE metric
rouge_metric = load_metric("rouge")

# Select only 10 examples due to memory limits
tokenized_datasets['test'] = tokenized_datasets['test'].select(range(10))


# Define a function to generate summaries
def generate_summary(batch):
    articles = [''.join(article) for article in batch["source"]]
    inputs = tokenizer(articles, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to('cuda') # type: 'transformers.tokenization_utils_base.BatchEncoding'
    with torch.no_grad():
        model.to('cuda')
    outputs = model.generate(**inputs) # type: 'torch.Tensor'
    return {'pred_summary_ids': tokenizer.batch_decode(outputs, skip_special_tokens=True)
}
# Generate summaries for the validation set
generated_summaries = tokenized_datasets['test'].map(generate_summary, batched=True)

# Compute the ROUGE score
references = tokenized_datasets['test']["target"]
rouge_score = rouge_metric.compute(predictions=generated_summaries, references=references)

  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [27]:
# Pretty print the ROUGE scores
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(rouge_score)

{   'rouge1': AggregateScore(low=Score(precision=0.06068302437380315, recall=1.0, fmeasure=0.11432147602252049), mid=Score(precision=0.07118023969473061, recall=1.0, fmeasure=0.13230630316776132), high=Score(precision=0.08458611200620492, recall=1.0, fmeasure=0.1552871454434449)),
    'rouge2': AggregateScore(low=Score(precision=0.05993166781788172, recall=1.0, fmeasure=0.11298299056069656), mid=Score(precision=0.07006874821490026, recall=1.0, fmeasure=0.13043259992276374), high=Score(precision=0.0836649951829301, recall=1.0, fmeasure=0.15373115095896817)),
    'rougeL': AggregateScore(low=Score(precision=0.06081301906758884, recall=1.0, fmeasure=0.11448482149742702), mid=Score(precision=0.071473534706383, recall=1.0, fmeasure=0.1327614617456872), high=Score(precision=0.08477114339573603, recall=1.0, fmeasure=0.1555266457950424)),
    'rougeLsum': AggregateScore(low=Score(precision=0.061004727552381774, recall=1.0, fmeasure=0.11487475726087587), mid=Score(precision=0.07128998734518188,

In [34]:
sample = tokenized_datasets['test'][0]['source']
label = tokenized_datasets['test'][0]['target']

def summarize(input, llm):
  input_prompt = f'''
                  Summarize the following article.

                  {input}

                  Summary:
                  '''

  input_ids = tokenizer(input_prompt, return_tensors='pt').to('cuda')
  tokenized_output = llm.generate(input_ids['input_ids'], min_length=30, max_length=200).to('cuda')
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

  return output

output = summarize(sample, llm=model)
print('Sample')
print(sample)
print('-'*20)
print('Model Generated Sum:')
print(output)
print('Correct Summary:')
print(label)

Sample
['Incremental class learning involves sequentially learning classes in bursts of examples from the same class.', 'This violates the assumptions that underlie  methods for training standard deep neural networks, and will cause them to suffer from catastrophic forgetting.', 'Arguably, the best method for incremental class learning is iCaRL, but it requires storing  training examples for each class, making it challenging to scale.', 'Here, we propose FearNet for incremental class learning.', 'FearNet is a generative model that does not store previous examples, making it memory efficient.', 'FearNet uses a brain-inspired dual-memory system in which new memories are consolidated from a network for recent memories inspired by the mammalian hippocampal complex to a network for long-term storage inspired by medial prefrontal cortex.', 'Memory consolidation is inspired by mechanisms that occur during sleep.', 'FearNet also uses a module inspired by the basolateral amygdala for determinin