In [1]:
# Install the new library if not already installed
!pip install evaluate
!pip install rouge_score
!pip install bert-score

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requir

In [2]:
# Use current directory since the model files are in the current workspace
model_ckpt = "/kaggle/input/cnn-relu/transformers/default/1"

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

device = "cuda" if torch.cuda.is_available() else "cpu"

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

2025-09-29 16:20:46.278578: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759162846.448166      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759162846.497862      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
def get_response(input_text):
  batch = tokenizer([input_text],truncation=True,padding='longest',max_length=1024, return_tensors="pt").to(device)
  gen_out = model_pegasus.generate(**batch,max_length=150,num_beams=5, num_return_sequences=1, temperature=1.5)
  output_text = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
  # Return the first (and only) generated text as a string, not a list
  return output_text[0]

In [5]:
from tqdm import tqdm
import pandas as pd

test = pd.read_parquet('/kaggle/input/cnndailymail/test.parquet')

test = test.head(2000)

In [6]:
pred = []
for i in tqdm(test['article'], total=len(test)):
    pred.append(get_response(i))

  0%|          | 0/2000 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 1/2000 [00:02<1:27:50,  2.64s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 2/2000 [00:04<1:05:17,  1.96s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 3/2000 [00:05<1:02:11,  1.87s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 4/2000 [00:07<1:00:27,  1.82s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 5/2000 [00:09<59:19,  1.78s/it]  The following generation flags are not v

In [7]:
ref_text = []
for i in tqdm(test['highlights'], total=len(test)):
    ref_text.append(i)

100%|██████████| 2000/2000 [00:00<00:00, 2205207.15it/s]


In [8]:
import evaluate

# Load ROUGE metric
rouge = evaluate.load("rouge")

# Add predictions and references
results = rouge.compute(predictions=pred, references=ref_text)

print(results)


Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': 0.1233319929404737, 'rouge2': 0.012068216173574405, 'rougeL': 0.10133647452418211, 'rougeLsum': 0.11480896122274818}


In [9]:
len(ref_text)

2000

In [10]:
# Calculate BLEU Score
import evaluate

bleu = evaluate.load("bleu")
bleu_results = bleu.compute(predictions=pred, references=[[ref] for ref in ref_text])

print("BLEU Results:")
print(bleu_results)
print(f"BLEU Score: {bleu_results['bleu']:.4f}")
print()

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

BLEU Results:
{'bleu': 0.008105377976227209, 'precisions': [0.1802282477910524, 0.015035065210079846, 0.0022611428352431687, 0.0009176837661741764], 'brevity_penalty': 0.9360218057501033, 'length_ratio': 0.9379837891448831, 'translation_length': 82279, 'reference_length': 87719}
BLEU Score: 0.0081



In [11]:
# Calculate BERTScore
bertscore = evaluate.load("bertscore")
bertscore_results = bertscore.compute(predictions=pred, references=ref_text, lang="en")

print("BERTScore Results:")
print(f"Precision: {sum(bertscore_results['precision'])/len(bertscore_results['precision']):.4f}")
print(f"Recall: {sum(bertscore_results['recall'])/len(bertscore_results['recall']):.4f}")
print(f"F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1']):.4f}")
print()

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Results:
Precision: 0.8154
Recall: 0.8133
F1: 0.8143

