In [None]:
!pip install -q -U transformers datasets peft trl bitsandbytes accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m111.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.6/504.6 kB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m111.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Generating Ranking Data

In [None]:
# Import all necessary libraries
from datasets import load_from_disk
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
from google.colab import drive
import pandas as pd
import pickle
import torch
import os

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2. Load fine-tuned Gemma model and tokenizer
model_path = "/content/drive/MyDrive/gemma_summarizer_run/checkpoint-125"
print(f"Loading fine-tuned model from: {model_path}")

model = AutoPeftModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 3. Load the original CNN/DailyMail dataset
print("Loading original CNN/DailyMail dataset...")
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
# We'll use a small slice of the validation set to generate data
data_slice = full_dataset["validation"].select(range(50))

# 4. Generate candidate summaries and create ranking data
print("\nGenerating candidate summaries and creating ranking data...")
ranker_data = []

for example in data_slice:
    article = example['article']
    human_summary = example['highlights'] # The "good" summary

    # A. Create the positive example (label = 1)
    ranker_data.append({
        "text": f"summarize: {article[:4000]} <sep> candidate: {human_summary}",
        "label": 1
    })

    # B. Generate 2 "distractor" summaries with the model
    prompt = f"""### Instruction:
Summarize the following news article.

### Input:
{article}

### Response:
"""
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate summaries using sampling to get variety
    outputs = model.generate(
        **input_ids,
        max_new_tokens=100,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=2
    )

    # C. Create negative examples (label = 0)
    for i in range(2):
      generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
      generated_summary = generated_text.split("### Response:")[1].strip()
      ranker_data.append({
          "text": f"summarize: {article[:4000]} <sep> candidate: {generated_summary}",
          "label": 0
      })

print(f"\nCreated {len(ranker_data)} examples for the ranker.")

# 5. Display a sample of the new dataset
print("\n--- Sample of the new ranking dataset ---")
df = pd.DataFrame(ranker_data)
print(df.head())

# 6. Save the new ranking dataset to Drive
save_path = "/content/drive/MyDrive/gemma_ranking_data.pkl"
print(f"\nSaving new ranking dataset to {save_path}...")
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "wb") as f:
    pickle.dump(ranker_data, f)

print("--- Stage 2 Complete! ---")

Mounting Google Drive...
Mounted at /content/drive
Loading fine-tuned model from: /content/drive/MyDrive/gemma_summarizer_run/checkpoint-125


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Loading original CNN/DailyMail dataset...

Generating candidate summaries and creating ranking data...

Created 150 examples for the ranker.

--- Sample of the new ranking dataset ---
                                                text  label
0  summarize: (CNN)Share, and your gift will be m...      1
1  summarize: (CNN)Share, and your gift will be m...      0
2  summarize: (CNN)Share, and your gift will be m...      0
3  summarize: (CNN)On the 6th of April 1996, San ...      1
4  summarize: (CNN)On the 6th of April 1996, San ...      0

Saving new ranking dataset to /content/drive/MyDrive/gemma_ranking_data.pkl...
--- Stage 2 Complete! ---


In [None]:
# Import all necessary libraries
import pickle
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from google.colab import drive
import torch

# 1. Mount drive and load the ranking data
print("Mounting Google Drive and loading ranking data...")
drive.mount('/content/drive')
with open("/content/drive/MyDrive/gemma_ranking_data.pkl", "rb") as f:
    ranker_data = pickle.load(f)
print(f"Successfully loaded {len(ranker_data)} examples.")

# 2. Convert to a Hugging Face Dataset and split
full_dataset = Dataset.from_list(ranker_data)
train_test_split = full_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
print(f"Created a training set with {len(train_dataset)} examples.")
print(f"Created an evaluation set with {len(eval_dataset)} examples.")

# 3. Tokenize the datasets
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(batch):
    # We do not pad here; the data collator will handle it.
    return tokenizer(batch["text"], truncation=True, max_length=512)

print(f"Tokenizing datasets with {model_name} tokenizer...")
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# We don't need to set the format to "torch" when using a data collator

# --- THE FIX STARTS HERE ---
# 4. Initialize the Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 5. Load a FRESH pre-trained DeBERTa-v3 model
print(f"Loading a fresh {model_name} model...")
model = DebertaV2ForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 6. Define Training Arguments
print("Defining training arguments...")
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/deberta_ranker_final_run",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    report_to="none"
)

# 7. Create the Trainer object with the data collator
print("Creating Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator, # <-- Add the data collator here
)

# 8. Start the final training run
print("\nStarting final ranker training...")
trainer.train()

print("\n--- Ranker training complete! ---")

Mounting Google Drive and loading ranking data...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully loaded 150 examples.
Created a training set with 120 examples.
Created an evaluation set with 30 examples.




Tokenizing datasets with microsoft/deberta-v3-base tokenizer...


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Loading a fresh microsoft/deberta-v3-base model...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining training arguments...
Creating Trainer...

Starting final ranker training...


Epoch,Training Loss,Validation Loss
1,No log,0.723907
2,0.650400,0.772383
3,0.620800,0.778181



--- Ranker training complete! ---


In [None]:
# Import all necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from google.colab import drive
import pandas as pd
import torch

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2. Define the path to best checkpoint
# The last run had 24 steps, so the final checkpoint is 'checkpoint-24'.
model_path = "/content/drive/MyDrive/deberta_ranker_final_run/checkpoint-24"
model_name = "microsoft/deberta-v3-base"
print(f"Loading model from: {model_path}")

# 3. Load fine-tuned model and the tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model and tokenizer loaded successfully.")

# 4. Create the prediction pipeline
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
print("Prediction pipeline created.")

# 5. Set up the test case
question = "What is the capital of France?"
document = "Paris is the capital and most populous city of France."
candidate_A = "The capital of France is Paris." # Correct
candidate_B = "Paris is a major global center for art." # Partially correct
candidate_C = "The capital is Berlin." # Incorrect

# Format the inputs (using 'summarize:' prefix to match training data)
text_A = f"summarize: {document} <sep> candidate: {candidate_A}"
text_B = f"summarize: {document} <sep> candidate: {candidate_B}"
text_C = f"summarize: {document} <sep> candidate: {candidate_C}"

# 6. Get predictions from the pipeline
print("\nGetting model predictions...")
predictions = pipe([text_A, text_B, text_C], top_k=None, truncation=True)

# 7. Process the results to create a ranked list
ranking_results = []
candidates = [candidate_A, candidate_B, candidate_C]
for i, result_pairs in enumerate(predictions):
    score_for_label_1 = 0
    for pair in result_pairs:
        if pair['label'] == 'LABEL_1':
            score_for_label_1 = pair['score']
            break
    ranking_results.append({
        "candidate": candidates[i],
        "ranking_score (LABEL_1)": score_for_label_1
    })

# 8. Display the final ranked list
df_ranked = pd.DataFrame(ranking_results)
df_ranked = df_ranked.sort_values(by="ranking_score (LABEL_1)", ascending=False)

print("\n--- Ranked Results (DeBERTa Ranker) ---")
print(df_ranked)

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading model from: /content/drive/MyDrive/deberta_ranker_final_run/checkpoint-24


Device set to use cuda:0


Model and tokenizer loaded successfully.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prediction pipeline created.

Getting model predictions...

--- Ranked Results (DeBERTa Ranker) ---
                                 candidate  ranking_score (LABEL_1)
0          The capital of France is Paris.                 0.387337
2                   The capital is Berlin.                 0.385592
1  Paris is a major global center for art.                 0.382445


### Larger Training

In [None]:
# Import all necessary libraries
import pickle
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from google.colab import drive
import torch

# 1. Mount drive and load the large, corrected dataset
print("Mounting Google Drive and loading the large dataset...")
drive.mount('/content/drive')
with open("/content/drive/MyDrive/prepared_data/rank_examples_large.pkl", "rb") as f:
    rank_examples = pickle.load(f)
print(f"Successfully loaded {len(rank_examples)} examples.")

# 2. Create the largest possible balanced dataset
print("Creating the largest possible balanced dataset...")
df = pd.DataFrame(rank_examples)
num_positives = len(df[df['label'] == 1])

positive_samples = df[df['label'] == 1]
negative_samples = df[df['label'] == 0].sample(n=num_positives, random_state=42)
balanced_df = pd.concat([positive_samples, negative_samples]).sample(frac=1, random_state=42)

# Create train and evaluation splits (90/10)
full_balanced_dataset = Dataset.from_pandas(balanced_df, preserve_index=False)
train_test_split = full_balanced_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
print(f"Created a balanced training set with {len(train_dataset)} examples.")
print(f"Created a balanced evaluation set with {len(eval_dataset)} examples.")

# 3. Tokenize the datasets
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(batch):
    return tokenizer(batch["text"], truncation=True, max_length=512)

print(f"Tokenizing datasets with {model_name} tokenizer...")
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# 4. Initialize the Data Collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 5. Load a FRESH pre-trained DeBERTa-v3 model
print(f"Loading a fresh {model_name} model...")
model = DebertaV2ForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 6. Define Training Arguments
print("Defining training arguments...")
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/deberta_ranker_final_large_run", # New directory
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    report_to="none"
)

# 7. Create the Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator
)

# 8. Start the final training run
print("\nStarting final, large-scale training run...")
trainer.train()

print("\n--- Final training run complete! ---")

Mounting Google Drive and loading the large dataset...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully loaded 43278 examples.
Creating the largest possible balanced dataset...
Created a balanced training set with 1872 examples.
Created a balanced evaluation set with 208 examples.
Tokenizing datasets with microsoft/deberta-v3-base tokenizer...


Map:   0%|          | 0/1872 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Loading a fresh microsoft/deberta-v3-base model...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining training arguments...

Starting final, large-scale training run...


Epoch,Training Loss,Validation Loss
1,0.6962,0.69338
2,0.694,0.693377
3,0.6943,0.693642



--- Final training run complete! ---


Larger Data

In [None]:
!pip install -q -U transformers datasets trl peft bitsandbytes accelerate

In [None]:
# Import all necessary libraries
import os
import torch
import pickle
import pandas as pd
from datasets import load_from_disk, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training # <-- Import the helper
from google.colab import drive

# 1. Mount Google Drive and load the dataset
print("Mounting Google Drive and loading dataset...")
drive.mount('/content/drive')
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")

# 2. Create a larger subset for fine-tuning
print("Creating a larger data subset...")
train_subset = full_dataset["train"].shuffle(seed=42).select(range(6000))
eval_subset = full_dataset["validation"].shuffle(seed=42).select(range(600))

# 3. Load tokenizer and model
model_id = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
model.config.use_cache = False

# --- THE FIX IS HERE ---
# 4. Prepare the quantized model for training
model = prepare_model_for_kbit_training(model)

# 5. Apply LoRA adapters to the model
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# 6. Create the full prompt and tokenize it
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['article'])):
        text = f"### Instruction:\nSummarize the following news article.\n\n### Input:\n{example['article'][i]}\n\n### Response:\n{example['highlights'][i]}"
        output_texts.append(text)
    return output_texts

tokenized_train = train_subset.map(lambda x: tokenizer(formatting_prompts_func(x), truncation=True, max_length=1024), batched=True, remove_columns=train_subset.column_names)
tokenized_eval = eval_subset.map(lambda x: tokenizer(formatting_prompts_func(x), truncation=True, max_length=1024), batched=True, remove_columns=eval_subset.column_names)

# 7. Define Training Arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/gemma_summarizer_final_run",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

# 8. Create the standard Trainer
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# 9. Start the fine-tuning run
print("\nStarting final Gemma fine-tuning run...")
trainer.train()

print("\n--- Gemma fine-tuning complete! ---")

Mounting Google Drive and loading dataset...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Creating a larger data subset...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



Starting final Gemma fine-tuning run...


Epoch,Training Loss,Validation Loss
1,2.2952,2.256084


Epoch,Training Loss,Validation Loss
1,2.2952,2.256084
2,2.1501,2.258027
3,1.9635,2.299853



--- Gemma fine-tuning complete! ---


In [None]:
# Import all necessary libraries
from datasets import load_from_disk
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from google.colab import drive
import torch
import evaluate
from tqdm import tqdm

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2a. Load the original base model with 4-bit quantization
model_id = "google/gemma-2b-it"
print(f"Loading base model: {model_id}...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 2b. Load LoRA adapters from the correct checkpoint path
adapter_path = "/content/drive/MyDrive/gemma_summarizer_final_run/checkpoint-375" # <-- Correct path
print(f"Loading adapters from: {adapter_path}...")
model = PeftModel.from_pretrained(base_model, adapter_path)

print("Fine-tuned model loaded successfully.")

# 3. Load the original CNN/DailyMail dataset
print("Loading original CNN/DailyMail dataset...")
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
test_slice = full_dataset["test"].select(range(100))

# 4. Generate summaries for the test set
print("\nGenerating summaries for the test set...")
model_summaries = []
human_summaries = []

for example in tqdm(test_slice):
    article = example['article']
    human_summary = example['highlights']
    human_summaries.append(human_summary)

    prompt = f"""### Instruction:
Summarize the following news article.

### Input:
{article}

### Response:
"""
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to("cuda")

    with torch.no_grad():
        outputs = model.generate(**input_ids, max_new_tokens=128)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        model_summary = generated_text[len(prompt):].strip()
        model_summaries.append(model_summary)

# 5. Compute the metrics
print("\nComputing evaluation scores...")
!pip install -q -U evaluate rouge_score bert_score sacrebleu

rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=model_summaries, references=human_summaries)
bleu_scores = bleu.compute(predictions=model_summaries, references=[[ref] for ref in human_summaries])
bertscore_scores = bertscore.compute(predictions=model_summaries, references=human_summaries, lang="en")

print("\n--- Evaluation Complete ---")
print("\nROUGE Scores:")
print(rouge_scores)
print("\nBLEU Score:")
print(bleu_scores)
print("\nBERTScore (mean values):")
print({
    "precision": sum(bertscore_scores['precision']) / len(bertscore_scores['precision']),
    "recall": sum(bertscore_scores['recall']) / len(bertscore_scores['recall']),
    "f1": sum(bertscore_scores['f1']) / len(bertscore_scores['f1']),
})

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading base model: google/gemma-2b-it...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading adapters from: /content/drive/MyDrive/gemma_summarizer_final_run/checkpoint-375...
Fine-tuned model loaded successfully.
Loading original CNN/DailyMail dataset...

Generating summaries for the test set...


100%|██████████| 100/100 [14:11<00:00,  8.52s/it]



Computing evaluation scores...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)



--- Evaluation Complete ---

ROUGE Scores:
{'rouge1': np.float64(0.19554106818579312), 'rouge2': np.float64(0.0816916521236176), 'rougeL': np.float64(0.14206684842060688), 'rougeLsum': np.float64(0.18404368139945154)}

BLEU Score:
{'bleu': 0.05968890560743536, 'precisions': [0.17793252491295475, 0.06994787246939023, 0.03966213734851267, 0.025713932500927184], 'brevity_penalty': 1.0, 'length_ratio': 2.2258150721539285, 'translation_length': 8329, 'reference_length': 3742}

BERTScore (mean values):
{'precision': 0.6742493414878845, 'recall': 0.7079349261522293, 'f1': 0.6905899208784103}


