In [1]:
!pip install -q -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q -U transformers datasets peft bitsandbytes accelerate evaluate rouge_score bert_score sacrebleu

In [2]:
# Import all necessary libraries
from datasets import load_from_disk
from peft import PeftModel
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    pipeline,
    BitsAndBytesConfig
)
from google.colab import drive
import pandas as pd
import torch
import evaluate
from tqdm import tqdm

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# --- 2. Load Both Trained Models ---

# Load the DeBERTa Ranker
print("Loading DeBERTa ranker...")
ranker_path = "/content/drive/MyDrive/deberta_ranker_final_large_run/checkpoint-75"
ranker_model = AutoModelForSequenceClassification.from_pretrained(ranker_path)
ranker_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
ranker_pipe = pipeline("text-classification", model=ranker_model, tokenizer=ranker_tokenizer, device=0)
print("Ranker loaded.")

# Load the Gemma Summarizer
print("Loading Gemma summarizer...")
gemma_base_model_id = "google/gemma-2b-it"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
gemma_base_model = AutoModelForCausalLM.from_pretrained(gemma_base_model_id, quantization_config=bnb_config, device_map="auto")
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_base_model_id)
gemma_adapter_path = "/content/drive/MyDrive/gemma_summarizer_run/checkpoint-125"
gemma_model = PeftModel.from_pretrained(gemma_base_model, gemma_adapter_path)
print("Summarizer loaded.")

# --- 3. Run the Full Evaluation Pipeline ---

# Load the test data
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
test_slice = full_dataset["test"].select(range(200))

final_summaries = []
human_summaries = []

print("\nRunning full evaluation pipeline on 200 articles...")
for example in tqdm(test_slice):
    article = example['article']
    human_summary = example['highlights']
    human_summaries.append(human_summary)

    # Step A: Generate candidate summaries
    prompt = f"### Instruction:\nSummarize the following news article.\n\n### Input:\n{article}\n\n### Response:\n"
    input_ids = gemma_tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = gemma_model.generate(**input_ids, max_new_tokens=100, do_sample=True, top_k=50, num_return_sequences=5)

    candidate_summaries = []
    for i in range(5):
        generated_text = gemma_tokenizer.decode(outputs[i], skip_special_tokens=True)
        candidate_summaries.append(generated_text[len(prompt):].strip())

    # Step B: Rank the candidates
    formatted_for_ranker = [f"summarize: {article[:4000]} <sep> candidate: {cand}" for cand in candidate_summaries]
    ranking_predictions = ranker_pipe(formatted_for_ranker, top_k=None, truncation=True)

    # Step C: Select the best summary
    best_summary = ""
    highest_score = -1
    for i, result_pairs in enumerate(ranking_predictions):
        score_for_label_1 = 0
        for pair in result_pairs:
            if pair['label'] == 'LABEL_1':
                score_for_label_1 = pair['score']
                break
        if score_for_label_1 > highest_score:
            highest_score = score_for_label_1
            best_summary = candidate_summaries[i]

    final_summaries.append(best_summary)

# --- 4. Compute Final Scores ---
print("\nComputing final evaluation scores...")
rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=final_summaries, references=human_summaries)
bertscore_scores = bertscore.compute(predictions=final_summaries, references=human_summaries, lang="en")

print("\n--- Final System Evaluation Complete ---")
print("\nROUGE Scores:")
print(rouge_scores)
print("\nBERTScore (mean F1):")
print(sum(bertscore_scores['f1']) / len(bertscore_scores['f1']))

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading DeBERTa ranker...


Device set to use cuda:0


Ranker loaded.
Loading Gemma summarizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Summarizer loaded.

Running full evaluation pipeline on 200 articles...


  0%|          | 0/200 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  5%|▌         | 10/200 [01:03<20:34,  6.50s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 200/200 [23:03<00:00,  6.92s/it]



Computing final evaluation scores...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)



--- Final System Evaluation Complete ---

ROUGE Scores:
{'rouge1': np.float64(0.24952586468686005), 'rouge2': np.float64(0.06903678983361117), 'rougeL': np.float64(0.17417395987556095), 'rougeLsum': np.float64(0.2254208051938576)}

BERTScore (mean F1):
0.8640978878736496


In [3]:
!pip install -q -U transformers datasets peft bitsandbytes accelerate torch evaluate rouge_score bert_score sacrebleu

In [1]:
# Import all necessary libraries
from datasets import load_from_disk
from peft import PeftModel
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    pipeline,
    BitsAndBytesConfig
)
from google.colab import drive
import pandas as pd
import torch
import evaluate
from tqdm import tqdm

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# --- 2. Load Both Trained Models ---

# Load the DeBERTa Ranker
print("Loading DeBERTa ranker...")
ranker_path = "/content/drive/MyDrive/deberta_ranker_final_large_run/checkpoint-75"
ranker_model = AutoModelForSequenceClassification.from_pretrained(ranker_path)
ranker_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
ranker_pipe = pipeline("text-classification", model=ranker_model, tokenizer=ranker_tokenizer, device=0)
print("Ranker loaded.")

# Load the Gemma Summarizer
print("Loading Gemma summarizer...")
gemma_base_model_id = "google/gemma-2b-it"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
gemma_base_model = AutoModelForCausalLM.from_pretrained(gemma_base_model_id, quantization_config=bnb_config, device_map="auto")
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_base_model_id)
gemma_adapter_path = "/content/drive/MyDrive/gemma_summarizer_run/checkpoint-125"
gemma_model = PeftModel.from_pretrained(gemma_base_model, gemma_adapter_path)
print("Summarizer loaded.")

# --- 3. Run the Full Evaluation Pipeline ---

# Load the test data
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
test_slice = full_dataset["test"].select(range(200))

final_summaries = []
human_summaries = []

print("\nRunning full evaluation pipeline on 200 articles...")
for i, example in enumerate(tqdm(test_slice)):
    article = example['article']
    human_summary = example['highlights']
    human_summaries.append(human_summary)

    # Step A: Generate candidate summaries
    prompt = f"### Instruction:\nSummarize the following news article.\n\n### Input:\n{article}\n\n### Response:\n"
    input_ids = gemma_tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = gemma_model.generate(**input_ids, max_new_tokens=100, do_sample=True, top_k=50, num_return_sequences=3)

    candidate_summaries = []
    for j in range(3):
        generated_text = gemma_tokenizer.decode(outputs[j], skip_special_tokens=True)
        candidate_summaries.append(generated_text[len(prompt):].strip())

    # Step B: Rank the candidates
    formatted_for_ranker = [f"summarize: {article[:4000]} <sep> candidate: {cand}" for cand in candidate_summaries]
    ranking_predictions = ranker_pipe(formatted_for_ranker, top_k=None, truncation=True)

    # Step C: Select the best summary and create a ranked list for display
    best_summary = ""
    highest_score = -1
    ranking_results = []
    for k, result_pairs in enumerate(ranking_predictions):
        score_for_label_1 = 0
        for pair in result_pairs:
            if pair['label'] == 'LABEL_1':
                score_for_label_1 = pair['score']
                break
        ranking_results.append({
            "candidate_summary": candidate_summaries[k],
            "ranking_score": score_for_label_1
        })
        if score_for_label_1 > highest_score:
            highest_score = score_for_label_1
            best_summary = candidate_summaries[k]

    final_summaries.append(best_summary)

    # --- ADDED: Print details for the first 3 examples ---
    if i < 3:
        df_ranked = pd.DataFrame(ranking_results).sort_values(by="ranking_score", ascending=False)
        print(f"\n\n--- DETAILED RESULTS FOR EXAMPLE {i+1} ---")
        print(f"\nARTICLE (snippet): {article[:500]}...")
        print(f"\nHUMAN SUMMARY: {human_summary}")
        print("\nMODEL RANKING:")
        print(df_ranked)
        print("--------------------------------------------------")

# --- 4. Compute Final Scores ---
# (This part is the same as before, it will run after the loop finishes)
print("\nComputing final evaluation scores...")
rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=final_summaries, references=human_summaries)
bertscore_scores = bertscore.compute(predictions=final_summaries, references=human_summaries, lang="en")

print("\n--- Final System Evaluation Complete ---")
print("\nROUGE Scores:")
print(rouge_scores)
print("\nBERTScore (mean F1):")
print(sum(bertscore_scores['f1']) / len(bertscore_scores['f1']))

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading DeBERTa ranker...


Device set to use cuda:0


Ranker loaded.
Loading Gemma summarizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Summarizer loaded.

Running full evaluation pipeline on 200 articles...


  0%|          | 0/200 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 1/200 [00:08<28:11,  8.50s/it]



--- DETAILED RESULTS FOR EXAMPLE 1 ---

ARTICLE (snippet): (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, includin...

HUMAN SUMMARY: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

MODEL RANKING:
                                   candidate_summary  ranking_score
0  Palestinian Authority becomes 123rd member of ...       0.335777
2  Palestinian gains the power of

  1%|          | 2/200 [00:13<21:02,  6.38s/it]



--- DETAILED RESULTS FOR EXAMPLE 2 ---

ARTICLE (snippet): (CNN)Never mind cats having nine lives. A stray pooch in Washington State has used up at least three of her own after being hit by a car, apparently whacked on the head with a hammer in a misguided mercy killing and then buried in a field -- only to survive. That's according to Washington State University, where the dog -- a friendly white-and-black bully breed mix now named Theia -- has been receiving care at the Veterinary Teaching Hospital. Four days after her apparent death, the dog managed ...

HUMAN SUMMARY: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .

MODEL RANKING:
                                   candidate_summary  ranking_score
1  A stray pooch in Washington State has used up ...       0.337105
0  Theia survived a car-hit euthanasia and was bu...

  2%|▏         | 3/200 [00:21<22:54,  6.98s/it]



--- DETAILED RESULTS FOR EXAMPLE 3 ---

ARTICLE (snippet): (CNN)If you've been following the news lately, there are certain things you doubtless know about Mohammad Javad Zarif. He is, of course, the Iranian foreign minister. He has been U.S. Secretary of State John Kerry's opposite number in securing a breakthrough in nuclear discussions that could lead to an end to sanctions against Iran -- if the details can be worked out in the coming weeks. And he received a hero's welcome as he arrived in Iran on a sunny Friday morning. "Long live Zarif," crowds c...

HUMAN SUMMARY: Mohammad Javad Zarif has spent more time with John Kerry than any other foreign minister .
He once participated in a takeover of the Iranian Consulate in San Francisco .
The Iranian foreign minister tweets in English .

MODEL RANKING:
                                   candidate_summary  ranking_score
2  Mohammad Zarif is Iranian foreign minister .\n...       0.338938
1  Zarif returned Friday from Switzerland -- and

  5%|▌         | 10/200 [01:06<22:37,  7.14s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 200/200 [20:29<00:00,  6.15s/it]



Computing final evaluation scores...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)



--- Final System Evaluation Complete ---

ROUGE Scores:
{'rouge1': np.float64(0.260415991303552), 'rouge2': np.float64(0.06995316404465488), 'rougeL': np.float64(0.181548036977926), 'rougeLsum': np.float64(0.23611347187574794)}

BERTScore (mean F1):
0.865623531639576


## A100

In [2]:
!pip install -q -U transformers datasets peft bitsandbytes accelerate torch evaluate rouge_score bert_score sacrebleu

In [3]:
# Import all necessary libraries
import pandas as pd
import torch
import evaluate
import os
from datasets import load_from_disk
from peft import PeftModel
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    pipeline,
    BitsAndBytesConfig
)
from google.colab import drive

# Set pandas display option to see full text
pd.set_option('display.max_colwidth', None)

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# --- 2. Load Both of the Trained Models ---

# Load the DeBERTa Ranker
print("Loading DeBERTa ranker...")
ranker_path = "/content/drive/MyDrive/deberta_ranker_final_large_run/checkpoint-75"
ranker_model = AutoModelForSequenceClassification.from_pretrained(ranker_path)
ranker_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
ranker_pipe = pipeline("text-classification", model=ranker_model, tokenizer=ranker_tokenizer, device=0)
print("Ranker loaded.")

# Load the Gemma Summarizer
print("Loading Gemma summarizer...")
gemma_base_model_id = "google/gemma-2b-it"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
gemma_base_model = AutoModelForCausalLM.from_pretrained(gemma_base_model_id, quantization_config=bnb_config, device_map="auto")
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_base_model_id)
gemma_adapter_path = "/content/drive/MyDrive/gemma_summarizer_run/checkpoint-125"
gemma_model = PeftModel.from_pretrained(gemma_base_model, gemma_adapter_path)
summarizer_pipe = pipeline("text-generation", model=gemma_model, tokenizer=gemma_tokenizer)
print("Summarizer loaded.")

# --- 3. Run the EFFICIENT Evaluation Pipeline ---

# Load the test data
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
test_slice = full_dataset["test"].select(range(200))
human_summaries = test_slice["highlights"]

# Step A: Generate all candidate summaries in one batch
print("\nStep A: Generating all candidate summaries...")
prompts = [f"### Instruction:\nSummarize the following news article.\n\n### Input:\n{article}\n\n### Response:\n" for article in test_slice["article"]]
generated_outputs = summarizer_pipe(prompts, max_new_tokens=100, do_sample=True, top_k=50, num_return_sequences=3, batch_size=4)

# --- FIX IS HERE: Correctly process the nested list ---
# Step B: Prepare all candidates for the ranker
print("\nStep B: Preparing all candidates for ranking...")
all_candidates_for_ranking = []
grouped_candidates = []
for i, article_outputs in enumerate(generated_outputs):
    article = test_slice["article"][i]
    # article_outputs is a list of 3 generated summaries for a single prompt
    candidates = [output['generated_text'][len(prompts[i]):].strip() for output in article_outputs]
    grouped_candidates.append(candidates)
    for cand in candidates:
        all_candidates_for_ranking.append(f"summarize: {article[:4000]} <sep> candidate: {cand}")

# Step C: Rank all candidates in one batch
print("\nStep C: Ranking all candidates...")
ranking_predictions = ranker_pipe(all_candidates_for_ranking, top_k=None, truncation=True, batch_size=16)

# Step D: Select the best summary for each article
print("\nStep D: Selecting the best summary for each article...")
final_summaries = []
for i in range(len(test_slice)):
    article_scores = ranking_predictions[i*3 : i*3 + 3]

    best_summary = ""
    highest_score = -1
    for k, result_pairs in enumerate(article_scores):
        score_for_label_1 = 0
        for pair in result_pairs:
            if pair['label'] == 'LABEL_1':
                score_for_label_1 = pair['score']
                break
        if score_for_label_1 > highest_score:
            highest_score = score_for_label_1
            best_summary = grouped_candidates[i][k]
    final_summaries.append(best_summary)

# --- 4. Display a few detailed examples ---
print("\n\n--- DETAILED RESULTS FOR FIRST 3 EXAMPLES ---")
for i in range(3):
    print(f"\n--- EXAMPLE {i+1} ---")
    print(f"\nARTICLE (snippet): {test_slice[i]['article'][:500]}...")
    print(f"\nHUMAN SUMMARY: {human_summaries[i]}")

    ranking_results = []
    for k in range(3):
        score_for_label_1 = 0
        for pair in ranking_predictions[i*3+k]:
            if pair['label'] == 'LABEL_1':
                score_for_label_1 = pair['score']
                break
        ranking_results.append({
            "candidate_summary": grouped_candidates[i][k],
            "ranking_score": score_for_label_1
        })
    df_ranked = pd.DataFrame(ranking_results).sort_values(by="ranking_score", ascending=False)
    print("\nMODEL RANKING:")
    print(df_ranked)

# --- 5. Compute Final Scores ---
print("\n\n--- COMPUTING FINAL METRICS FOR ALL 200 EXAMPLES ---")
rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")
rouge_scores = rouge.compute(predictions=final_summaries, references=human_summaries)
bertscore_scores = bertscore.compute(predictions=final_summaries, references=human_summaries, lang="en")

print("\n--- Final System Evaluation Complete ---")
print("\nROUGE Scores:")
print(rouge_scores)
print("\nBERTScore (mean F1):")
print(sum(bertscore_scores['f1']) / len(bertscore_scores['f1']))

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading DeBERTa ranker...


Device set to use cuda:0


Ranker loaded.
Loading Gemma summarizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Summarizer loaded.

Step A: Generating all candidate summaries...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Step B: Preparing all candidates for ranking...

Step C: Ranking all candidates...

Step D: Selecting the best summary for each article...


--- DETAILED RESULTS FOR FIRST 3 EXAMPLES ---

--- EXAMPLE 1 ---

ARTICLE (snippet): (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, includin...

HUMAN SUMMARY: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

MODEL RANKING:
   

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)



--- Final System Evaluation Complete ---

ROUGE Scores:
{'rouge1': np.float64(0.291445523039525), 'rouge2': np.float64(0.09619618859473844), 'rougeL': np.float64(0.20739619889382882), 'rougeLsum': np.float64(0.2679755288271799)}

BERTScore (mean F1):
0.8699244344234467


## Best Gemma Fine-Tuned Model (Run 1)

In [2]:
!pip install -q -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q -U transformers datasets peft bitsandbytes accelerate evaluate rouge_score bert_score sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.6/875.6 kB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m132.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m663.9/663.9 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.4/168.4 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.1/58.1 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.2/128.2 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [1]:
# Install the required evaluation libraries first
# !pip install -q -U evaluate rouge_score bert_score sacrebleu
# !pip install -q -U transformers datasets peft bitsandbytes accelerate torch evaluate rouge_score bert_score sacrebleu

# Import all necessary libraries
from datasets import load_from_disk
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from google.colab import drive
import torch
import evaluate
from tqdm import tqdm

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2. Load best fine-tuned Gemma model and tokenizer
model_path = "/content/drive/MyDrive/gemma_summarizer_run/checkpoint-125"
print(f"Loading fine-tuned model from: {model_path}")
base_model_id = "google/gemma-2b-it"

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
model = PeftModel.from_pretrained(base_model, model_path)
print("Model loaded successfully.")

# 3. Load the dataset and get the 200-example test slice
print("Loading CNN/DailyMail dataset...")
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
test_slice = full_dataset["test"].select(range(200))

# 4. Generate summaries for the test set
print("\nGenerating summaries for the test set...")
model_summaries = []
human_summaries = test_slice["highlights"]

for article in tqdm(test_slice["article"]):
    prompt = f"""### Instruction:
Summarize the following news article.

### Input:
{article}

### Response:
"""
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to("cuda")

    with torch.no_grad():
        outputs = model.generate(**input_ids, max_new_tokens=128)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        model_summary = generated_text[len(prompt):].strip()
        model_summaries.append(model_summary)

# 5. Compute the final metrics
print("\nComputing evaluation scores...")
rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=model_summaries, references=human_summaries)
bertscore_scores = bertscore.compute(predictions=model_summaries, references=human_summaries, lang="en")

print("\n--- Final Evaluation Complete (Best Summarizer) ---")
print("\nROUGE Scores:")
print(rouge_scores)
print("\nBERTScore (mean F1):")
print(sum(bertscore_scores['f1']) / len(bertscore_scores['f1']))

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading fine-tuned model from: /content/drive/MyDrive/gemma_summarizer_run/checkpoint-125


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Model loaded successfully.
Loading CNN/DailyMail dataset...

Generating summaries for the test set...


100%|██████████| 200/200 [14:28<00:00,  4.34s/it]



Computing evaluation scores...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)



--- Final Evaluation Complete (Best Summarizer) ---

ROUGE Scores:
{'rouge1': np.float64(0.2456820741991288), 'rouge2': np.float64(0.09364645168142922), 'rougeL': np.float64(0.18534183708885038), 'rougeLsum': np.float64(0.22872552838198015)}

BERTScore (mean F1):
0.7019715610146523




## Gemma Fine-Tuned Large Data

In [3]:
# Import all necessary libraries
from datasets import load_from_disk
from peft import PeftModel
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from google.colab import drive
import torch
import evaluate
from tqdm import tqdm

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2. Load SECOND fine-tuned Gemma model and tokenizer
model_path = "/content/drive/MyDrive/gemma_summarizer_final_run/checkpoint-375" # Path to the second model
print(f"Loading fine-tuned model from: {model_path}")
base_model_id = "google/gemma-2b-it"

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
model = PeftModel.from_pretrained(base_model, model_path)
print("Model loaded successfully.")

# 3. Load the dataset and get the 200-example test slice
print("Loading CNN/DailyMail dataset...")
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
test_slice = full_dataset["test"].select(range(200))

# 4. Generate summaries for the test set
print("\nGenerating summaries for the test set...")
model_summaries = []
human_summaries = test_slice["highlights"]

for article in tqdm(test_slice["article"]):
    prompt = f"""### Instruction:
Summarize the following news article.

### Input:
{article}

### Response:
"""
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to("cuda")

    with torch.no_grad():
        outputs = model.generate(**input_ids, max_new_tokens=128)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        model_summary = generated_text[len(prompt):].strip()
        model_summaries.append(model_summary)

# 5. Compute the final metrics
print("\nComputing evaluation scores...")
rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=model_summaries, references=human_summaries)
bertscore_scores = bertscore.compute(predictions=model_summaries, references=human_summaries, lang="en")

print("\n--- Final Evaluation Complete (Second Summarizer) ---")
print("\nROUGE Scores:")
print(rouge_scores)
print("\nBERTScore (mean F1):")
print(sum(bertscore_scores['f1']) / len(bertscore_scores['f1']))

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading fine-tuned model from: /content/drive/MyDrive/gemma_summarizer_final_run/checkpoint-375


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully.
Loading CNN/DailyMail dataset...

Generating summaries for the test set...


100%|██████████| 200/200 [26:22<00:00,  7.91s/it]



Computing evaluation scores...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Final Evaluation Complete (Second Summarizer) ---

ROUGE Scores:
{'rouge1': np.float64(0.18977487107271068), 'rouge2': np.float64(0.07368570173438957), 'rougeL': np.float64(0.13738359748958467), 'rougeLsum': np.float64(0.17706809334988868)}

BERTScore (mean F1):
0.6889790478348732




## SOCRATIC + DeBERTa Ranker

In [3]:
!pip install -q -U transformers datasets peft torch evaluate rouge_score bert_score sacrebleu
!pip install -q -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [4]:
# Import all necessary libraries
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    pipeline
)
from google.colab import drive
import torch
import evaluate
from tqdm import tqdm
import json

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# --- 2. Load the Two Models ---

# Load DeBERTa Ranker
print("Loading DeBERTa ranker...")
ranker_path = "/content/drive/MyDrive/deberta_ranker_final_large_run/checkpoint-75"
ranker_model = AutoModelForSequenceClassification.from_pretrained(ranker_path)
ranker_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
ranker_pipe = pipeline("text-classification", model=ranker_model, tokenizer=ranker_tokenizer, device=0)
print("Ranker loaded.")

# Load the SOCRATIC Summarizer
print("Loading SOCRATIC summarizer...")
socratic_model_id = "Salesforce/socratic-pretraining-qmsum"
socratic_tokenizer = AutoTokenizer.from_pretrained(socratic_model_id)
socratic_model = AutoModelForSeq2SeqLM.from_pretrained(socratic_model_id, device_map="auto")
summarizer_pipe = pipeline("summarization", model=socratic_model, tokenizer=socratic_tokenizer)
print("Summarizer loaded.")

# --- 3. Run the Full Evaluation Pipeline on SQuALITY ---

# Clone the repo and load the local file
print("Cloning SQuALITY repository...")
!git clone https://github.com/nyu-mll/SQuALITY.git

print("Loading SQuALITY dataset from local file...")
def load_squality_from_jsonl(path):
    data = []
    with open(path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

squality_data = load_squality_from_jsonl("/content/SQuALITY/data/v1-1/test.jsonl")
test_slice = squality_data[:200]

final_summaries = []
human_summaries = [ex["questions"][0]["responses"][0]["response_text"] for ex in test_slice]

print("\nRunning full evaluation pipeline...")
for example in tqdm(test_slice):
    article = example['document']

    # --- THE FIX IS HERE: Manually truncate the article ---
    # The T5 model family used by SOCRATIC has a max length of 1024 tokens
    input_text = "question: What is the summary? context: " + article[:10000] # Truncate long articles

    # Step A: Generate candidate summaries
    candidate_summaries = summarizer_pipe(
        input_text,
        max_length=128,
        num_beams=4,
        num_return_sequences=3,
        early_stopping=True,
        truncation=True # Also add truncation here as a safeguard
    )
    candidate_summaries = [s['summary_text'] for s in candidate_summaries]

    # Step B: Rank the candidates
    formatted_for_ranker = [f"summarize: {article[:4000]} <sep> candidate: {cand}" for cand in candidate_summaries]
    ranking_predictions = ranker_pipe(formatted_for_ranker, top_k=None, truncation=True)

    # Step C: Select the best summary
    best_summary = ""
    highest_score = -1
    for i, result_pairs in enumerate(ranking_predictions):
        score_for_label_1 = 0
        for pair in result_pairs:
            if pair['label'] == 'LABEL_1':
                score_for_label_1 = pair['score']
                break
        if score_for_label_1 > highest_score:
            highest_score = score_for_label_1
            best_summary = candidate_summaries[i]

    final_summaries.append(best_summary)

# --- 4. Compute Final Scores ---
print("\nComputing final evaluation scores...")
!pip install -q -U evaluate rouge_score bert_score sacrebleu

rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=final_summaries, references=human_summaries)
bertscore_scores = bertscore.compute(predictions=final_summaries, references=human_summaries, lang="en")

print("\n--- Final System Evaluation Complete ---")
print("\nROUGE Scores:")
print(rouge_scores)
print("\nBERTScore (mean F1):")
print(sum(bertscore_scores['f1']) / len(bertscore_scores['f1']))

Mounting Google Drive...
Mounted at /content/drive
Loading your DeBERTa ranker...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Device set to use cuda:0


Ranker loaded.
Loading SOCRATIC summarizer...


tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Device set to use cuda:0


Summarizer loaded.
Cloning SQuALITY repository...
Cloning into 'SQuALITY'...
remote: Enumerating objects: 136, done.[K
remote: Counting objects: 100% (136/136), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 136 (delta 58), reused 102 (delta 30), pack-reused 0 (from 0)[K
Receiving objects: 100% (136/136), 4.18 MiB | 2.62 MiB/s, done.
Resolving deltas: 100% (58/58), done.
Loading SQuALITY dataset from local file...

Running full evaluation pipeline...



  0%|          | 0/52 [00:00<?, ?it/s][ABoth `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

  2%|▏         | 1/52 [00:04<03:42,  4.37s/it][ABoth `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)

  4%|▍         | 2/52 [00:07<03:11,  3.82s/it][ABoth `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/


Computing final evaluation scores...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)



--- Final System Evaluation Complete ---

ROUGE Scores:
{'rouge1': np.float64(0.21385455326753444), 'rouge2': np.float64(0.03547305073800712), 'rougeL': np.float64(0.1151837194765627), 'rougeLsum': np.float64(0.15829336743372246)}

BERTScore (mean F1):
0.807820221552482


## Baseline Evaluation

In [7]:
!pip install -q -U bitsandbytes transformers datasets trl peft accelerate torch evaluate rouge_score bert_score sacrebleu

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/504.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.6/504.6 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
# Install all necessary libraries
# !pip install -q -U transformers datasets peft bitsandbytes accelerate torch evaluate rouge_score bert_score sacrebleu

# Import all necessary libraries
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from google.colab import drive
import torch
import evaluate
from tqdm import tqdm

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2. Load the BASE Gemma model and tokenizer
model_id = "google/gemma-2b-it"
print(f"Loading base model: {model_id}...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
print("Base model loaded successfully.")

# 3. Load the dataset and get the 200-example test slice
print("Loading CNN/DailyMail dataset...")
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
test_slice = full_dataset["test"].select(range(200))

# 4. Generate summaries for the test set
print("\nGenerating summaries for the test set...")
model_summaries = []
human_summaries = test_slice["highlights"]

for article in tqdm(test_slice["article"]):
    prompt = f"""### Instruction:
Summarize the following news article.

### Input:
{article}

### Response:
"""
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to("cuda")

    with torch.no_grad():
        outputs = model.generate(**input_ids, max_new_tokens=128)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        model_summary = generated_text[len(prompt):].strip()
        model_summaries.append(model_summary)

# 5. Compute the final metrics
print("\nComputing evaluation scores...")
rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=model_summaries, references=human_summaries)
bertscore_scores = bertscore.compute(predictions=model_summaries, references=human_summaries, lang="en")

print("\n--- Baseline Evaluation Complete (Base Gemma) ---")
print("\nROUGE Scores:")
print(rouge_scores)
print("\nBERTScore (mean F1):")
print(sum(bertscore_scores['f1']) / len(bertscore_scores['f1']))

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading base model: google/gemma-2b-it...


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Base model loaded successfully.
Loading CNN/DailyMail dataset...

Generating summaries for the test set...


100%|██████████| 200/200 [10:38<00:00,  3.19s/it]



Computing evaluation scores...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)



--- Baseline Evaluation Complete (Base Gemma) ---

ROUGE Scores:
{'rouge1': np.float64(0.22653252341957597), 'rouge2': np.float64(0.08535707179839563), 'rougeL': np.float64(0.16213170144934191), 'rougeLsum': np.float64(0.19392150548146952)}

BERTScore (mean F1):
0.6767627251148224


