## 2321_07302025

In [None]:
!pip install -q -U transformers datasets peft trl bitsandbytes accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m129.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.6/504.6 kB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import all necessary libraries
import pickle
import pandas as pd
from datasets import load_from_disk, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    BitsAndBytesConfig
)
from peft import PeftModel
from google.colab import drive
import torch
import os

# 1. Mount Google Drive
# The adapter checkpoint and the dataset below are both read from Drive paths.
print("Mounting Google Drive...")
drive.mount("/content/drive")

# --- STAGE 2: GENERATE A LARGER RANKING DATASET ---

# 2. Load BEST fine-tuned Gemma summarizer
print("Loading fine-tuned Gemma summarizer...")
gemma_base_model_id = "google/gemma-2b-it"

# 4-bit NF4 quantization with bfloat16 compute for the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
gemma_base_model = AutoModelForCausalLM.from_pretrained(
    gemma_base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
# The tokenizer is taken from the base model id, not from the adapter folder.
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_base_model_id)

# Correct path to the best performing model from the first run
gemma_adapter_path = "/content/drive/MyDrive/gemma_summarizer_run/checkpoint-125"
gemma_model = PeftModel.from_pretrained(
    gemma_base_model,
    gemma_adapter_path,
)
print("Summarizer loaded.")

# 3. Load the original CNN/DailyMail dataset
print("Loading original CNN/DailyMail dataset...")
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
# Only the first 500 validation articles are used to build the ranking data.
first_500_rows = range(500)
data_slice = full_dataset["validation"].select(first_500_rows)

# 4. Generate candidate summaries and create the ranking data
# Every article contributes one positive example (the human-written highlights,
# label 1) and `num_generated_per_article` negatives (sampled Gemma summaries,
# label 0). The "text" layout is "summarize: <article> <sep> candidate: <summary>".
num_generated_per_article = 2
expected_examples = len(data_slice) * (1 + num_generated_per_article)
print(f"\nGenerating a larger ranking dataset of {expected_examples:,} examples...")
ranker_data = []
for example in data_slice:
    article = example['article']
    human_summary = example['highlights']
    # Only the first 4000 characters of the article go into the ranker text;
    # the summarizer prompt below still receives the full article.
    ranker_article = article[:4000]
    ranker_data.append({"text": f"summarize: {ranker_article} <sep> candidate: {human_summary}", "label": 1})

    prompt = f"### Instruction:\nSummarize the following news article.\n\n### Input:\n{article}\n\n### Response:\n"
    # Move the inputs to the device the model was actually placed on
    # (it is loaded with device_map="auto") instead of hard-coding "cuda".
    model_inputs = gemma_tokenizer(prompt, return_tensors="pt").to(gemma_model.device)
    prompt_token_count = model_inputs["input_ids"].shape[1]
    outputs = gemma_model.generate(
        **model_inputs,
        max_new_tokens=100,
        do_sample=True,
        top_k=50,
        num_return_sequences=num_generated_per_article,
    )

    for sequence in outputs:
        # Each returned sequence starts with the prompt tokens. Drop them by
        # token count rather than by len(prompt) characters, so the result does
        # not depend on decode() reproducing the prompt text exactly.
        generated_summary = gemma_tokenizer.decode(
            sequence[prompt_token_count:], skip_special_tokens=True
        ).strip()
        ranker_data.append({"text": f"summarize: {ranker_article} <sep> candidate: {generated_summary}", "label": 0})
print(f"Created {len(ranker_data)} examples for the ranker.")


# --- STAGE 3: TRAIN THE RANKER ON THE NEW DATASET ---

# 5. Convert to a Hugging Face Dataset and split
import random

full_ranker_dataset = Dataset.from_list(ranker_data)

# Split by ARTICLE rather than by example. Each article produces several rows
# (one human summary plus generated ones); a per-row random split puts rows of
# the same article on both sides, so the eval loss is measured on articles the
# model has already seen during training.
candidate_marker = " <sep> candidate: "
article_keys = [text.split(candidate_marker, 1)[0] for text in full_ranker_dataset["text"]]

# Sort before shuffling so the seeded shuffle is reproducible across runs.
unique_articles = sorted(set(article_keys))
random.Random(42).shuffle(unique_articles)

# Hold out 20% of the articles (at least one) for evaluation.
num_eval_articles = max(1, round(0.2 * len(unique_articles)))
eval_articles = set(unique_articles[:num_eval_articles])

train_indices = [i for i, key in enumerate(article_keys) if key not in eval_articles]
eval_indices = [i for i, key in enumerate(article_keys) if key in eval_articles]

# Same 'train' / 'test' keys as before, so downstream lookups are unchanged.
train_test_split = {
    "train": full_ranker_dataset.select(train_indices),
    "test": full_ranker_dataset.select(eval_indices),
}
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
print(f"Created a training set with {len(train_dataset)} examples.")
print(f"Created an evaluation set with {len(eval_dataset)} examples.")

# 6. Tokenize the datasets
ranker_model_name = "microsoft/deberta-v3-base"
ranker_tokenizer = AutoTokenizer.from_pretrained(ranker_model_name)

# Separator written between the article part and the candidate part of "text".
CANDIDATE_MARKER = " <sep> candidate: "


def tokenize_function(batch):
    """Tokenize a batch of ranker examples as (article, candidate) pairs.

    Each "text" has the form "summarize: <article> <sep> candidate: <summary>".
    Encoding it as one sequence with right-truncation at 512 tokens removes the
    END of the string, i.e. the candidate, whenever the article is long; the
    classifier then never sees the text it is supposed to judge. Here the
    string is split at the marker and encoded as a pair with
    ``truncation="only_first"`` so only the article part is shortened and the
    candidate is always kept.

    Args:
        batch: Mapping with a "text" list, as passed by ``Dataset.map`` with
            ``batched=True``.

    Returns:
        The tokenizer's batch encoding for the pairs.
    """
    article_parts = []
    candidate_parts = []
    for text in batch["text"]:
        article_part, marker, candidate = text.partition(CANDIDATE_MARKER)
        article_parts.append(article_part)
        # Without a marker the whole text is treated as the article part.
        candidate_parts.append(f"candidate: {candidate}" if marker else "")
    # NOTE(review): code that scores candidates with this ranker at inference
    # time should use the same pair encoding — confirm against the callers.
    return ranker_tokenizer(
        article_parts,
        candidate_parts,
        truncation="only_first",
        max_length=512,
    )


print(f"Tokenizing datasets with {ranker_model_name} tokenizer...")
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# 7. Initialize the Data Collator, Model, and Trainer
# Padding is applied per batch by the collator; tokenization did not pad.
data_collator = DataCollatorWithPadding(tokenizer=ranker_tokenizer)
# Two output labels: 1 = human-written summary, 0 = generated summary.
ranker_model = AutoModelForSequenceClassification.from_pretrained(
    ranker_model_name,
    num_labels=2,
)

# Checkpoints are written to Drive once per epoch, and evaluation runs on the
# same schedule, which load_best_model_at_end relies on.
ranker_training_options = {
    "output_dir": "/content/drive/MyDrive/deberta_ranker_final_large_run",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "logging_steps": 20,
    "save_strategy": "epoch",
    "eval_strategy": "epoch",
    "load_best_model_at_end": True,
    "fp16": True,
    "report_to": "none",
}
training_args = TrainingArguments(**ranker_training_options)

trainer = Trainer(
    model=ranker_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

# 8. Start the final training run
print("\nStarting final ranker training...")
trainer.train()

print("\n--- Final Ranker training complete! ---")

Mounting Google Drive...
Mounted at /content/drive
Loading fine-tuned Gemma summarizer...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Summarizer loaded.
Loading original CNN/DailyMail dataset...

Generating a larger ranking dataset of 1,500 examples...
Created 1500 examples for the ranker.
Created a training set with 1200 examples.
Created an evaluation set with 300 examples.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Tokenizing datasets with microsoft/deberta-v3-base tokenizer...


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting final ranker training...


Epoch,Training Loss,Validation Loss
1,0.6382,0.652259
2,0.6732,0.654242
3,0.623,0.654196


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]


--- Final Ranker training complete! ---


## Simple Ranking test

In [None]:
# Import all necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from google.colab import drive
import pandas as pd
import torch

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount("/content/drive")

# 2. Define the path to best checkpoint
# Based on training log, the best model was from epoch 1 (step 75).
model_name = "microsoft/deberta-v3-base"
model_path = "/content/drive/MyDrive/deberta_ranker_final_large_run/checkpoint-75"
print(f"Loading model from: {model_path}")

# 3. Load fine-tuned model and the tokenizer
# Weights come from the checkpoint folder; the tokenizer is loaded from the
# base model name instead.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model and tokenizer loaded successfully.")

# 4. Create the prediction pipeline
# device=0 selects the first GPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0,
)
print("Prediction pipeline created.")

# 5. Set up the test case
question = "What is the capital of France?"
document = "Paris is the capital and most populous city of France."
candidate_A = "The capital of France is Paris." # Correct
candidate_B = "Paris is a major global center for art." # Partially correct
candidate_C = "The capital is Berlin." # Incorrect
candidates = [candidate_A, candidate_B, candidate_C]

# Format the inputs as (document, candidate) pairs. With pair input the
# tokenizer can truncate only the document side, so a long document can never
# push the candidate out of the model's input.
# NOTE(review): scores are only meaningful if the ranker checkpoint was
# fine-tuned with the same pair encoding — confirm against the training code.
ranker_inputs = [
    {"text": f"summarize: {document}", "text_pair": f"candidate: {candidate}"}
    for candidate in candidates
]

# 6. Get predictions from the pipeline
print("\nGetting model predictions...")
# truncation=True without max_length is ignored for this tokenizer ("Default
# to no truncation" warning), so the limit is passed explicitly.
predictions = pipe(ranker_inputs, top_k=None, truncation="only_first", max_length=512)

# 7. Process the results to create a ranked list
# With top_k=None each prediction is a list of {label, score} dicts; the
# ranking score is the probability assigned to LABEL_1 (0 if it is absent).
ranking_results = [
    {
        "candidate": candidate,
        "ranking_score (LABEL_1)": next(
            (pair['score'] for pair in result_pairs if pair['label'] == 'LABEL_1'),
            0,
        ),
    }
    for candidate, result_pairs in zip(candidates, predictions)
]

# 8. Display the final ranked list
df_ranked = pd.DataFrame(ranking_results)
df_ranked = df_ranked.sort_values(by="ranking_score (LABEL_1)", ascending=False)

print("\n--- Final Ranked Results ---")
print(df_ranked)

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading model from: /content/drive/MyDrive/deberta_ranker_final_large_run/checkpoint-75


Device set to use cuda:0


Model and tokenizer loaded successfully.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prediction pipeline created.

Getting model predictions...

--- Final Ranked Results ---
                                 candidate  ranking_score (LABEL_1)
0          The capital of France is Paris.                 0.351275
2                   The capital is Berlin.                 0.350848
1  Paris is a major global center for art.                 0.349894


## Evaluation

In [None]:
!pip install -q -U evaluate transformers datasets peft trl bitsandbytes accelerate rouge_score bert_score sacrebleu

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
# Import all necessary libraries
from datasets import load_from_disk
from peft import PeftModel
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    pipeline,
    BitsAndBytesConfig
)
from google.colab import drive
import pandas as pd
import torch
import evaluate
from tqdm import tqdm

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount("/content/drive")

# --- 2. Load Both Trained Models ---

# Load the DeBERTa Ranker
# Weights come from the fine-tuned checkpoint; the tokenizer is loaded from
# the base model name.
print("Loading DeBERTa ranker...")
ranker_path = "/content/drive/MyDrive/deberta_ranker_final_large_run/checkpoint-75"
ranker_model = AutoModelForSequenceClassification.from_pretrained(ranker_path)
ranker_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
ranker_pipe = pipeline(
    "text-classification",
    model=ranker_model,
    tokenizer=ranker_tokenizer,
    device=0,
)
print("Ranker loaded.")

# Load the Gemma Summarizer
# 4-bit NF4 quantized base model with the fine-tuned adapter applied on top.
print("Loading Gemma summarizer...")
gemma_base_model_id = "google/gemma-2b-it"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
gemma_base_model = AutoModelForCausalLM.from_pretrained(
    gemma_base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_base_model_id)
gemma_adapter_path = "/content/drive/MyDrive/gemma_summarizer_run/checkpoint-125"
gemma_model = PeftModel.from_pretrained(
    gemma_base_model,
    gemma_adapter_path,
)
print("Summarizer loaded.")

# --- 3. Run the Full Evaluation Pipeline ---

# Load the test data
full_dataset = load_from_disk("/content/drive/MyDrive/cnn_dailymail_dataset")
test_slice = full_dataset["test"].select(range(200)) # Use 200 examples for a robust evaluation

# Number of sampled summaries the ranker chooses between for each article.
num_candidates = 3

final_summaries = []
human_summaries = []

print(f"\nRunning full evaluation pipeline on {len(test_slice)} articles...")
for example in tqdm(test_slice):
    article = example['article']
    human_summary = example['highlights']
    human_summaries.append(human_summary)

    # Step A: Generate candidate summaries with Gemma
    prompt = f"### Instruction:\nSummarize the following news article.\n\n### Input:\n{article}\n\n### Response:\n"
    # Use the device the model was placed on (device_map="auto") rather than
    # a hard-coded "cuda".
    model_inputs = gemma_tokenizer(prompt, return_tensors="pt").to(gemma_model.device)
    prompt_token_count = model_inputs["input_ids"].shape[1]
    outputs = gemma_model.generate(
        **model_inputs,
        max_new_tokens=100,
        do_sample=True,
        top_k=50,
        num_return_sequences=num_candidates,
    )

    # Each returned sequence starts with the prompt tokens; drop them by token
    # count so the result does not depend on decode() reproducing the prompt
    # text character-for-character.
    candidate_summaries = [
        gemma_tokenizer.decode(sequence[prompt_token_count:], skip_special_tokens=True).strip()
        for sequence in outputs
    ]

    # Step B: Rank the candidates with DeBERTa
    # Pair input + truncation="only_first" shortens only the article side, so
    # the candidate always reaches the model. truncation=True alone had no
    # effect here because no max_length was available ("Default to no
    # truncation" warning), hence the explicit max_length.
    # NOTE(review): scores are only meaningful if the ranker checkpoint was
    # fine-tuned with the same pair encoding — confirm against the training code.
    formatted_for_ranker = [
        {"text": f"summarize: {article[:4000]}", "text_pair": f"candidate: {cand}"}
        for cand in candidate_summaries
    ]
    ranking_predictions = ranker_pipe(
        formatted_for_ranker,
        top_k=None,
        truncation="only_first",
        max_length=512,
    )

    # Step C: Select the best summary
    # The score of a candidate is the probability of LABEL_1 (0 if absent).
    candidate_scores = [
        next((pair['score'] for pair in result_pairs if pair['label'] == 'LABEL_1'), 0)
        for result_pairs in ranking_predictions
    ]
    # max() keeps the first index on ties, matching a strict ">" comparison.
    best_index = max(range(len(candidate_scores)), key=candidate_scores.__getitem__)
    highest_score = candidate_scores[best_index]
    best_summary = candidate_summaries[best_index]

    final_summaries.append(best_summary)

# --- 4. Compute Final Scores ---
print("\nComputing final evaluation scores...")
# No pip install here: `evaluate` is already imported at the top of this cell,
# and upgrading packages after they have been imported can leave the running
# kernel out of sync with what is on disk. The metric backends (rouge_score,
# bert_score) are installed by the notebook's install cell.

rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")

# Both metrics compare the ranker-selected summaries with the human highlights.
rouge_scores = rouge.compute(predictions=final_summaries, references=human_summaries)
bertscore_scores = bertscore.compute(predictions=final_summaries, references=human_summaries, lang="en")

# Guard the mean against an empty result list instead of dividing by zero.
bertscore_f1_values = bertscore_scores['f1']
mean_bertscore_f1 = (
    sum(bertscore_f1_values) / len(bertscore_f1_values)
    if bertscore_f1_values
    else float("nan")
)

print("\n--- Final System Evaluation Complete ---")
print("\nROUGE Scores:")
# Convert to plain floats so the printout is not cluttered with np.float64(...).
print({name: float(value) for name, value in rouge_scores.items()})
print("\nBERTScore (mean F1):")
print(mean_bertscore_f1)

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading DeBERTa ranker...


Device set to use cuda:0


Ranker loaded.
Loading Gemma summarizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Summarizer loaded.

Running full evaluation pipeline on 200 articles...


  0%|          | 0/200 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  5%|▌         | 10/200 [01:04<21:58,  6.94s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 200/200 [23:41<00:00,  7.11s/it]



Computing final evaluation scores...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)



--- Final System Evaluation Complete ---

ROUGE Scores:
{'rouge1': np.float64(0.2518837597657518), 'rouge2': np.float64(0.06838708094994625), 'rougeL': np.float64(0.17651048289333385), 'rougeLsum': np.float64(0.22997656412787987)}

BERTScore (mean F1):
0.8647138279676437
