# Load Dataset

In [1]:
from tqdm.auto import tqdm

In [2]:
from datasets import load_dataset

df = load_dataset("ai4bharat/IndicHeadlineGeneration", "ml")

In [3]:
df = df["test"].select(range(500))
print(df[:1])

{'id': ['1'], 'input': ['ദില്ലി: ദില്ലിയില് പെണ്കുട്ടിയുടെ മുടി മുറിച്ചത് സഹോദരന്മാരാണെന്ന് പോലീസ് കണ്ടെത്തിയതിനു പിന്നാലെ കഴിഞ്ഞ 12 മണിക്കൂറിനിടെ ദില്ലിയില് വീണ്ടും റിപ്പോര്ട്ട് ചെയ്യപ്പെട്ടത് സമാനമായ 4 കേസുകള്.ഇതില് രണ്ടു പേര് പോലീസില് കേസ് രജിസ്റ്റര് ചെയ്തിട്ടുണ്ട്.ഉറങ്ങിയെഴുന്നേറ്റപ്പോള് മൂന്നിഞ്ച് നീളത്തില് മുടി നഷ്ടപ്പെട്ടതായി പരാതി നല്കിയ സ്ത്രീകളിലൊരാള് പോലീസിനെ അറിയിച്ചു.സംഭവത്തില് പോലീസ് അന്വേഷണമാരംഭിച്ചിട്ടുണ്ട്.എന്നാല് ഇതിനു പിന്നില് യുക്തപരമായ ഒരു ന്യായീകരണം നല്കാന് ഇപ്പോള് കഴിയില്ലെന്നും പോലീസ് പറയുന്നു.ദില്ലിയില് ഉറങ്ങിക്കിടന്ന 14 കാരിയുടെ മുടി മുറിച്ചത് സഹോദരന്മാരാണെന്നാണ് പോലീസ് കണ്ടെത്തിയത്.ഇതോടെ ദില്ലിയില് ഇത്തരത്തില് റിപ്പോര്ട്ട് ചെയ്യപ്പെട്ട ആദ്യ കേസിന്റെ ചുരുളഴിയുകയും ചെയ്തു.സംഭവത്തിനു പിന്നില് അസ്വാഭാവികത ഒന്നും ഇല്ലെന്നാണ് പോലീസ് പറയുന്നത്.പ്രേതത്തിന്റെ പേരും പറഞ്ഞ് ജനങ്ങള്ക്കിടയില് ഭീതി പരത്തരുതെന്നും പോലീസ് അറിയിച്ചു.എന്നാല് ഉറങ്ങിക്കിടക്കുന്ന സ്ത്രീകളുടെ മുടി പിറ്റേന്ന് ഉറങ്ങിയെഴുന്നേല്ക്കുമ്പോള് മുറിക്കപ്പെട്ട രീതിയില് കാണപ്പെടുന്ന അസാധാരണ സംഭവം ഇന്ത്യയിലെ മ

In [4]:
texts = df['input']
references = df['target']

# Compute Metrics

In [9]:
import numpy as np
import evaluate
import pyiwn

# Load evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

# Initialize IndoWordNet for Malayalam
iwn = pyiwn.IndoWordNet(lang=pyiwn.Language.MALAYALAM)

def expand_synonyms(word):
    """
    Expand a single word into its synonym set using IndoWordNet.
    If no synsets found, return the word itself.
    """
    try:
        synsets = iwn.synsets(word)
    except KeyError:
        return {word}

    if not synsets:
        return {word}

    synonyms = set()
    for syn in synsets:
        synonyms.update(syn.lemma_names())
    return synonyms if synonyms else {word}

def sentence_to_synonym_sets(sentence):
    """
    Convert a sentence into a list of synonym sets per word.
    """
    return [expand_synonyms(w) for w in sentence.split()]

def synonym_overlap(pred, ref):
    """
    Computes the proportion of words in the predicted headline 
    that have synonym matches in the reference headline.
    """
    pred_sets = sentence_to_synonym_sets(pred)
    ref_sets = sentence_to_synonym_sets(ref)

    # Precision: how many predicted words match reference
    match_pred = sum(1 for ps in pred_sets if any(ps & rs for rs in ref_sets))
    precision = match_pred / len(pred_sets) if pred_sets else 0.0

    # Recall: how many reference words are found in prediction
    match_ref = sum(1 for rs in ref_sets if any(rs & ps for ps in pred_sets))
    recall = match_ref / len(ref_sets) if ref_sets else 0.0

    # F1 Score
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return round(f1, 4)

2025-07-03:11:49:12,233 INFO     [iwn.py:43] Loading malayalam language synsets...


In [10]:
import evaluate
import numpy as np

# Load metrics once
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

def compute_metrics(predictions, references, rouge_stemmer=True):
    """
    Computes ROUGE, BLEU, and BERTScore for given predictions and references.
    
    Args:
        predictions (list): List of predicted texts.
        references (list): List of reference texts.
        rouge_stemmer (bool): Whether to use stemming for ROUGE.
    
    Returns:
        dict: ROUGE, BLEU, and BERTScore metrics rounded to four decimal places.
    """
    
    # Compute ROUGE scores
    rouge_result = rouge.compute(predictions=predictions, references=references, use_stemmer=rouge_stemmer)
    
    # Compute BLEU scores
    bleu_result = bleu.compute(
        predictions=predictions,
        references=[[r] for r in references]
    )

    # Compute BERTScore (for Malayalam use lang="ml")
    bert_result = bertscore.compute(
        predictions=predictions,
        references=references,
        lang="ml"  # Malayalam
    )

     # Synonym-aware Rouge
    SynROUGE = round(np.mean([synonym_overlap(p, r) for p, r in zip(predictions, references)]), 4)

    return {
        "ROUGE-1": round(rouge_result['rouge1'], 4),
        "ROUGE-2": round(rouge_result['rouge2'], 4),
        "ROUGE-L": round(rouge_result['rougeL'], 4),
        "BLEU": round(bleu_result["bleu"], 4),
        "BERTScore-P": round(np.mean(bert_result["precision"]), 4),
        "BERTScore-R": round(np.mean(bert_result["recall"]), 4),
        "BERTScore-F1": round(np.mean(bert_result["f1"]), 4),
        "Synonym-Aware-ROUGE(SynROUGE)": SynROUGE
    }


# Print Results

In [9]:
def print_results(predictions, num_results = 5):
    for i in range(num_results):
        print(f"Input Text:\n{texts[i]}\n")
        print(f"Generated Summary:\n{predictions[i]}\n")
        print(f"Reference Summary:\n{references[i]}\n")
        print("="*80)

# Inference function

In [7]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

def build_prompt(text):
    return f"""
Generate a **clear and concise news headline in Malayalam only** based on the following text.

Text (Malayalam): {text}

Important:
- The output must be **only a headline in Malayalam**.
- Do **not** use any other language or script.
- Do **not** include any extra commentary or formatting.
- Do **not** copy the text word-for-word.
- Start your output with: Headline:

Example (do not include this in your output):
Headline: ഇന്ത്യയിൽ പുതിയ ശാസ്ത്രീയ കണ്ടെത്തൽ
"""


In [8]:

def generate_headlines(texts, model, tokenizer, batch_size=4, max_new_tokens=100):
    device = model.device
    model.eval()

    predictions = []

    dataloader = DataLoader(texts, batch_size=batch_size)

    for batch in tqdm(dataloader, desc="Generating headlines"):
        try:
            prompts = [build_prompt(text) for text in batch]

            messages = [[
                {
                    "role": "system",
                    "content": [{"type": "text", "text": "You are a helpful assistant that generates news headlines."}]
                },
                {
                    "role": "user",
                    "content": [{"type": "text", "text": prompt}]
                }
            ] for prompt in prompts]

            # Tokenize batch with padding
            inputs = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(device)

            input_lens = inputs["input_ids"].shape[1]

            # Disable graph tracing to avoid Dynamo crash (especially with Gemma)
            import torch._dynamo
            torch._dynamo.disable()

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            generated_tokens = outputs[:, input_lens:]
            decoded_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            predictions.extend(decoded_batch)

        except torch.cuda.OutOfMemoryError:
            print("⚠️ CUDA OOM – Skipping this batch.")
            torch.cuda.empty_cache()
            predictions.extend(["[OOM ERROR – Skipped]"] * len(batch))

    return predictions


# Gemma_1000_2e-4

In [11]:
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import Gemma3ForCausalLM, AutoTokenizer

pretrained_model_id = "google/gemma-3-4b-it"
adapter_path = "finetuned_models/gemma3_finetuned_2e-4"
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, trust_remote_code=True)
base_model = Gemma3ForCausalLM.from_pretrained(pretrained_model_id).to(device)

# Load PEFT config
peft_config = PeftConfig.from_pretrained(adapter_path)

# Re-wrap base model exactly as SFTTrainer does
model = get_peft_model(base_model, peft_config)
model.load_adapter(adapter_path, adapter_name="default")  # Now it matches the same nesting
model.eval()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
        (layers): ModuleList(
          (0-33): 34 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [13]:
Gemma_1000_preds = generate_headlines(texts, model, tokenizer, batch_size=4)

Generating headlines:   0%|          | 0/125 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generating headlines:   1%|          | 1/125 [00:05<10:25,  5.05s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 2/125 [00:10<10:39,  5.20s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 3/125 [00:15<10:49,  5.32s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   3%|▎         | 4/125 [00:25<14:19,  7.10s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   4%|▍         | 5/125 [00:33<14:46,  7.39s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERB

In [14]:
import re

Gemma_1000_preds = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in Gemma_1000_preds]

In [15]:
compute_metrics(Gemma_1000_preds, references)

2025-07-03:09:32:40,395 INFO     [rouge_scorer.py:83] Using default tokenizer.


{'ROUGE-1': np.float64(0.0459),
 'ROUGE-2': np.float64(0.0116),
 'ROUGE-L': np.float64(0.0457),
 'BLEU': 0.0755,
 'BERTScore-P': np.float64(0.7475),
 'BERTScore-R': np.float64(0.7617),
 'BERTScore-F1': np.float64(0.7534),
 'Synonym-Aware-ROUGE(SynROUGE)': np.float64(0.2603)}

# Gemma_1250

In [9]:
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import Gemma3ForCausalLM, AutoTokenizer

pretrained_model_id = "google/gemma-3-4b-it"
adapter_path = "finetuned_models/gemma3_finetuned_1250_2e-4"
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, trust_remote_code=True)
base_model = Gemma3ForCausalLM.from_pretrained(pretrained_model_id).to(device)

# Load PEFT config
peft_config = PeftConfig.from_pretrained(adapter_path)

# Re-wrap base model exactly as SFTTrainer does
model = get_peft_model(base_model, peft_config)
model.load_adapter(adapter_path, adapter_name="default")  # Now it matches the same nesting
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
        (layers): ModuleList(
          (0-33): 34 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [10]:
Gemma_1250_preds = generate_headlines(texts, model, tokenizer, batch_size=4)

Generating headlines:   0%|          | 0/125 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   1%|          | 1/125 [00:20<43:08, 20.88s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 2/125 [00:25<23:42, 11.56s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 3/125 [00:30<17:26,  8.58s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   3%|▎         | 4/12

In [None]:
import re

Gemma_1250_preds = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in Gemma_1250_preds]

compute_metrics(Gemma_1250_preds, references)

2025-07-03:11:46:35,234 INFO     [rouge_scorer.py:83] Using default tokenizer.


{'ROUGE-1': np.float64(0.0647),
 'ROUGE-2': np.float64(0.008),
 'ROUGE-L': np.float64(0.0642),
 'BLEU': 0.0764,
 'BERTScore-P': np.float64(0.7558),
 'BERTScore-R': np.float64(0.759),
 'BERTScore-F1': np.float64(0.7565),
 'Synonym-Aware-ROUGE(SynROUGE)': np.float64(0.2716)}

: 

# Gemma_1500

In [11]:
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import Gemma3ForCausalLM, AutoTokenizer

pretrained_model_id = "google/gemma-3-4b-it"
adapter_path = "finetuned_models/gemma3_finetuned_1500"
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, trust_remote_code=True)
base_model = Gemma3ForCausalLM.from_pretrained(pretrained_model_id).to(device)

# Load PEFT config
peft_config = PeftConfig.from_pretrained(adapter_path)

# Re-wrap base model exactly as SFTTrainer does
model = get_peft_model(base_model, peft_config)
model.load_adapter(adapter_path, adapter_name="default")  # Now it matches the same nesting
model.eval();


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
Gemma_1500_preds = generate_headlines(texts, model, tokenizer, batch_size=4)

Generating headlines:   0%|          | 0/125 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   1%|          | 1/125 [00:20<43:08, 20.87s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 2/125 [00:28<26:51, 13.10s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 3/125 [00:36<21:29, 10.57s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   3%|▎         | 4/12

In [None]:
import re

Gemma_1500_preds = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in Gemma_1500_preds]

compute_metrics(Gemma_1500_preds, references)

2025-07-02:14:17:35,726 INFO     [rouge_scorer.py:83] Using default tokenizer.


{'ROUGE-1': np.float64(0.068),
 'ROUGE-2': np.float64(0.012),
 'ROUGE-L': np.float64(0.0687),
 'BLEU': 0.0683,
 'BERTScore-P': np.float64(0.7468),
 'BERTScore-R': np.float64(0.7631),
 'BERTScore-F1': np.float64(0.7538),
 'Synonym-Aware-ROUGE(SynROUGE)': np.float64(0.2617)}

: 

# Gemma_1750

In [9]:
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import Gemma3ForCausalLM, AutoTokenizer

pretrained_model_id = "google/gemma-3-4b-it"
adapter_path = "finetuned_models/gemma3_finetuned_1750_2e-4"
device = "cuda:1"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, trust_remote_code=True)
base_model = Gemma3ForCausalLM.from_pretrained(pretrained_model_id).to(device)

# Load PEFT config
peft_config = PeftConfig.from_pretrained(adapter_path)

# Re-wrap base model exactly as SFTTrainer does
model = get_peft_model(base_model, peft_config)
model.load_adapter(adapter_path, adapter_name="default")  # Now it matches the same nesting
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
        (layers): ModuleList(
          (0-33): 34 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [10]:
Gemma_1750_preds = generate_headlines(texts, model, tokenizer, batch_size=4)

Generating headlines:   0%|          | 0/125 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   1%|          | 1/125 [00:20<42:24, 20.52s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 2/125 [00:27<25:22, 12.38s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 3/125 [00:34<20:42, 10.18s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   3%|▎         | 4/12

In [None]:
import re

Gemma_1750_preds = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in Gemma_1750_preds]

compute_metrics(Gemma_1750_preds, references)

2025-07-03:11:21:34,973 INFO     [rouge_scorer.py:83] Using default tokenizer.


{'ROUGE-1': np.float64(0.0964),
 'ROUGE-2': np.float64(0.01),
 'ROUGE-L': np.float64(0.096),
 'BLEU': 0.0869,
 'BERTScore-P': np.float64(0.7644),
 'BERTScore-R': np.float64(0.776),
 'BERTScore-F1': np.float64(0.7694),
 'Synonym-Aware-ROUGE(SynROUGE)': np.float64(0.3012)}

: 

# Gemma_2000

In [7]:
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import Gemma3ForCausalLM, AutoTokenizer

pretrained_model_id = "google/gemma-3-4b-it"
adapter_path = "finetuned_models/gemma3_finetuned_2000_2e-4"
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, trust_remote_code=True)
base_model = Gemma3ForCausalLM.from_pretrained(pretrained_model_id).to(device)

# Load PEFT config
peft_config = PeftConfig.from_pretrained(adapter_path)

# Re-wrap base model exactly as SFTTrainer does
model = get_peft_model(base_model, peft_config)
model.load_adapter(adapter_path, adapter_name="default")  # Now it matches the same nesting
model.eval()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
        (layers): ModuleList(
          (0-33): 34 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [13]:
Gemma_2000_preds = generate_headlines(texts, model, tokenizer, batch_size=4)

Generating headlines:   0%|          | 0/125 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   1%|          | 1/125 [00:18<39:06, 18.92s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 2/125 [00:23<21:33, 10.52s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 3/125 [00:27<15:07,  7.44s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   3%|▎         | 4/12

In [None]:
import re

Gemma_2000_preds = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in Gemma_2000_preds]

compute_metrics(Gemma_2000_preds, references)

2025-07-03:09:53:45,749 INFO     [rouge_scorer.py:83] Using default tokenizer.


{'ROUGE-1': np.float64(0.1093),
 'ROUGE-2': np.float64(0.0116),
 'ROUGE-L': np.float64(0.1099),
 'BLEU': 0.0962,
 'BERTScore-P': np.float64(0.7708),
 'BERTScore-R': np.float64(0.7794),
 'BERTScore-F1': np.float64(0.7742),
 'Synonym-Aware-ROUGE(SynROUGE)': np.float64(0.3144)}

: 

# Gemma_2250

In [9]:
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import Gemma3ForCausalLM, AutoTokenizer

pretrained_model_id = "google/gemma-3-4b-it"
adapter_path = "finetuned_models/gemma3_finetuned_2250_2e-4"
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, trust_remote_code=True)
base_model = Gemma3ForCausalLM.from_pretrained(pretrained_model_id).to(device)

# Load PEFT config
peft_config = PeftConfig.from_pretrained(adapter_path)

# Re-wrap base model exactly as SFTTrainer does
model = get_peft_model(base_model, peft_config)
model.load_adapter(adapter_path, adapter_name="default")  # Now it matches the same nesting
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
        (layers): ModuleList(
          (0-33): 34 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [10]:
Gemma_2250_preds = generate_headlines(texts, model, tokenizer, batch_size=4)

Generating headlines:   0%|          | 0/125 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   1%|          | 1/125 [00:18<38:39, 18.71s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 2/125 [00:21<18:46,  9.16s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 3/125 [00:23<12:28,  6.14s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   3%|▎         | 4/12

In [None]:
import re

Gemma_2250_preds = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in Gemma_2250_preds]

compute_metrics(Gemma_2250_preds, references)

2025-07-03:11:31:42,380 INFO     [rouge_scorer.py:83] Using default tokenizer.


{'ROUGE-1': np.float64(0.1186),
 'ROUGE-2': np.float64(0.0133),
 'ROUGE-L': np.float64(0.1188),
 'BLEU': 0.0955,
 'BERTScore-P': np.float64(0.7661),
 'BERTScore-R': np.float64(0.7896),
 'BERTScore-F1': np.float64(0.7768),
 'Synonym-Aware-ROUGE(SynROUGE)': np.float64(0.3141)}

: 

# Gemma_2500

In [9]:
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import Gemma3ForCausalLM, AutoTokenizer

pretrained_model_id = "google/gemma-3-4b-it"
adapter_path = "finetuned_models/gemma3_finetuned_2500_2e-4"
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, trust_remote_code=True)
base_model = Gemma3ForCausalLM.from_pretrained(pretrained_model_id).to(device)

# Load PEFT config
peft_config = PeftConfig.from_pretrained(adapter_path)

# Re-wrap base model exactly as SFTTrainer does
model = get_peft_model(base_model, peft_config)
model.load_adapter(adapter_path, adapter_name="default")  # Now it matches the same nesting
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
        (layers): ModuleList(
          (0-33): 34 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [10]:
Gemma_2500_preds = generate_headlines(texts, model, tokenizer, batch_size=4)

Generating headlines:   0%|          | 0/125 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   1%|          | 1/125 [00:16<34:48, 16.84s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 2/125 [00:19<17:15,  8.42s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 3/125 [00:21<11:31,  5.66s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   3%|▎         | 4/12

In [None]:
import re

Gemma_2500_preds = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in Gemma_2500_preds]

compute_metrics(Gemma_2500_preds, references)

2025-07-03:10:07:41,582 INFO     [rouge_scorer.py:83] Using default tokenizer.


{'ROUGE-1': np.float64(0.1076),
 'ROUGE-2': np.float64(0.0107),
 'ROUGE-L': np.float64(0.1076),
 'BLEU': 0.0948,
 'BERTScore-P': np.float64(0.7619),
 'BERTScore-R': np.float64(0.786),
 'BERTScore-F1': np.float64(0.7729),
 'Synonym-Aware-ROUGE(SynROUGE)': np.float64(0.3094)}

: 

# Gemma_2750

In [11]:
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import Gemma3ForCausalLM, AutoTokenizer

pretrained_model_id = "google/gemma-3-4b-it"
adapter_path = "finetuned_models/gemma3_finetuned_2750_2e-4"
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, trust_remote_code=True)
base_model = Gemma3ForCausalLM.from_pretrained(pretrained_model_id).to(device)

# Load PEFT config
peft_config = PeftConfig.from_pretrained(adapter_path)

# Re-wrap base model exactly as SFTTrainer does
model = get_peft_model(base_model, peft_config)
model.load_adapter(adapter_path, adapter_name="default")  # Now it matches the same nesting
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
        (layers): ModuleList(
          (0-33): 34 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [16]:
Gemma_2750_preds = generate_headlines(texts, model, tokenizer, batch_size=4)

Generating headlines:   0%|          | 0/125 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generating headlines:   1%|          | 1/125 [00:02<04:47,  2.32s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 2/125 [00:04<04:49,  2.36s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 3/125 [00:07<05:26,  2.68s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   3%|▎         | 4/125 [00:10<05:29,  2.73s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   4%|▍         | 5/125 [00:13<05:31,  2.76s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERB

In [None]:
import re

Gemma_2750_preds = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in Gemma_2750_preds]

compute_metrics(Gemma_2750_preds, references)

2025-07-03:12:14:50,281 INFO     [rouge_scorer.py:83] Using default tokenizer.


{'ROUGE-1': np.float64(0.1109),
 'ROUGE-2': np.float64(0.0087),
 'ROUGE-L': np.float64(0.1108),
 'BLEU': 0.0992,
 'BERTScore-P': np.float64(0.7677),
 'BERTScore-R': np.float64(0.7859),
 'BERTScore-F1': np.float64(0.7758),
 'Synonym-Aware-ROUGE(SynROUGE)': np.float64(0.3164)}

: 

# Gemma_3000

In [9]:
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import Gemma3ForCausalLM, AutoTokenizer

pretrained_model_id = "google/gemma-3-4b-it"
adapter_path = "finetuned_models/gemma3_finetuned_3000_2e-4"
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, trust_remote_code=True)
base_model = Gemma3ForCausalLM.from_pretrained(pretrained_model_id).to(device)

# Load PEFT config
peft_config = PeftConfig.from_pretrained(adapter_path)

# Re-wrap base model exactly as SFTTrainer does
model = get_peft_model(base_model, peft_config)
model.load_adapter(adapter_path, adapter_name="default")  # Now it matches the same nesting
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
        (layers): ModuleList(
          (0-33): 34 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [10]:
Gemma_3000_preds = generate_headlines(texts, model, tokenizer, batch_size=4)

Generating headlines:   0%|          | 0/125 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   1%|          | 1/125 [00:16<33:34, 16.24s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 2/125 [00:18<16:25,  8.02s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   2%|▏         | 3/125 [00:21<11:31,  5.67s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating headlines:   3%|▎         | 4/12

In [None]:
import re

Gemma_3000_preds = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in Gemma_3000_preds]

compute_metrics(Gemma_3000_preds, references)

2025-07-03:10:19:39,598 INFO     [rouge_scorer.py:83] Using default tokenizer.


{'ROUGE-1': np.float64(0.1038),
 'ROUGE-2': np.float64(0.01),
 'ROUGE-L': np.float64(0.1035),
 'BLEU': 0.0886,
 'BERTScore-P': np.float64(0.7619),
 'BERTScore-R': np.float64(0.7798),
 'BERTScore-F1': np.float64(0.7699),
 'Synonym-Aware-ROUGE(SynROUGE)': np.float64(0.3045)}

: 