# Load Dataset

In [1]:
from tqdm.auto import tqdm

In [2]:
import pandas as pd

# Load the CSV
df = pd.read_csv('test.csv')
df = df.dropna().reset_index(drop=True)
print(df.count())
df.head()

source_url    116
target_url    116
text          116
summary       116
dtype: int64


Unnamed: 0,source_url,target_url,text,summary
0,https://www.pmindia.gov.in/ml/news_updates/%E0...,https://www.pmindia.gov.in/ml/news_updates/%E0...,ഇലക്ട്രോണിക്‌സിലും വിവരസാങ്കേതികവിദ്യയിലുമുള്...,"ഇന്ത്യയും അംഗോളയും തമ്മില്‍ ഇലക്ട്രോണിക്‌സ്, ..."
1,https://www.pmindia.gov.in/ml/news_updates/%E0...,https://www.pmindia.gov.in/ml/news_updates/%E0...,ഒന്‍പതാമത് വൈബ്രന്റ് ഗുജറാത്ത് ഉച്ചകോടിക്ക് ഗാ...,വൈബ്രന്റ് ഗുജറാത്ത് ഉച്ചകോടിക്ക് മുന്നോടിയായുള...
2,https://www.pmindia.gov.in/ml/news_updates/%E0...,https://www.pmindia.gov.in/ml/news_updates/%E0...,ക്ഷയരോഗ നിര്‍മ്മാര്‍ജ്ജനത്തിനായുള്ള ഡല്‍ഹി ഉച്...,ക്ഷയരോഗ നിര്‍മ്മാര്‍ജ്ജന ഉച്ചകോടി പ്രധാനമന്ത്ര...
3,https://www.pmindia.gov.in/ml/news_updates/%E0...,https://www.pmindia.gov.in/ml/news_updates/%E0...,ഭക്ഷ്യസുരക്ഷയിലും അനുബന്ധ മേഖലകളിലും സഹകരിക്കു...,ഭക്ഷ്യസുരക്ഷയിലും അനുബന്ധ മേഖലകളിലും സഹകരിക്കു...
4,https://www.pmindia.gov.in/ml/news_updates/%E0...,https://www.pmindia.gov.in/ml/news_updates/%E0...,ഉത്തരാഖണ്ഡിലെ പൗരിയിൽ ബസ് അപകടത്തിലെ ജീവഹാനിയി...,ഉത്തരാഖണ്ഡിലെ പൗരി ബസ്സപകടത്തിൽ പ്രധാനമന്ത്രി...


In [3]:
df = df.rename(columns={'Text': 'text', 'Summary': 'summary'})

In [4]:
from datasets import Dataset

dataset = Dataset.from_pandas(df[['text', 'summary']])

In [5]:
texts = df['text'].tolist()
references = df['summary'].tolist()

# Compute Metrics

In [19]:
import evaluate

# Load metrics once
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def compute_metrics(predictions, references, rouge_stemmer=True):
    """
    Computes ROUGE and BLEU scores for given predictions and references.
    
    Args:
        predictions (list): List of predicted texts.
        references (list): List of reference texts.
        rouge_stemmer (bool): Whether to use stemming for ROUGE.
    
    Returns:
        dict: ROUGE and BLEU scores rounded to four decimal places.
    """
    
    # Compute ROUGE scores
    rouge_result = rouge.compute(predictions=predictions, references=references, use_stemmer=rouge_stemmer)
    
    # Compute BLEU scores
    bleu_result = bleu.compute(
        predictions=predictions,
        references=[[r] for r in references]
    )
    
    return {
        "ROUGE-1": round(rouge_result['rouge1'], 4),
        "ROUGE-2": round(rouge_result['rouge2'], 4),
        "ROUGE-L": round(rouge_result['rougeL'], 4),
        "BLEU": round(bleu_result["bleu"], 4)
    }

# Print Results

In [23]:
def print_results(predictions, num_results = 5):
    for i in range(num_results):
        print(f"Input Text:\n{texts[i]}\n")
        print(f"Generated Summary:\n{predictions[i]}\n")
        print(f"Reference Summary:\n{references[i]}\n")
        print("="*80)

# Bart

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "ai4bharat/IndicBART"  # Malayalam IndicBART

tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, use_fast=False, keep_accents=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

2025-06-10 12:38:19.316142: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-10 12:38:19.332035: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749555499.350010 4158442 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749555499.355780 4158442 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749555499.370925 4158442 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [7]:
bos_id = tokenizer._convert_token_to_id_with_added_voc("<s>")
eos_id = tokenizer._convert_token_to_id_with_added_voc("</s>")
pad_id = tokenizer._convert_token_to_id_with_added_voc("<pad>")


In [8]:
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator


def malayalam_to_devanagari(input_text):
    # Check if input is a string or a list of strings
    if isinstance(input_text, str):
        return UnicodeIndicTransliterator.transliterate(input_text, 'ml', 'hi') + ' </s> <2hi>'
    elif isinstance(input_text, list):
        # If it's a list, apply the transformation to each string
        return [UnicodeIndicTransliterator.transliterate(text, 'ml', 'hi') + ' </s> <2hi>' for text in input_text]
    else:
        raise ValueError("Input must be a string or a list of strings.")

def devanagari_to_malayalam(input_text):
    # Check if input is a string or a list of strings
    if isinstance(input_text, str):
        return UnicodeIndicTransliterator.transliterate(input_text, 'hi', 'ml')
    elif isinstance(input_text, list):
        # If it's a list, apply the transformation to each string
        return [UnicodeIndicTransliterator.transliterate(text, 'hi', 'ml') for text in input_text]
    else:
        raise ValueError("Input must be a string or a list of strings.")

In [9]:
texts = texts[:16]

In [10]:
batch_size = 8  # smaller batch size to show progress in chunks
predictions = []

for i in tqdm(range(0, len(texts), batch_size), desc="Generating summaries"):
    batch_texts = texts[i:i+batch_size]
    batch_texts = malayalam_to_devanagari(batch_texts)
    # inputs = tokenizer(batch_texts, max_length=512, truncation=True, padding=True, return_tensors='pt')
    # inputs = tokenizer("मैं [MASK] हूँ </s> <2hi>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids
    inputs = tokenizer(batch_texts, add_special_tokens=False, padding=True, return_tensors='pt')

    # summary_ids = model.generate(
    #     inputs['input_ids'], attention_mask=inputs['attention_mask'],
    #     max_length=128, num_beams=4, early_stopping=True
    # )

    summary_ids = model.generate(
        inputs['input_ids'], use_cache=True, num_beams=4, max_length=128,
        early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id,
        eos_token_id=eos_id,
        decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2en>")
        )

    batch_preds = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    predictions.extend(batch_preds)

predictions = devanagari_to_malayalam(predictions)

Generating summaries:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
compute_metrics(predictions, references[:16])

ROUGE-1 F1 Score: 0.05
ROUGE-2 F1 Score: 0.0417
ROUGE-L F1 Score: 0.05
BLEU Score: 0.0322

Sample Predictions:

Input Text:
ഇലക്ട്രോണിക്‌സിലും വിവരസാങ്കേതികവിദ്യയിലുമുള്ള ഉഭയകക്ഷിസഹകരണം പ്രോല്‍സാഹിപ്പിക്കാനുള്ള ഇന്ത്യ-അംഗോള ധാരണാപത്രത്തെക്കുറിച്ചു പ്രധാനമന്ത്രി ശ്രീ. നരേന്ദ്ര മോദിയുടെ അധ്യക്ഷതയില്‍ ചേര്‍ന്ന കേന്ദ്ര മന്ത്രിസഭാ യോഗം മുന്‍പാകെ വിശദീകരിക്കപ്പെട്ടു. ഇ-ഭരണം, ഐ.ടി.വിദ്യാഭ്യാസത്തിനുള്ള മനുഷ്യവിഭവശേഷി വികസനം, വിവരസാങ്കേതികവിദ്യാ സുരക്ഷ, ഇലക്ട്രോണിക് ഉല്‍പന്ന നിര്‍മാണം, സോഫ്റ്റ്‌വെയര്‍ വ്യവസായം, ടെലിമെഡിസിന്‍ തുടങ്ങിയ മേഖലകളില്‍ കൂടുതല്‍ സഹകരണം ഉറപ്പാക്കാന്‍ ഉദ്ദേശിച്ചുള്ളതാണു ധാരണാപത്രം. പശ്ചാത്തലം: ഉഭയകക്ഷി, മേഖലാതല സഹകരണത്തിനായുള്ള ചട്ടക്കൂടിന്റെ അടിസ്ഥാനത്തില്‍ വിവരസാങ്കേതികവിദ്യ, ആശയവിനിമയ സാങ്കേതികവിദ്യ എന്നീ മേഖലകളില്‍ രാജ്യാന്തര സഹകരണം പ്രോല്‍സാഹിപ്പിക്കുന്നതിന് ഇലക്ട്രോണിക്‌സ്, വിവരസാങ്കേതികവിദ്യ വകുപ്പു മന്ത്രാലയ(എം.ഇ.ഐ.ടി.വൈ.)ത്തെയാണു ചുമതലപ്പെടുത്തിയിരിക്കുന്നത്. വിവരസാങ്കേതികവിദ്യയില്‍ അധിഷ്ഠിതമായ സഹകരണം ഉറപ്പാക്കുന്നതിന് മറ്റു രാജ്യങ്ങളിലെ ബന്ധപ്പെട്ട ഏജന്‍സിക

In [None]:
print_results(predictions)

# Qwen

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

model_id = "Qwen/Qwen2-1.5B-Instruct"


device = "cuda:1" if torch.cuda.is_available() else "cpu"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", trust_remote_code=True).to(device)

torch.cuda.empty_cache()

2025-06-10 12:49:57.645180: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-10 12:49:57.660335: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749556197.677513 4160294 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749556197.683141 4160294 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749556197.696938 4160294 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [16]:
predictions=[]

for text in tqdm(texts[:50], desc="Generating summaries"):
  try:
    prompt = f"""
      Generate a short and clear news headline in Malayalam based on the following text.

      Text to generate headline for: {text}

      The headline should capture the main idea of the text without copying it word-for-word.
      Return the headline in complete Malayalam.
      You are not to return the thought process or any non Malayalam texts.
      """

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    with torch.no_grad():
      generated_ids = model.generate(
          model_inputs.input_ids,
          max_new_tokens=512
      )

    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # if "Summary:" in response:
    #   summary_clean = response.split("Summary:")[-1].strip()
    # else:
    #   summary_clean = response.strip()

    predictions.append(response)

    torch.cuda.empty_cache()

  except torch.cuda.OutOfMemoryError:
    print("⚠️ CUDA OOM – Skipping this input.")
    torch.cuda.empty_cache()
    predictions.append("[OOM ERROR – Skipped]")

Generating summaries:   0%|          | 0/50 [00:00<?, ?it/s]

⚠️ CUDA OOM – Skipping this input.
⚠️ CUDA OOM – Skipping this input.
⚠️ CUDA OOM – Skipping this input.
⚠️ CUDA OOM – Skipping this input.
⚠️ CUDA OOM – Skipping this input.
⚠️ CUDA OOM – Skipping this input.
⚠️ CUDA OOM – Skipping this input.


In [None]:
compute_metrics(predictions, references[:50])


ROUGE-1: 0.02
ROUGE-2: 0.0
ROUGE-L: 0.02
BLEU: 0.053


In [None]:
print_results(predictions)


Input: ഇലക്ട്രോണിക്‌സിലും വിവരസാങ്കേതികവിദ്യയിലുമുള്ള ഉഭയകക്ഷിസഹകരണം പ്രോല്‍സാഹിപ്പിക്കാനുള്ള ഇന്ത്യ-അംഗോള ധാരണാപത്രത്തെക്കുറിച്ചു പ്രധാനമന്ത്രി ശ്രീ. നരേന്ദ്ര മോദിയുടെ അധ്യക്ഷതയില്‍ ചേര്‍ന്ന കേന്ദ്ര മന്ത്രിസഭാ യോഗം മുന്‍പാകെ വിശദീകരിക്കപ്പെട്ടു. ഇ-ഭരണം, ഐ.ടി.വിദ്യാഭ്യാസത്തിനുള്ള മനുഷ്യവിഭവശേഷി വികസനം, വിവരസാങ്കേതികവിദ്യാ സുരക്ഷ, ഇലക്ട്രോണിക് ഉല്‍പന്ന നിര്‍മാണം, സോഫ്റ്റ്‌വെയര്‍ വ്യവസായം, ടെലിമെഡിസിന്‍ തുടങ്ങിയ മേഖലകളില്‍ കൂടുതല്‍ സഹകരണം ഉറപ്പാക്കാന്‍ ഉദ്ദേശിച്ചുള്ളതാണു ധാരണാപത്രം. പശ്ചാത്തലം: ഉഭയകക്ഷി, മേഖലാതല സഹകരണത്തിനായുള്ള ചട്ടക്കൂടിന്റെ അടിസ്ഥാനത്തില്‍ വിവരസാങ്കേതികവിദ്യ, ആശയവിനിമയ സാങ്കേതികവിദ്യ എന്നീ മേഖലകളില്‍ രാജ്യാന്തര സഹകരണം പ്രോല്‍സാഹിപ്പിക്കുന്നതിന് ഇലക്ട്രോണിക്‌സ്, വിവരസാങ്കേതികവിദ്യ വകുപ്പു മന്ത്രാലയ(എം.ഇ.ഐ.ടി.വൈ.)ത്തെയാണു ചുമതലപ്പെടുത്തിയിരിക്കുന്നത്. വിവരസാങ്കേതികവിദ്യയില്‍ അധിഷ്ഠിതമായ സഹകരണം ഉറപ്പാക്കുന്നതിന് മറ്റു രാജ്യങ്ങളിലെ ബന്ധപ്പെട്ട ഏജന്‍സികളുമായി എം.ഇ.ഐ.ടി.വൈ. കരാറുകളും ധാരണാപത്രങ്ങളും ഒപ്പുവെച്ചിട്ടുണ്ട്. ഡിജിറ്റല്‍ ഇന്ത്യ, മെയ്ക്ക് ഇന്‍ ഇന്ത്യ തുടങ്ങിയ

# Gemma



In [None]:
from huggingface_hub import login
TOKEN=None
login(token=TOKEN)

In [None]:
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
import torch

model_id = "google/gemma-3-4b-it"
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

model = Gemma3ForConditionalGeneration.from_pretrained(model_id).to(device).eval()

processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
torch.set_float32_matmul_precision('high')

2025-06-10 13:18:29.386819: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-10 13:18:29.405392: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749557909.422997 4165277 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749557909.428790 4165277 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749557909.443722 4165277 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
predictions = []

for text in tqdm(texts[:10], desc="Generating summaries"):
  try:
    prompt = f"""
Generate a **clear and concise news headline in Malayalam only** based on the following text.

Text (Malayalam): {text}

Important:
- The output must be **only a headline in Malayalam**.
- Do **not** use any other language or script.
- Do **not** include any extra commentary or formatting.
- Do **not** copy the text word-for-word.
- Start your output with: Headline:

Example (do not include this in your output):
Headline: ഇന്ത്യയിൽ പുതിയ ശാസ്ത്രീയ കണ്ടെത്തൽ
"""

    messages = [
        [
            {
                "role": "system",
                "content": [{"type": "text", "text": "You are a helpful assistant that generates news headliens."},]
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt},]
            },
        ],
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    input_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
      generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
      generation = generation[0][input_len:]

    # Slice off input tokens from generated tokens to get only new tokens
    # generated_only_ids = [
    #     gen_ids[len(input_ids[idx]):] for idx, gen_ids in enumerate(generated_ids)
    # ]

    decoded = processor.decode(generation, skip_special_tokens=True)
    predictions.append(decoded)

    torch.cuda.empty_cache()

  except torch.cuda.OutOfMemoryError:
    print("⚠️ CUDA OOM – Skipping this input.")
    torch.cuda.empty_cache()
    predictions.append("[OOM ERROR – Skipped]")

Generating summaries:   0%|          | 0/10 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

In [14]:
import re

predictions = [re.sub(r"(?i)^\s*Headline:\s*", "", p) for p in predictions]

In [20]:
compute_metrics(predictions, references[:10])

{'ROUGE-1': np.float64(0.0),
 'ROUGE-2': np.float64(0.0),
 'ROUGE-L': np.float64(0.0),
 'BLEU': 0.0}

In [24]:
print_results(predictions)

Input Text:
ഇലക്ട്രോണിക്‌സിലും വിവരസാങ്കേതികവിദ്യയിലുമുള്ള ഉഭയകക്ഷിസഹകരണം പ്രോല്‍സാഹിപ്പിക്കാനുള്ള ഇന്ത്യ-അംഗോള ധാരണാപത്രത്തെക്കുറിച്ചു പ്രധാനമന്ത്രി ശ്രീ. നരേന്ദ്ര മോദിയുടെ അധ്യക്ഷതയില്‍ ചേര്‍ന്ന കേന്ദ്ര മന്ത്രിസഭാ യോഗം മുന്‍പാകെ വിശദീകരിക്കപ്പെട്ടു. ഇ-ഭരണം, ഐ.ടി.വിദ്യാഭ്യാസത്തിനുള്ള മനുഷ്യവിഭവശേഷി വികസനം, വിവരസാങ്കേതികവിദ്യാ സുരക്ഷ, ഇലക്ട്രോണിക് ഉല്‍പന്ന നിര്‍മാണം, സോഫ്റ്റ്‌വെയര്‍ വ്യവസായം, ടെലിമെഡിസിന്‍ തുടങ്ങിയ മേഖലകളില്‍ കൂടുതല്‍ സഹകരണം ഉറപ്പാക്കാന്‍ ഉദ്ദേശിച്ചുള്ളതാണു ധാരണാപത്രം. പശ്ചാത്തലം: ഉഭയകക്ഷി, മേഖലാതല സഹകരണത്തിനായുള്ള ചട്ടക്കൂടിന്റെ അടിസ്ഥാനത്തില്‍ വിവരസാങ്കേതികവിദ്യ, ആശയവിനിമയ സാങ്കേതികവിദ്യ എന്നീ മേഖലകളില്‍ രാജ്യാന്തര സഹകരണം പ്രോല്‍സാഹിപ്പിക്കുന്നതിന് ഇലക്ട്രോണിക്‌സ്, വിവരസാങ്കേതികവിദ്യ വകുപ്പു മന്ത്രാലയ(എം.ഇ.ഐ.ടി.വൈ.)ത്തെയാണു ചുമതലപ്പെടുത്തിയിരിക്കുന്നത്. വിവരസാങ്കേതികവിദ്യയില്‍ അധിഷ്ഠിതമായ സഹകരണം ഉറപ്പാക്കുന്നതിന് മറ്റു രാജ്യങ്ങളിലെ ബന്ധപ്പെട്ട ഏജന്‍സികളുമായി എം.ഇ.ഐ.ടി.വൈ. കരാറുകളും ധാരണാപത്രങ്ങളും ഒപ്പുവെച്ചിട്ടുണ്ട്. ഡിജിറ്റല്‍ ഇന്ത്യ, മെയ്ക്ക് ഇന്‍ ഇന്ത്യ തുടങ

## By-T5-Small and Flan-T5

In [25]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# Load ByT5 and Flan-T5
byt5_model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small").to(device)
byt5_tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

In [44]:
def generate_summary(text, model, tokenizer):
    prompt = f"""
Generate a **clear and concise news headline in Malayalam only** based on the following text.

Text (Malayalam): {text}

Important:
- The output must be **only a headline in Malayalam**.
- Do **not** use any other language or script.
- Do **not** include any extra commentary or formatting.
- Do **not** copy the text word-for-word.
- Start your output with: Headline:

Example (do not include this in your output):
Headline: ഇന്ത്യയിൽ പുതിയ ശാസ്ത്രീയ കണ്ടെത്തൽ
"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            num_beams=2,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()


In [45]:
byt5_preds = []

for i, row in tqdm(df.iterrows(), total=len(df), desc="ByT5-Small"):
    text, reference = row['text'], row['summary']
    generated_summary = generate_summary(text, byt5_model, byt5_tokenizer)
    byt5_preds.append(generated_summary)

# Save outputs
df['byt5_summary'] = byt5_preds

ByT5-Small:   0%|          | 0/116 [00:00<?, ?it/s]

In [46]:
import re

# Remove any English word followed by a colon and text inside parentheses
byt5_preds = [re.sub(r"(?i)^\s*\w+:\s*|\s*\([^)]*\)|:\s*", "", p) for p in byt5_preds]


In [47]:
compute_metrics(byt5_preds, references)

{'ROUGE-1': np.float64(0.0063),
 'ROUGE-2': np.float64(0.0022),
 'ROUGE-L': np.float64(0.0063),
 'BLEU': 0.0332}

In [48]:
print_results(byt5_preds)

Input Text:
ഇലക്ട്രോണിക്‌സിലും വിവരസാങ്കേതികവിദ്യയിലുമുള്ള ഉഭയകക്ഷിസഹകരണം പ്രോല്‍സാഹിപ്പിക്കാനുള്ള ഇന്ത്യ-അംഗോള ധാരണാപത്രത്തെക്കുറിച്ചു പ്രധാനമന്ത്രി ശ്രീ. നരേന്ദ്ര മോദിയുടെ അധ്യക്ഷതയില്‍ ചേര്‍ന്ന കേന്ദ്ര മന്ത്രിസഭാ യോഗം മുന്‍പാകെ വിശദീകരിക്കപ്പെട്ടു. ഇ-ഭരണം, ഐ.ടി.വിദ്യാഭ്യാസത്തിനുള്ള മനുഷ്യവിഭവശേഷി വികസനം, വിവരസാങ്കേതികവിദ്യാ സുരക്ഷ, ഇലക്ട്രോണിക് ഉല്‍പന്ന നിര്‍മാണം, സോഫ്റ്റ്‌വെയര്‍ വ്യവസായം, ടെലിമെഡിസിന്‍ തുടങ്ങിയ മേഖലകളില്‍ കൂടുതല്‍ സഹകരണം ഉറപ്പാക്കാന്‍ ഉദ്ദേശിച്ചുള്ളതാണു ധാരണാപത്രം. പശ്ചാത്തലം: ഉഭയകക്ഷി, മേഖലാതല സഹകരണത്തിനായുള്ള ചട്ടക്കൂടിന്റെ അടിസ്ഥാനത്തില്‍ വിവരസാങ്കേതികവിദ്യ, ആശയവിനിമയ സാങ്കേതികവിദ്യ എന്നീ മേഖലകളില്‍ രാജ്യാന്തര സഹകരണം പ്രോല്‍സാഹിപ്പിക്കുന്നതിന് ഇലക്ട്രോണിക്‌സ്, വിവരസാങ്കേതികവിദ്യ വകുപ്പു മന്ത്രാലയ(എം.ഇ.ഐ.ടി.വൈ.)ത്തെയാണു ചുമതലപ്പെടുത്തിയിരിക്കുന്നത്. വിവരസാങ്കേതികവിദ്യയില്‍ അധിഷ്ഠിതമായ സഹകരണം ഉറപ്പാക്കുന്നതിന് മറ്റു രാജ്യങ്ങളിലെ ബന്ധപ്പെട്ട ഏജന്‍സികളുമായി എം.ഇ.ഐ.ടി.വൈ. കരാറുകളും ധാരണാപത്രങ്ങളും ഒപ്പുവെച്ചിട്ടുണ്ട്. ഡിജിറ്റല്‍ ഇന്ത്യ, മെയ്ക്ക് ഇന്‍ ഇന്ത്യ തുടങ

# Flan

In [64]:
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small").to(device)
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [65]:
def generate_summary(text):
    # Prepare prompt (instruction + input text)
    prompt = f"Summarize the following Malayalam text in malayalam:\n{text}"

    # Tokenize input text
    inputs = flan_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)

    # Generate summary
    summary_ids = flan_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=4,
        early_stopping=True,
        do_sample=False,
    )
    # Decode the output summary
    summary = flan_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [66]:
flan_preds = []

for i, row in tqdm(df.iterrows(), total=len(df), desc="Flan-T5-Small"):
    text, reference = row['text'], row['summary']
    summary = generate_summary(text)
    flan_preds.append(summary)

# Save outputs
df['flan_summary'] = flan_preds

Flan-T5-Small:   0%|          | 0/116 [00:00<?, ?it/s]

In [67]:
compute_metrics(flan_preds, references)

{'ROUGE-1': np.float64(0.0),
 'ROUGE-2': np.float64(0.0),
 'ROUGE-L': np.float64(0.0),
 'BLEU': 0.0}

In [74]:
print_results(flan_preds)

Input Text:
ഇലക്ട്രോണിക്‌സിലും വിവരസാങ്കേതികവിദ്യയിലുമുള്ള ഉഭയകക്ഷിസഹകരണം പ്രോല്‍സാഹിപ്പിക്കാനുള്ള ഇന്ത്യ-അംഗോള ധാരണാപത്രത്തെക്കുറിച്ചു പ്രധാനമന്ത്രി ശ്രീ. നരേന്ദ്ര മോദിയുടെ അധ്യക്ഷതയില്‍ ചേര്‍ന്ന കേന്ദ്ര മന്ത്രിസഭാ യോഗം മുന്‍പാകെ വിശദീകരിക്കപ്പെട്ടു. ഇ-ഭരണം, ഐ.ടി.വിദ്യാഭ്യാസത്തിനുള്ള മനുഷ്യവിഭവശേഷി വികസനം, വിവരസാങ്കേതികവിദ്യാ സുരക്ഷ, ഇലക്ട്രോണിക് ഉല്‍പന്ന നിര്‍മാണം, സോഫ്റ്റ്‌വെയര്‍ വ്യവസായം, ടെലിമെഡിസിന്‍ തുടങ്ങിയ മേഖലകളില്‍ കൂടുതല്‍ സഹകരണം ഉറപ്പാക്കാന്‍ ഉദ്ദേശിച്ചുള്ളതാണു ധാരണാപത്രം. പശ്ചാത്തലം: ഉഭയകക്ഷി, മേഖലാതല സഹകരണത്തിനായുള്ള ചട്ടക്കൂടിന്റെ അടിസ്ഥാനത്തില്‍ വിവരസാങ്കേതികവിദ്യ, ആശയവിനിമയ സാങ്കേതികവിദ്യ എന്നീ മേഖലകളില്‍ രാജ്യാന്തര സഹകരണം പ്രോല്‍സാഹിപ്പിക്കുന്നതിന് ഇലക്ട്രോണിക്‌സ്, വിവരസാങ്കേതികവിദ്യ വകുപ്പു മന്ത്രാലയ(എം.ഇ.ഐ.ടി.വൈ.)ത്തെയാണു ചുമതലപ്പെടുത്തിയിരിക്കുന്നത്. വിവരസാങ്കേതികവിദ്യയില്‍ അധിഷ്ഠിതമായ സഹകരണം ഉറപ്പാക്കുന്നതിന് മറ്റു രാജ്യങ്ങളിലെ ബന്ധപ്പെട്ട ഏജന്‍സികളുമായി എം.ഇ.ഐ.ടി.വൈ. കരാറുകളും ധാരണാപത്രങ്ങളും ഒപ്പുവെച്ചിട്ടുണ്ട്. ഡിജിറ്റല്‍ ഇന്ത്യ, മെയ്ക്ക് ഇന്‍ ഇന്ത്യ തുടങ

# **mT5**


In [49]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [55]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device);

In [58]:
input_text = "summarize: കേരളത്തിൽ ഇന്നത്തെ കാലാവസ്ഥയിൽ പെയ്യുന്ന കനത്ത മഴ യാത്രക്കാർക്ക് വലിയ തടസ്സം സൃഷ്ടിച്ചു."

# Tokenize input
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(model.device)

In [59]:
summary_ids = model.generate(
    input_ids,
    max_length=64,
    num_beams=4,
    early_stopping=True
)

# Decode the output
summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
print("Summary:", summary)


Summary: ['<extra_id_0>.']


In [60]:
def generate_summary(text):
    input_text = "summarize: " + text
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(inputs, max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [61]:
predictions = []
for text in tqdm(texts[:5]):
    pred = generate_summary(text)
    predictions.append(pred)


  0%|          | 0/5 [00:00<?, ?it/s]

In [62]:
print_results(predictions)

Input Text:
ഇലക്ട്രോണിക്‌സിലും വിവരസാങ്കേതികവിദ്യയിലുമുള്ള ഉഭയകക്ഷിസഹകരണം പ്രോല്‍സാഹിപ്പിക്കാനുള്ള ഇന്ത്യ-അംഗോള ധാരണാപത്രത്തെക്കുറിച്ചു പ്രധാനമന്ത്രി ശ്രീ. നരേന്ദ്ര മോദിയുടെ അധ്യക്ഷതയില്‍ ചേര്‍ന്ന കേന്ദ്ര മന്ത്രിസഭാ യോഗം മുന്‍പാകെ വിശദീകരിക്കപ്പെട്ടു. ഇ-ഭരണം, ഐ.ടി.വിദ്യാഭ്യാസത്തിനുള്ള മനുഷ്യവിഭവശേഷി വികസനം, വിവരസാങ്കേതികവിദ്യാ സുരക്ഷ, ഇലക്ട്രോണിക് ഉല്‍പന്ന നിര്‍മാണം, സോഫ്റ്റ്‌വെയര്‍ വ്യവസായം, ടെലിമെഡിസിന്‍ തുടങ്ങിയ മേഖലകളില്‍ കൂടുതല്‍ സഹകരണം ഉറപ്പാക്കാന്‍ ഉദ്ദേശിച്ചുള്ളതാണു ധാരണാപത്രം. പശ്ചാത്തലം: ഉഭയകക്ഷി, മേഖലാതല സഹകരണത്തിനായുള്ള ചട്ടക്കൂടിന്റെ അടിസ്ഥാനത്തില്‍ വിവരസാങ്കേതികവിദ്യ, ആശയവിനിമയ സാങ്കേതികവിദ്യ എന്നീ മേഖലകളില്‍ രാജ്യാന്തര സഹകരണം പ്രോല്‍സാഹിപ്പിക്കുന്നതിന് ഇലക്ട്രോണിക്‌സ്, വിവരസാങ്കേതികവിദ്യ വകുപ്പു മന്ത്രാലയ(എം.ഇ.ഐ.ടി.വൈ.)ത്തെയാണു ചുമതലപ്പെടുത്തിയിരിക്കുന്നത്. വിവരസാങ്കേതികവിദ്യയില്‍ അധിഷ്ഠിതമായ സഹകരണം ഉറപ്പാക്കുന്നതിന് മറ്റു രാജ്യങ്ങളിലെ ബന്ധപ്പെട്ട ഏജന്‍സികളുമായി എം.ഇ.ഐ.ടി.വൈ. കരാറുകളും ധാരണാപത്രങ്ങളും ഒപ്പുവെച്ചിട്ടുണ്ട്. ഡിജിറ്റല്‍ ഇന്ത്യ, മെയ്ക്ക് ഇന്‍ ഇന്ത്യ തുടങ