In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from evaluate import load as load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from peft import get_peft_model, LoraConfig, TaskType
import torch
from typing import List

# Location of the NER input CSV.
# NOTE(review): this is an absolute path at the filesystem root, so the
# notebook only runs on machines where the file lives there — consider making
# the data directory configurable.
DATA_FILE_PATH = '/mkpucit_final_ner_input.csv'
# Load the whole CSV into a DataFrame; later cells read its
# 'Ground_Truth_Tags', 'Urdu_Trans_Baseline' and 'Urdu_Trans_Optimized' columns.
df = pd.read_csv(DATA_FILE_PATH)

# Quick sanity check that the load produced the expected number of rows.
print(f"Loaded dataset with {len(df)} rows.")

def convert_to_iob(tags: List[str]) -> List[str]:
    """Convert a flat per-word tag sequence into IOB-prefixed labels.

    Each non-entity word becomes ``'O'``. The first word of a run of identical
    entity tags becomes ``'B-<tag>'`` and every following word of that run
    becomes ``'I-<tag>'``.

    Non-entity markers are matched case-insensitively and include both
    ``'Other'`` and an already-normalised ``'O'``, so variants such as
    ``'other'`` or ``'O'`` no longer turn into bogus ``'B-other'`` / ``'I-O'``
    labels.

    Limitation: the flat input carries no span boundaries, so two distinct
    entities of the same type that are directly adjacent are emitted as a
    single ``B-``/``I-`` span.

    Args:
        tags: One raw tag string per word, e.g. ``['Person', 'Person', 'Other']``.

    Returns:
        A new list of the same length holding the IOB labels.
    """
    outside_markers = {'o', 'other'}
    iob_tags = []
    previous_tag = 'O'
    for tag in tags:
        # Non-entity word: emit 'O' and break any entity run in progress.
        if tag.lower() in outside_markers:
            iob_tags.append('O')
            previous_tag = 'O'
            continue
        # Same tag as the previous word continues the span; anything else
        # starts a new one.
        prefix = 'I' if tag == previous_tag else 'B'
        iob_tags.append(f'{prefix}-{tag}')
        previous_tag = tag
    return iob_tags

# Parse each row's space-separated tag string into a list of tags, then convert
# that list to IOB labels. Missing cells are replaced with '' first so they
# become an empty tag list instead of the literal string 'nan', which would
# otherwise turn into a bogus 'B-nan' label that is absent from NER_TAGS.
df['Ground_Truth_Tags'] = (
    df['Ground_Truth_Tags']
    .fillna('')
    .astype(str)
    .apply(str.split)
    .apply(convert_to_iob)
)

# Closed label vocabulary for the token classifier; the list position of each
# label is its integer class id.
NER_TAGS = ['O', 'B-Location', 'I-Location', 'B-Person', 'I-Person', 'B-Organization', 'I-Organization']
id2label = dict(enumerate(NER_TAGS))
label2id = {tag: i for i, tag in enumerate(NER_TAGS)}

print(f"Total IOB Labels: {len(NER_TAGS)}")
print(f"Example IOB tag list (first row): {df['Ground_Truth_Tags'].iloc[0]}")

Loaded dataset with 24276 rows.
Total IOB Labels: 7
Example IOB tag list (first row): ['B-Location', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Person', 'I-Person', 'O', 'O', 'O', 'O', 'B-Location', 'O', 'O', 'O']


In [27]:
# Checkpoint identifier used for both the tokenizer here and the
# token-classification model loaded in run_ner_fine_tuning.
MODEL_CHECKPOINT = "xlm-roberta-base"
# Tokenizer for that checkpoint; read as a module-level global by
# tokenize_and_align_labels and handed to the Trainer and its data collator.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_and_align_labels(example, input_column, max_length=128):
    """Tokenize one example and align its word-level IOB labels to sub-tokens.

    The text in ``example[input_column]`` is split on whitespace and passed to
    the module-level ``tokenizer`` as pre-split words. A label id is produced
    for every resulting token:

    * tokens that map to no word (``word_ids()`` yields ``None``) get ``-100``;
    * the first token of each word gets the id of that word's label;
    * every further token of the same word gets ``-100``.

    Args:
        example: A single (un-batched) row exposing ``input_column`` and
            ``"Ground_Truth_Tags"`` (one IOB label string per word).
        input_column: Name of the column that holds the raw text.
        max_length: Truncation length forwarded to the tokenizer. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.

    Raises:
        KeyError: If a ground-truth label is not a key of ``label2id``.

    Note:
        A word with no corresponding ground-truth label silently falls back to
        ``'O'``. Callers are expected to drop rows whose word and tag counts
        differ beforehand (see ``check_word_tag_match``).
    """
    words = str(example[input_column]).split()

    tokenized_inputs = tokenizer(
        words,  # Pass the list of words, not the full string
        truncation=True,
        is_split_into_words=True,
        max_length=max_length,
    )

    ground_truth = example["Ground_Truth_Tags"]
    num_labels = len(ground_truth)

    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():  # single example: no batch index
        if word_idx is None or word_idx == previous_word_idx:
            # No source word, or a continuation piece of the current word:
            # masked with -100.
            label_ids.append(-100)
        elif word_idx < num_labels:
            # First token of a word: use the word's actual IOB label id.
            label_ids.append(label2id[ground_truth[word_idx]])
        else:
            # Defensive fallback when there are more words than labels.
            label_ids.append(label2id['O'])
        previous_word_idx = word_idx

    # The trainer expects a 'labels' key in the output.
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

In [24]:

# 1. LoRA CONFIGURATION (PEFT)
# Adapter settings shared by every fine-tuning run: run_ner_fine_tuning passes
# this object to get_peft_model() for each freshly loaded base model.
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, # Task is Token Classification (NER)
    inference_mode=False,
    r=16, # Rank: Controls the number of trainable parameters
    lora_alpha=16, # Scaling factor (set equal to r here)
    lora_dropout=0.1,
    target_modules=["query", "value"], # Adapt only the modules named "query" and "value" ("key" is NOT targeted); presumably the attention projections in XLM-R — confirm via model.named_modules()
)

# 2. TRAINING ARGUMENTS
# Hyper-parameters shared by every call to run_ner_fine_tuning.
# NOTE(review): because this single object is reused, successive runs write
# their checkpoints into the same output_dir.
training_args = TrainingArguments(
    output_dir="./xlmroberta_ner_results",
    learning_rate=2e-4,
    per_device_train_batch_size=32,  # Batch size (adjust to 16 if you get OOM errors)
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",  # Evaluate once per epoch (replaces a deprecated argument name)
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="epoch",  # Same cadence as eval_strategy, alongside load_best_model_at_end
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA device is present
)

In [25]:
#  EVALUATION METRICS
# Load the "seqeval" metric once at module level; compute_metrics calls its
# .compute(predictions=..., references=...) with lists of label strings.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall precision / recall / F1 / accuracy.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores that are reduced with ``argmax`` over axis 2;
            ``labels`` holds the reference label ids, where ``-100`` marks
            positions to ignore.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from the ``overall_*`` entries returned by the
        module-level ``metric``.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        kept_predictions = []
        kept_references = []
        for predicted_id, reference_id in zip(predicted_row, reference_row):
            # Positions labelled -100 are masked out of the evaluation.
            if reference_id == -100:
                continue
            kept_predictions.append(id2label[predicted_id])
            kept_references.append(id2label[reference_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_references)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    for name in ("precision", "recall", "f1", "accuracy"):
        summary[name] = results["overall_" + name]
    return summary

# 2. MAIN TRAINING FUNCTION
def run_ner_fine_tuning(input_column: str, run_name: str, train_dataset, eval_dataset):
    """Fine-tune a fresh LoRA-wrapped token classifier and return its eval metrics.

    A new base model is loaded from ``MODEL_CHECKPOINT`` on every call, wrapped
    with the module-level ``lora_config``, trained with the module-level
    ``training_args`` and finally evaluated on ``eval_dataset``.

    Args:
        input_column: Name of the text column the datasets were built from;
            only used in the printed banner.
        run_name: Human-readable label for the run; only used in printed output.
        train_dataset: Tokenized training set that includes a ``labels`` column.
        eval_dataset: Tokenized evaluation set that includes a ``labels`` column.

    Returns:
        The metrics dict returned by ``trainer.evaluate()``.
    """
    print("\n=======================================================")
    print(f"| Starting Fine-Tuning for: {run_name} |")
    print(f"| Input Column: {input_column} |")
    print("=======================================================")

    base_model = AutoModelForTokenClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=len(NER_TAGS), id2label=id2label, label2id=label2id
    )

    # Apply LoRA to the base model.
    lora_model = get_peft_model(base_model, lora_config)
    # print_trainable_parameters() writes its own summary and returns None, so
    # it is called as a statement; interpolating it into an f-string printed
    # "LoRA Trainable Params: None".
    print("LoRA Trainable Params:")
    lora_model.print_trainable_parameters()

    # NOTE(review): newer transformers releases appear to deprecate
    # `tokenizer=` on Trainer in favour of `processing_class=` — switch once
    # the minimum supported transformers version is confirmed.
    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer, padding=True),
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate the model on the evaluation set.
    eval_results = trainer.evaluate()
    print(f"\n--- Final Evaluation Results for {run_name} ---")
    print(eval_results)
    print("---------------------------------------------------\n")
    return eval_results

In [28]:
# --- CELL 5: EXECUTION AND COMPARISON (FILTERED & UNBATCHED) ---

# 1. Wrap the DataFrame as a Hugging Face Dataset and carve out a fixed-seed
#    80/20 train/test split in one step; `dataset` ends up holding the
#    'train' and 'test' splits.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)

def check_word_tag_match(example, input_column):
    """Tell whether a row has one tag per whitespace-separated word.

    The value in ``example[input_column]`` is stringified and split on
    whitespace. The row matches only when it has at least one word and exactly
    as many entries in ``example['Ground_Truth_Tags']`` as words.
    """
    word_count = len(str(example[input_column]).split())
    tag_count = len(example['Ground_Truth_Tags'])
    # A row without any words can never be a usable training example.
    if word_count == 0:
        return False
    return word_count == tag_count


def _filter_and_tokenize(splits, input_column, label):
    """Filter both splits to well-formed rows for one column, then tokenize them.

    Args:
        splits: Mapping with 'train' and 'test' datasets.
        input_column: Text column to validate against the tags and to tokenize.
        label: Display name used in progress messages (e.g. 'Baseline').

    Returns:
        ``(train_filtered, eval_filtered, tokenized_train, tokenized_eval)``.
    """
    def keep_row(row):
        return check_word_tag_match(row, input_column)

    def encode_row(row):
        return tokenize_and_align_labels(row, input_column)

    train_filtered = splits['train'].filter(keep_row)
    eval_filtered = splits['test'].filter(keep_row)

    print(f"{label} Train set filtered: {len(train_filtered)}/{len(splits['train'])} rows remaining.")

    print(f"Tokenizing and aligning labels for {label.upper()} data...")
    # NOTE: batched=False is required — tokenize_and_align_labels processes a
    # single example at a time, and batching would cause index errors.
    tokenized_train = train_filtered.map(
        encode_row,
        batched=False,
        remove_columns=splits['train'].column_names
    )
    tokenized_eval = eval_filtered.map(
        encode_row,
        batched=False,
        remove_columns=splits['test'].column_names
    )
    return train_filtered, eval_filtered, tokenized_train, tokenized_eval


# NOTE(review): rows are filtered per input column, so the baseline and
# optimized runs are trained and evaluated on different subsets of the same
# split — keep that in mind when comparing the two F1 scores.

# Baseline run: noisy transliteration column.
(
    baseline_train_filtered,
    baseline_eval_filtered,
    baseline_tokenized_train,
    baseline_tokenized_eval,
) = _filter_and_tokenize(dataset, 'Urdu_Trans_Baseline', 'Baseline')

baseline_metrics = run_ner_fine_tuning(
    input_column='Urdu_Trans_Baseline',
    run_name='BASELINE (Noisy Input)',
    train_dataset=baseline_tokenized_train,
    eval_dataset=baseline_tokenized_eval
)


# Optimized run: normalized transliteration column.
(
    optimized_train_filtered,
    optimized_eval_filtered,
    optimized_tokenized_train,
    optimized_tokenized_eval,
) = _filter_and_tokenize(dataset, 'Urdu_Trans_Optimized', 'Optimized')

optimized_metrics = run_ner_fine_tuning(
    input_column='Urdu_Trans_Optimized',
    run_name='OPTIMIZED (Normalized Input)',
    train_dataset=optimized_tokenized_train,
    eval_dataset=optimized_tokenized_eval
)

# --- FINAL COMPARISON ---
print("\n\n#######################################################")
print("#### FINAL F1-SCORE COMPARISON ####")
print(f"BASELINE (Noisy Data) F1: {baseline_metrics['eval_f1']:.4f}")
print(f"OPTIMIZED (Normalized Data) F1: {optimized_metrics['eval_f1']:.4f}")
print("#######################################################")

Filter:   0%|          | 0/19420 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4856 [00:00<?, ? examples/s]

Baseline Train set filtered: 13790/19420 rows remaining.
Tokenizing and aligning labels for BASELINE data...


Map:   0%|          | 0/13790 [00:00<?, ? examples/s]

Map:   0%|          | 0/3446 [00:00<?, ? examples/s]


| Starting Fine-Tuning for: BASELINE (Noisy Input) |
| Input Column: Urdu_Trans_Baseline |


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 595,207 || all params: 278,053,646 || trainable%: 0.2141
LoRA Trainable Params: None


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mk224072[0m ([33mk224072-national-university-of-computer-and-emerging-sci[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3997,0.373212,0.363184,0.201176,0.258926,0.899059
2,0.3365,0.299042,0.389998,0.290832,0.333193,0.910076
3,0.2855,0.268548,0.432563,0.39537,0.413131,0.917114
4,0.2743,0.254278,0.454126,0.433768,0.443714,0.920483
5,0.2647,0.250001,0.462268,0.447915,0.454978,0.9215



--- Final Evaluation Results for BASELINE (Noisy Input) ---
{'eval_loss': 0.25000113248825073, 'eval_precision': 0.46226772847933256, 'eval_recall': 0.4479147528936248, 'eval_f1': 0.4549780722217039, 'eval_accuracy': 0.9214996256284095, 'eval_runtime': 7.3381, 'eval_samples_per_second': 469.603, 'eval_steps_per_second': 14.718, 'epoch': 5.0}
---------------------------------------------------



Filter:   0%|          | 0/19420 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4856 [00:00<?, ? examples/s]

Optimized Train set filtered: 14234/19420 rows remaining.
Tokenizing and aligning labels for OPTIMIZED data...


Map:   0%|          | 0/14234 [00:00<?, ? examples/s]

Map:   0%|          | 0/3555 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



| Starting Fine-Tuning for: OPTIMIZED (Normalized Input) |
| Input Column: Urdu_Trans_Optimized |


  trainer = Trainer(


trainable params: 595,207 || all params: 278,053,646 || trainable%: 0.2141
LoRA Trainable Params: None


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1542,0.151182,0.598315,0.783088,0.678344,0.947237
2,0.1415,0.14295,0.61466,0.78834,0.69075,0.950481
3,0.1385,0.136584,0.638114,0.80077,0.710248,0.952557
4,0.1352,0.133296,0.65076,0.794993,0.715682,0.954403
5,0.1287,0.132652,0.649562,0.804447,0.718755,0.954094



--- Final Evaluation Results for OPTIMIZED (Normalized Input) ---
{'eval_loss': 0.13265176117420197, 'eval_precision': 0.6495617755159739, 'eval_recall': 0.8044467787114846, 'eval_f1': 0.7187548881589237, 'eval_accuracy': 0.9540940079654233, 'eval_runtime': 6.7487, 'eval_samples_per_second': 526.764, 'eval_steps_per_second': 16.596, 'epoch': 5.0}
---------------------------------------------------



#######################################################
#### FINAL F1-SCORE COMPARISON ####
BASELINE (Noisy Data) F1: 0.4550
OPTIMIZED (Normalized Data) F1: 0.7188
#######################################################
