Task 3: Fine-Tune the NER Model

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np
import evaluate
from pathlib import Path

# 1. CONLL File Parser
def parse_conll(file_path):
    """Parse a CoNLL-format file into parallel token and label sequences.

    Each non-blank line describes one token and its tag; blank lines mark
    sentence boundaries. The expected layout is ``token<TAB>label``. Lines
    that are not exactly two tab-separated fields fall back to generic
    whitespace splitting, taking the first column as the token and the last
    column as the label, so space-delimited or multi-column files are parsed
    instead of being silently dropped. Lines with a single field are skipped.

    Args:
        file_path: Path to the UTF-8 encoded CoNLL file (a leading BOM is
            tolerated).

    Returns:
        A dict with keys ``'tokens'`` and ``'ner_tags'``; each value is a
        list holding one inner list of strings per sentence.
    """
    tokens = []
    labels = []
    current_tokens = []
    current_labels = []

    # 'utf-8-sig' reads plain UTF-8 too, but strips a BOM that would
    # otherwise be glued onto the first token of the file.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            line = line.strip()
            if not line:  # Sentence boundary
                if current_tokens:
                    tokens.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
                continue

            parts = line.split('\t')
            if len(parts) != 2:
                # Not the plain two-column tab layout: retry on any
                # whitespace and keep the first and last columns.
                columns = line.split()
                if len(columns) < 2:
                    continue  # Skip lines that carry no label
                parts = [columns[0], columns[-1]]

            token, label = parts
            # Strip padding around the separator so tags match the label set.
            current_tokens.append(token.strip())
            current_labels.append(label.strip())

    # Add last sentence if file doesn't end with a blank line
    if current_tokens:
        tokens.append(current_tokens)
        labels.append(current_labels)

    return {'tokens': tokens, 'ner_tags': labels}

# 2. Load and Prepare Dataset
# Fail early with a clear message if the annotated corpus is missing.
conll_path = Path("../CoNLL/amharic_ner.conll")  # Update with your actual path
if not conll_path.exists():
    raise FileNotFoundError(f"CONLL file not found at: {conll_path}")

# Parse the corpus and wrap its two parallel columns in a Dataset
conll_data = parse_conll(conll_path)
dataset = Dataset.from_dict(
    {column: conll_data[column] for column in ('tokens', 'ner_tags')}
)

# 3. Define Label Mappings
# The position of a tag in label_list is its integer class id.
label_list = ["O", "B-PRODUCT", "I-PRODUCT", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC"]
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# 4. Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# 5. Tokenization and Alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Args:
        examples: A batch from ``Dataset.map(batched=True)`` with a
            ``"tokens"`` column (one list of words per sentence) and a
            ``"ner_tags"`` column (one list of tag strings per sentence).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list of label ids per sentence, the same length as its
        ``input_ids``. Only the first sub-token of each word carries the
        word's label id; special tokens and continuation sub-tokens get
        -100 so that ``compute_metrics`` (and the loss) skip them.

    Raises:
        KeyError: If a tag is not present in ``label2id``.
    """
    # No padding here: DataCollatorForTokenClassification pads each batch to
    # its longest sequence (labels included), so padding every example to
    # max_length would only add wasted compute on pad tokens.
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=128,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # word_ids maps every sub-token back to the index of its source word
        # (None for special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # Special token: masked out
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word carries the word's label
                label_ids.append(label2id[label[word_idx]])
            else:
                # Continuation sub-token of the same word: masked out
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# 6. Process Dataset
# Swap the raw word/tag columns for the tokenizer's model-ready features.
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=['tokens', 'ner_tags']
)

# 7. Train/Test Split
# Fixed seed so the 80/20 split, and therefore the reported metrics,
# is the same on every run.
split_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

# 8. Initialize Model
# The classification head is sized to the tag set; the id/label maps are
# passed so they are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 9. Training Arguments (Updated for latest Transformers)
training_args = TrainingArguments(
    output_dir="../results/amharic-ner-results",
    eval_strategy="epoch",  # Changed from evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='../logs',
    # NOTE: a run with fewer than 50 optimizer steps never logs a training
    # loss (the results table then shows "No log").
    logging_steps=50,
    # Save every epoch so it lines up with eval_strategy, which
    # load_best_model_at_end relies on.
    save_strategy="epoch",
    save_total_limit=3,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # "f1" key returned by compute_metrics
    # The string "none" disables reporting; passing None falls back to the
    # library default, which is not guaranteed to be "no integrations".
    report_to="none",
)

# 10. Data Collator
# Pads inputs and labels per batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 11. Metrics
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Score one evaluation pass with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2); ``labels`` holds the
            reference label ids, with -100 marking positions to ignore.

    Returns:
        A dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # -100 marks positions that carry no label; drop them from both
            # sequences so predictions and references stay aligned.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        metric: results[f"overall_{metric}"]
        for metric in ("precision", "recall", "f1", "accuracy")
    }

# 12. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    # `tokenizer=` is deprecated on Trainer.__init__ in recent Transformers
    # releases; `processing_class=` is its replacement (needs >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 13. Start Training
print("Starting training...")
trainer.train()

# 14. Save Model
# load_best_model_at_end=True means the best checkpoint by eval F1 is the
# one saved here, together with the tokenizer files.
output_dir = "../models/amharic-ner-model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

# 15. Evaluation
# Trainer.evaluate prefixes the keys returned by compute_metrics with "eval_".
print("\nFinal Evaluation:")
eval_results = trainer.evaluate()
print(f"Precision: {eval_results['eval_precision']:.3f}")
print(f"Recall: {eval_results['eval_recall']:.3f}")
print(f"F1 Score: {eval_results['eval_f1']:.3f}")
print(f"Accuracy: {eval_results['eval_accuracy']:.3f}")

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 50/50 [00:00<00:00, 2461.68 examples/s]
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.616426,0.0,0.0,0.0,0.820847
2,No log,1.311263,0.0,0.0,0.0,0.820847
3,No log,1.148683,0.0,0.0,0.0,0.820847


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model saved to ../models/amharic-ner-model

Final Evaluation:


Precision: 0.000
Recall: 0.000
F1 Score: 0.000
Accuracy: 0.821


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
