## Install Required Libraries

In [7]:
!pip install transformers datasets seqeval torch torchvision torchaudio accelerate -q

## Load and Preprocess CoNLL Data

In [8]:
# Load the data
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [9]:
import pandas as pd
import re

def load_conll_data(file_path):
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line == "":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
            else:
                parts = line.split()
                if len(parts) == 2:
                    sentence.append(parts[0])
                    label.append(parts[1])

    return sentences, labels

# Load data
data_path = "/content/drive/MyDrive/KAIM/Data/labeled_data.conll"
sentences, labels = load_conll_data(data_path)

print(f"✅ Loaded {len(sentences)} sentences for training!")

✅ Loaded 3166 sentences for training!


## Tokenization and Dataset Preparation

In [10]:
from transformers import AutoTokenizer

# Load tokenizer (Choose one: 'xlm-roberta-base', 'bert-tiny-amharic', 'afroxmlr')
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(sentences, labels, tokenizer):
    tokenized_inputs = tokenizer(sentences, truncation=True, padding=True, is_split_into_words=True)

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        new_labels = []

        for word_idx in word_ids:
            if word_idx is None:
                new_labels.append(-100)
            elif word_idx != previous_word_idx:
                new_labels.append(label[word_idx])
            else:
                new_labels.append(label[word_idx])

            previous_word_idx = word_idx

        aligned_labels.append(new_labels)

    return tokenized_inputs, aligned_labels

# Tokenize dataset
tokenized_inputs, aligned_labels = tokenize_and_align_labels(sentences, labels, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

## Set Up Model and Training Arguments

In [11]:
from transformers import AutoModelForTokenClassification, TrainingArguments

# Load pre-trained NER model
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=5)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine-Tune the Model Using Trainer AP

In [17]:
from transformers import Trainer, DataCollatorForTokenClassification
from datasets import Dataset, Features, Sequence, Value, ClassLabel
from sklearn.model_selection import train_test_split

# Split data into train and evaluation sets
train_texts, eval_texts, train_labels, eval_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Tokenize evaluation dataset
eval_tokenized_inputs, eval_aligned_labels = tokenize_and_align_labels(eval_texts, eval_labels, tokenizer)

# 1. Create a unique list of labels and map them to IDs
unique_labels = sorted(list(set([label for sublist in labels for label in sublist])))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}

# Update the model's config with the id2label and label2id mappings
model.config.label2id = label2id
model.config.id2label = id2label

# 2. Convert string labels to numerical IDs using the label2id mapping
def convert_labels_to_ids(labels):
  return [[label2id.get(label, -100) for label in label_list] for label_list in labels]

train_label_ids = convert_labels_to_ids(train_labels)
aligned_labels_ids = convert_labels_to_ids(aligned_labels)
eval_aligned_labels_ids = convert_labels_to_ids(eval_labels)

# Define features for the dataset
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'labels': Sequence(feature=Value(dtype='int64'))  # Specify labels as int64
})

# Create train and evaluation datasets
train_dataset = Dataset.from_dict(
    {"input_ids": tokenized_inputs["input_ids"], "labels": aligned_labels_ids}, # Use numerical label IDs
    features=features
)
eval_dataset = Dataset.from_dict(
    {"input_ids": eval_tokenized_inputs["input_ids"], "labels": eval_aligned_labels_ids}, # Use numerical label IDs
    features=features
)

# Data collator to handle padding
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define Trainer, providing eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Provide eval_dataset
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0734,1.432093
2,0.0114,1.966093
3,0.006,2.155185


TrainOutput(global_step=1188, training_loss=0.1300594984876738, metrics={'train_runtime': 998.4723, 'train_samples_per_second': 9.513, 'train_steps_per_second': 1.19, 'total_flos': 1808076636678780.0, 'train_loss': 0.1300594984876738, 'epoch': 3.0})

In [19]:
!pip install datasets --upgrade -q #upgrade datasets to the latest version

## Evaluate the Model on Validation Set

In [24]:
!pip install evaluate -q #Install the evaluate library

import numpy as np
from evaluate import load

# Load evaluation metric
metric = load("seqeval")

# Function to compute metrics
def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label for label in label_row if label != -100] for label_row in labels]
    true_predictions = [[label for label in pred_row if label != -100] for pred_row in predictions]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"]}

# Evaluate model
results = trainer.evaluate()
print(f"📊 Model Evaluation Results: {results}")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

📊 Model Evaluation Results: {'eval_loss': 2.1551849842071533, 'eval_runtime': 15.911, 'eval_samples_per_second': 39.847, 'eval_steps_per_second': 5.028, 'epoch': 3.0}


## Save and Export the Fine-Tuned Model

In [25]:
# Save model
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")

print("✅ Model saved successfully!")

✅ Model saved successfully!
