In [1]:
# ✅ Install Dependencies
!pip install transformers datasets seqeval accelerate -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.4 MB/s[0m eta [36m0:

In [2]:
# ✅ Upload CoNLL File
from google.colab import files
uploaded = files.upload()
filename = list(uploaded.keys())[0]

Saving labeled_conll_output.txt to labeled_conll_output (1).txt


In [3]:
# ✅Load CoNLL File into DataFrame
def read_conll(file_path):
    data = []
    with open(file_path, encoding='utf-8') as f:
        tokens, labels = [], []
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    data.append((tokens, labels))
                    tokens, labels = [], []
            else:
                splits = line.split()
                if len(splits) == 2:
                    token, label = splits
                    tokens.append(token)
                    labels.append(label)
        if tokens:
            data.append((tokens, labels))
    import pandas as pd
    return pd.DataFrame(data, columns=["tokens", "ner_tags"])

df = read_conll(filename)

In [4]:
#✅ Create Label Mappings
label_list = sorted({label for labels in df['ner_tags'] for label in labels})
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [5]:
# ✅ Tokenize and Align Labels
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import DataCollatorForTokenClassification

def tokenize_and_align_labels(example):
    # Use a fixed tokenizer for mapping labels, will replace per model later if needed
    tokenized_input = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        padding='max_length',
        truncation=True,
        max_length=128
    )
    word_ids = tokenized_input.word_ids()
    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            # Fix: check if word_idx is within range of labels to avoid IndexError
            if word_idx < len(example["ner_tags"]):
                aligned_labels.append(label2id.get(example["ner_tags"][word_idx], -100))
            else:
                aligned_labels.append(-100)
        else:
            aligned_labels.append(-100)
        previous_word_idx = word_idx
    tokenized_input["labels"] = aligned_labels
    return tokenized_input

dataset = Dataset.from_pandas(df)

In [6]:
# ✅ Fine-tuning & Evaluation Function
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import classification_report
import numpy as np

def fine_tune_and_evaluate(model_name):
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    data_collator = DataCollatorForTokenClassification(tokenizer)

    tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

    model = AutoModelForTokenClassification.from_pretrained(
        model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id
    )

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=-1)
        true_labels = [[id2label[l] for l in example if l != -100] for example in labels]
        true_preds = [[id2label[p] for (p, l) in zip(pred, label) if l != -100]
                      for pred, label in zip(predictions, labels)]
        report = classification_report(true_labels, true_preds, output_dict=True, zero_division=0)
        return {
            "precision": report["micro avg"]["precision"],
            "recall": report["micro avg"]["recall"],
            "f1": report["micro avg"]["f1-score"]
        }

    args = TrainingArguments(
        output_dir=f"./{model_name.replace('/', '_')}_NER",
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        logging_steps=10,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer.evaluate()

In [7]:
# ✅ Run Comparison
models_to_test = [
    "xlm-roberta-base",
    "distilbert-base-multilingual-cased",
    "bert-base-multilingual-cased",
    "Davlan/afro-xlmr-base"
]

results = {}
for model_name in models_to_test:
    print(f"\nEvaluating: {model_name}")
    results[model_name] = fine_tune_and_evaluate(model_name)


Evaluating: xlm-roberta-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.384974,0.0,0.0,0.0
2,1.423500,1.009707,0.0,0.0,0.0
3,1.423500,0.774855,0.295082,0.155172,0.20339
4,0.882800,0.665098,0.383333,0.198276,0.261364
5,0.882800,0.632948,0.410714,0.198276,0.267442



Evaluating: distilbert-base-multilingual-cased


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.13105,0.0,0.0,0.0
2,1.310000,0.963799,0.298246,0.146552,0.196532
3,1.310000,0.859882,0.481481,0.224138,0.305882
4,0.879700,0.811687,0.333333,0.25,0.285714
5,0.879700,0.793155,0.345238,0.25,0.29



Evaluating: bert-base-multilingual-cased


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.029508,0.021277,0.008621,0.01227
2,1.207900,0.845595,0.54386,0.267241,0.358382
3,1.207900,0.732084,0.697674,0.258621,0.377358
4,0.767900,0.65682,0.586207,0.293103,0.390805
5,0.767900,0.610612,0.465753,0.293103,0.359788



Evaluating: Davlan/afro-xlmr-base


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.580194,0.014085,0.008621,0.010695
2,1.633900,1.194991,0.0,0.0,0.0
3,1.633900,0.846119,0.058824,0.025862,0.035928
4,0.978500,0.689957,0.421053,0.137931,0.207792
5,0.978500,0.650048,0.44186,0.163793,0.238994


In [12]:
# ✅ Show Comparison
import pandas as pd
pd.DataFrame(results).T.sort_values("eval_f1", ascending=False)


Unnamed: 0,eval_loss,eval_precision,eval_recall,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
bert-base-multilingual-cased,0.610612,0.465753,0.293103,0.359788,18.14,1.985,0.276,5.0
distilbert-base-multilingual-cased,0.793155,0.345238,0.25,0.29,8.9276,4.032,0.56,5.0
xlm-roberta-base,0.632948,0.410714,0.198276,0.267442,17.8981,2.011,0.279,5.0
Davlan/afro-xlmr-base,0.650048,0.44186,0.163793,0.238994,17.8247,2.02,0.281,5.0
