In [1]:
# ✅ Install Required Libraries
!pip install -q transformers datasets seqeval accelerate -q


In [2]:
# ✅ Upload CoNLL-formatted 
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import classification_report

from google.colab import files
uploaded = files.upload()
filename = list(uploaded.keys())[0]


Saving labeled_conll_output.txt to labeled_conll_output (5).txt


In [3]:
# ✅ Load CoNLL File into DataFrame
def read_conll(file_path):
    data = []
    with open(file_path, encoding='utf-8') as f:
        tokens, labels = [], []
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    data.append((tokens, labels))
                    tokens, labels = [], []
            else:
                splits = line.split()
                if len(splits) == 2:
                    token, label = splits
                    tokens.append(token)
                    labels.append(label)
        if tokens:
            data.append((tokens, labels))
    return pd.DataFrame(data, columns=["tokens", "ner_tags"])

df = read_conll(filename)
df.head()


Unnamed: 0,tokens,ner_tags
0,"[Saachi, Electric, Kettle, Borosilicate, Glass...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I..."
1,"[pcs, Bottle, Stopper, በማንኛውም, ጠርሙስ, ጫፍ, የሚገጠም...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, O, O, O, ..."
2,"[pcs, Bottle, Stopper, በማንኛውም, ጠርሙስ, ጫፍ, የሚገጠም...","[O, B-PRODUCT, I-PRODUCT, O, O, O, O, O, O, O,..."
3,"[pairs, Sneaker, Crease, Protector, ዋጋ፦, ብር, ው...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, B..."
4,"[pairs, Sneaker, Crease, Protector, ዋጋ፦, ብር, ው...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, B..."


In [4]:
# ✅ Create Label Mappings
label_list = sorted({label for labels in df['ner_tags'] for label in labels})
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


In [5]:
# ✅ Tokenize and Align Labels
def tokenize_and_align_labels(example):
    tokenized_input = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        padding='max_length',
        truncation=True,
        max_length=128
    )
    word_ids = tokenized_input.word_ids()
    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            if word_idx < len(example["ner_tags"]):
                aligned_labels.append(label2id.get(example["ner_tags"][word_idx], -100))
            else:
                aligned_labels.append(-100)
        else:
            aligned_labels.append(-100)
        previous_word_idx = word_idx
    tokenized_input["labels"] = aligned_labels
    return tokenized_input

dataset = Dataset.from_pandas(df)


In [6]:
# ✅ Fine-tune the Model using Trainer API
def fine_tune_and_evaluate(model_name):
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    data_collator = DataCollatorForTokenClassification(tokenizer)
    tokenized_dataset = dataset.map(tokenize_and_align_labels)

    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=-1)
        true_labels = [[id2label[l] for l in example if l != -100] for example in labels]
        true_preds = [[id2label[p] for (p, l) in zip(pred, label) if l != -100]
                      for pred, label in zip(predictions, labels)]
        report = classification_report(true_labels, true_preds, output_dict=True, zero_division=0)
        return {
            "precision": report["micro avg"]["precision"],
            "recall": report["micro avg"]["recall"],
            "f1": report["micro avg"]["f1-score"]
        }

    args = TrainingArguments(
        output_dir=f"./{model_name.replace('/', '_')}_NER",
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        logging_steps=10,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return model, trainer.evaluate()



In [7]:
# ✅ Run Fine-Tuning for One Model (e.g.
model, results = fine_tune_and_evaluate("xlm-roberta-base")
results



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.349121,0.0,0.0,0.0
2,1.407600,0.957805,0.0,0.0,0.0
3,1.407600,0.705043,0.433333,0.224138,0.295455
4,0.822700,0.610867,0.462687,0.267241,0.338798
5,0.822700,0.578715,0.515625,0.284483,0.366667


{'eval_loss': 0.578714907169342,
 'eval_precision': 0.515625,
 'eval_recall': 0.28448275862068967,
 'eval_f1': 0.3666666666666667,
 'eval_runtime': 13.5509,
 'eval_samples_per_second': 2.657,
 'eval_steps_per_second': 0.369,
 'epoch': 5.0}

In [8]:
# ✅ Save Fine-Tuned Model
model_path = "/content/xlm-roberta-base_amharic_ner"
tokenizer.save_pretrained(model_path)
model.save_pretrained(model_path)
print(f"Model saved to: {model_path}")


Model saved to: /content/xlm-roberta-base_amharic_ner


In [17]:
from google.colab import drive
drive.mount('/content/drive')

# Save the model to Drive
!cp -r /content/xlm-roberta-base_amharic_ner /content/drive/MyDrive/xlm-roberta-base_amharic_ner

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
