In [12]:

# Install the latest versions of all required libraries

!pip install -q -U transformers[torch] datasets seqeval evaluate accelerate huggingface_hub

# Import the 'transformers' library and print its version
# This step is for verification. We want to see a version >= 4.x.x
import transformers
print(f"✅ Transformers version installed: {transformers.__version__}")

print("\n\n!! IMPORTANT !!")
print("--> Now, you MUST restart the runtime. Go to 'Runtime' > 'Restart runtime' in the menu above.")
print("--> After restarting, run all cells again starting from the one BELOW this one.")

✅ Transformers version installed: 4.52.4


!! IMPORTANT !!
--> Now, you MUST restart the runtime. Go to 'Runtime' > 'Restart runtime' in the menu above.
--> After restarting, run all cells again starting from the one BELOW this one.


In [1]:



# Import necessary libraries from Hugging Face and standard Python packages.
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
import evaluate


print("All libraries imported successfully.")

All libraries imported successfully.


In [2]:
# Define the complete list of labels your model will predict.
# The 'O' tag is for tokens that don't belong to any entity.
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']

# Create mappings from label names to integer IDs and back.
# The model works with numbers (IDs), but we want to see the names (labels).
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

def parse_conll_file(filepath):
    """
    Parses a CoNLL-formatted file into a Hugging Face Dataset object.

    Args:
        filepath (str): The path to the .conll file.

    Returns:
        Dataset: A Hugging Face Dataset with 'tokens' and 'ner_tags' columns.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read().strip()

    # Each message is separated by a double newline.
    messages = content.split('\n\n')
    data = {'tokens': [], 'ner_tags': []}

    for message in messages:
        tokens, tags = [], []
        for line in message.split('\n'):
            if line: # Ensure the line is not empty
                token, tag = line.split()
                tokens.append(token)
                tags.append(label2id[tag]) # Convert the label name to its integer ID

        # Add the processed message to our data dictionary.
        data['tokens'].append(tokens)
        data['ner_tags'].append(tags)

    return Dataset.from_dict(data)

# Load your labeled data.
print("Loading and parsing CoNLL file...")
raw_dataset = parse_conll_file('/content/labeled_data.conll')

# Split the dataset into a training set (80%) and a testing set (20%).
# The 'seed' ensures the split is the same every time you run this code.
dataset = raw_dataset.train_test_split(test_size=0.2, seed=42)

print("\nDataset successfully loaded and split:")
print(dataset)

Loading and parsing CoNLL file...

Dataset successfully loaded and split:
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 35
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 9
    })
})


In [3]:
# We'll use 'xlm-roberta-base', a powerful multilingual model.
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    """
    Tokenizes text and aligns labels with the new sub-word tokens.
    """
    # Tokenize the input text. 'is_split_into_words=True' tells the tokenizer
    # that our input is already a list of words.
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # For special tokens like [CLS] and [SEP], set the label to -100 to ignore them in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # For the first sub-word token of a word, use the word's label.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For subsequent sub-word tokens of the same word, set the label to -100.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

print("Tokenizing dataset and aligning labels...")
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
print("Tokenization complete.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizing dataset and aligning labels...


Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Tokenization complete.


In [4]:
# Load the model for token classification, configured with our specific labels.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

# The Data Collator will create batches of data for training.
# It handles padding dynamically, which is more efficient.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load the seqeval metric, which is the standard for NER tasks.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """
    Computes precision, recall, F1, and accuracy for the model's predictions.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Convert IDs back to label strings.
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute the results using seqeval.
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

print("Model and metrics are configured.")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and metrics are configured.


In [6]:
# ===================================================================
# CELL: CONFIGURE TRAINING ARGUMENTS (WORKAROUND METHOD)
# ===================================================================

# Step 1: Initialize TrainingArguments with only the arguments that are known to work
# in all versions. We will add the others manually in the next step.

training_args = TrainingArguments(
    output_dir="./results",  # Use a local directory in Colab
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False, # We will handle pushing manually after training
)

print("TrainingArguments initialized with basic parameters.")

TrainingArguments initialized with basic parameters.


In [7]:
# Step 2: Manually set the attributes that were causing the error.
# This bypasses the __init__ check and sets the values directly.

training_args.evaluation_strategy = "epoch"
training_args.save_strategy = "epoch"
training_args.load_best_model_at_end = True
training_args.metric_for_best_model = "f1"

print("Advanced training arguments set successfully.")
print(f"Evaluation strategy: {training_args.evaluation_strategy}")
print(f"Load best model at end: {training_args.load_best_model_at_end}")

Advanced training arguments set successfully.
Evaluation strategy: epoch
Load best model at end: True


In [8]:
trainer = Trainer(
    model=model,
    args=training_args, # It will use the object we just configured
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



# This cell for training.
trainer.train()


  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabeni505[0m ([33mabeni505-addis-ababa-science-and-technology-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


TrainOutput(global_step=25, training_loss=1.2781629180908203, metrics={'train_runtime': 449.3428, 'train_samples_per_second': 0.389, 'train_steps_per_second': 0.056, 'total_flos': 4512166122618.0, 'train_loss': 1.2781629180908203, 'epoch': 5.0})

In [11]:
# In a new cell in your Colab notebook

from huggingface_hub import notebook_login

# This will create a text box for you to paste your token into.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# The format is "YourUsername/YourModelName"
hub_model_id = "abeni505/amharic-ecommerce-ner"


# This command will upload the best model from your training run.
trainer.push_to_hub(commit_message=f"End of training upload for {hub_model_id}")

Uploading...:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/abeni505/results/commit/adce358d21e3bacc9a5b59741c1e28bf7efe5793', commit_message='End of training upload for abeni505/amharic-ecommerce-ner', commit_description='', oid='adce358d21e3bacc9a5b59741c1e28bf7efe5793', pr_url=None, repo_url=RepoUrl('https://huggingface.co/abeni505/results', endpoint='https://huggingface.co', repo_type='model', repo_id='abeni505/results'), pr_revision=None, pr_num=None)

In [13]:
# The best model has already been loaded, now we save it to a final directory.
# This model will be used in all subsequent tasks.
final_model_path = "../models/final-ner-model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Best model saved to {final_model_path}")


Best model saved to ../models/final-ner-model


---
### Task 3 Summary & Results

This notebook successfully fine-tuned the `xlm-roberta-base` model on our custom Amharic NER dataset.


- **Model saved locally to:** `Models/final-ner-model`
- **Model successfully uploaded to Hugging Face Hub at:** `https://huggingface.co/abeni505/amharic-ecommerce-ner`

The final trained model is now ready for use in subsequent tasks by loading it directly from the Hub.
---