In [1]:
!pip install transformers datasets accelerate peft evaluate seqeval



- Hugging Face Authentication

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

- read and parse the file into a dict

In [3]:
from google.colab import drive
drive.mount('/content/drive')

# conll labeled data path
labeled_data_path = "/content/drive/MyDrive/Telegram_Scraper_Data/my_labeled_data_conll.txt"

# parse labeled data to a dict
def parse_conll_file(file_path):
    data = {"tokens": [], "ner_tags": []}
    current_tokens = []
    current_tags = []
    with open(file_path,'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line: # Blank line indicates new message
                if current_tokens:
                    data["tokens"].append(current_tokens)
                    data["ner_tags"].append(current_tags)
                current_tokens = []
                current_tags = []
            else:
                parts = line.split('\t')
                if len(parts) == 2:
                    current_tokens.append(parts[0])
                    current_tags.append(parts[1])

        if current_tokens:
            data["tokens"].append(current_tokens)
            data["ner_tags"].append(current_tags)
    return data

raw_data_dict = parse_conll_file(labeled_data_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- Create a Dataset object

In [4]:
from datasets import Dataset
dataset = Dataset.from_dict(raw_data_dict)

# Split into train and validation sets 80/20
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Train dataset size: 2880
Eval dataset size: 720


- Numerical IDs for labels

In [5]:
# Gather all unique labels from your dataset
unique_labels = sorted(list(set(label for sublist in raw_data_dict['ner_tags'] for label in sublist)))

# Define the ID to label and label to ID mappings
id2label = {i: label for i, label in enumerate(unique_labels)}
label2id = {label: i for i, label in enumerate(unique_labels)}

print(f"Unique labels found: {unique_labels}")
print(f"Label to ID mapping: {label2id}")

Unique labels found: ['B-LOC', 'B-ORG', 'B-PER', 'B-PRICE', 'B-PRODUCT', 'B-TIME', 'B-TTL', 'I-LOC', 'I-ORG', 'I-PER', 'I-PRICE', 'I-PRODUCT', 'I-TIME', 'I-TTL', 'O']
Label to ID mapping: {'B-LOC': 0, 'B-ORG': 1, 'B-PER': 2, 'B-PRICE': 3, 'B-PRODUCT': 4, 'B-TIME': 5, 'B-TTL': 6, 'I-LOC': 7, 'I-ORG': 8, 'I-PER': 9, 'I-PRICE': 10, 'I-PRODUCT': 11, 'I-TIME': 12, 'I-TTL': 13, 'O': 14}


- Tokenization

In [6]:
from transformers import AutoTokenizer

# MODEL_NAME = "xlm-roberta-base"
# MODEL_NAME = "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
MODEL_NAME = "rasyosef/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True, # Truncate long sequences
        is_split_into_words=True # Tell tokenizer that input is already split into words
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word_idx of None. We set their label to -100 so they are ignored.
            if word_idx is None:
                label_ids.append(-100)
            # We only label the first token of a given word.
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            # For subsequent subword tokens of the same word, label them -100
            # or with the 'I-' tag if preferred, depending on your scheme.
            # Standard practice for BPE tokens is -100 for non-first subwords
            else:
                label_ids.append(-100) # This token is part of a multi-subword token, ignore for loss
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization and alignment to your datasets
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

print("\nExample of tokenization and label alignment for first train sample:")
sample_tokens = tokenized_train_dataset[0]["tokens"]
sample_ner_tags = tokenized_train_dataset[0]["ner_tags"] # Original labels
sample_input_ids = tokenized_train_dataset[0]["input_ids"]
sample_labels = tokenized_train_dataset[0]["labels"] # Aligned labels

print("Original tokens:", sample_tokens)
print("Original NER tags:", sample_ner_tags)
print("Tokenized input IDs:", sample_input_ids)
print("Decoded tokens:", tokenizer.convert_ids_to_tokens(sample_input_ids))
print("Aligned labels (IDs):", sample_labels)
# To understand aligned labels: map sample_labels to actual label strings
aligned_label_strings = [id2label[lid] if lid != -100 else "IGN" for lid in sample_labels]
print("Aligned labels (strings):", aligned_label_strings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/274k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/725k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/720 [00:00<?, ? examples/s]


Example of tokenization and label alignment for first train sample:
Original tokens: ['የትምህርት', 'ሚኒስቴር', 'መምህራን', 'ከወጪ', 'መጋራቱ', 'ክፍያ', 'የራቁና', 'የእነርሱ', 'ወጪ', 'በአገልግሎት', 'የሚታሰብ', 'ነው"', 'ያለ', 'ቢሆንም', 'እኛ', 'ያለ', 'አማራጮች', 'ይህን', 'ውሳኔ', 'መቀበል', 'አንፈልግም', '።']
Original NER tags: ['B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokenized input IDs: [2, 10685, 10345, 14580, 9972, 5686, 12507, 13524, 11400, 9780, 19555, 15940, 9804, 11077, 22737, 10560, 9110, 9015, 6, 9129, 9991, 9474, 9129, 17484, 9364, 9901, 14078, 24659, 2223, 3]
Decoded tokens: ['[CLS]', 'የትምህርት', 'ሚኒስቴር', 'መምህራን', 'ከወ', '##ጪ', 'መጋ', '##ራቱ', 'ክፍያ', 'የራ', '##ቁና', 'የእነ', '##ርሱ', 'ወጪ', 'በአገልግሎት', 'የሚታ', '##ሰብ', 'ነው', '"', 'ያለ', 'ቢሆንም', 'እኛ', 'ያለ', 'አማራጮች', 'ይህን', 'ውሳኔ', 'መቀበል', 'አንፈልግም', '።', '[SEP]']
Aligned labels (IDs): [-100, 1, 8, 14, 14, -100, 14, -100, 14, 14, -100, 14, -100, 14, 14, 14, -100, 14, -100, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, -100

- Model Setup with PEFT(LoRA)

In [7]:
from transformers import AutoModelForTokenClassification
from peft import LoraConfig, get_peft_model, TaskType

# Load the base pre-trained model for token classification
# Use your label mappings here
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, # This should be "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True # <--- ENSURE THIS IS PRESENT AND SET TO True
)
print(f"Model {MODEL_NAME} loaded successfully for fine-tuning, with mismatched head re-initialized.")

# Define LoRA configuration
# r: LoRA attention dimension (common values 8, 16, 32, 64)
# lora_alpha: Scaling factor (common values 16, 32)
# target_modules: Which layers to inject LoRA adapters into. Common for BERT/RoBERTa are 'query', 'value'
# lora_dropout: Dropout probability for LoRA layers
# bias: 'none' is common for LoRA
# task_type: Important for PEFT to know what kind of task (TOKEN_CLS for NER)
lora_config = LoraConfig(
    r=16, # LoRA attention dimension
    lora_alpha=32, # A scaling factor
    target_modules=["query", "value"], # Layers to apply LoRA. Often also include "key", "dense", "classifier" depending on model
    lora_dropout=0.1, # Dropout probability
    bias="none", # Bias type for LoRA layers
    task_type=TaskType.TOKEN_CLS # Indicate that this is a token classification task
)

# Get the PEFT model
peft_model = get_peft_model(model, lora_config)

# Print trainable parameters to see the dramatic reduction
print("\nPEFT Model Trainable Parameters:")
peft_model.print_trainable_parameters()

config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model rasyosef/bert-tiny-amharic loaded successfully for fine-tuning, with mismatched head re-initialized.

PEFT Model Trainable Parameters:
trainable params: 18,319 || all params: 4,152,862 || trainable%: 0.4411


- Training Setup

In [8]:
from transformers import TrainingArguments, DataCollatorForTokenClassification
import evaluate # Hugging Face's evaluate library
import numpy as np # For numerical operations

# Data Collator (handles padding batches)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load metrics
seqeval_metric = evaluate.load("seqeval")

# Function to compute metrics for NER
def compute_metrics(p):
    predictions, labels = p
    # Predictions are logits, so take argmax
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (where label is -100)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute metrics
    results = seqeval_metric.compute(predictions=true_predictions, references=true_labels)

    # Return F1, precision, recall for all labels, and also macro-averaged scores
    # We target F1-score as per your project objectives
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training Arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Telegram_Scraper_Data/ner_peft_model_results",
    learning_rate=2e-5, # Common learning rate for fine-tuning
    num_train_epochs=5, # Number of training epochs (adjust based on validation performance)
    per_device_train_batch_size=8, # Batch size for training (adjust based on GPU memory)
    per_device_eval_batch_size=8, # Batch size for evaluation
    weight_decay=0.01, # Regularization
    eval_strategy="epoch", # <--- CHANGE THIS LINE: from evaluation_strategy to eval_strategy
    save_strategy="epoch", # Save checkpoint at the end of each epoch
    load_best_model_at_end=True, # Load the best model based on evaluation metric at the end
    metric_for_best_model="f1", # Metric to use for best model selection
    report_to="none", # You can set this to "tensorboard" or "wandb" for better logging
    logging_dir='./logs', # Directory for logs
    logging_steps=100, # Log training progress every N steps
)

- Fine Tuning with trainer API

In [9]:
from transformers import Trainer

# Initialize the Trainer
trainer = Trainer(
    model=peft_model, # Your PEFT model
    args=training_args, # Training arguments
    train_dataset=tokenized_train_dataset, # Tokenized training data
    eval_dataset=tokenized_eval_dataset, # Tokenized evaluation data
    data_collator=data_collator, # Data collator for padding
    tokenizer=tokenizer, # Tokenizer (used by collator and for logging)
    compute_metrics=compute_metrics, # Function to compute evaluation metrics
)

# Start training
print("\nStarting PEFT fine-tuning...")
trainer.train()
print("Fine-tuning complete.")

  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



Starting PEFT fine-tuning...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.3192,2.0018,0.005146,0.003138,0.003899,0.862796
2,1.3597,1.153946,0.0,0.0,0.0,0.890785
3,0.8976,0.771334,0.0,0.0,0.0,0.890887
4,0.7187,0.665137,0.0,0.0,0.0,0.890887
5,0.6983,0.644878,0.0,0.0,0.0,0.890887


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fine-tuning complete.


- Evaluation

In [10]:
print("\nEvaluating the fine-tuned model on the evaluation set...")
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")


Evaluating the fine-tuned model on the evaluation set...


Evaluation Results: {'eval_loss': 2.0017998218536377, 'eval_precision': 0.005145797598627788, 'eval_recall': 0.0031380753138075313, 'eval_f1': 0.0038986354775828462, 'eval_accuracy': 0.8627958955602967, 'eval_runtime': 0.8253, 'eval_samples_per_second': 872.46, 'eval_steps_per_second': 109.057, 'epoch': 5.0}


- Save model on drive

In [11]:
# Save the fine-tuned PEFT model
# This saves only the LoRA adapters, which are very small.
# The base model weights are not saved here, as they are frozen.
output_model_dir = "/content/drive/MyDrive/Telegram_Scraper_Data/my_fine_tuned_ner_model"
peft_model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir) # Save the tokenizer alongside

print(f"\nFine-tuned PEFT model saved to: {output_model_dir}")

# To load this model later for inference, you'd load the base model,
# then load the PEFT adapters, and then merge them.
# Example for loading later:
# from peft import PeftModel, PeftConfig
# config = PeftConfig.from_pretrained(output_model_dir)
# base_model = AutoModelForTokenClassification.from_pretrained(config.base_model_name_or_path, num_labels=len(unique_labels))
# loaded_peft_model = PeftModel.from_pretrained(base_model, output_model_dir)
# merged_model = loaded_peft_model.merge_and_unload() # Merge adapters for inference
# merged_model.save_pretrained("./merged_ner_model") # Save the full merged model


Fine-tuned PEFT model saved to: /content/drive/MyDrive/Telegram_Scraper_Data/my_fine_tuned_ner_model
