<a href="https://colab.research.google.com/github/Ybtry/EthioMart/blob/master/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import os
from datasets import Dataset, Features, Value, ClassLabel, Sequence
from sklearn.model_selection import train_test_split as sk_train_test_split
from transformers import Trainer, TrainingArguments
import numpy as np
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2
from transformers import AutoTokenizer, AutoModelForTokenClassification
from huggingface_hub import notebook_login
from transformers import DataCollatorForTokenClassification
import collections

DATA_PATH = '/content/drive/MyDrive/EthioMart_Data/'

label_list = [
    "O",
    "B-PRODUCT", "I-PRODUCT",
    "B-PRICE", "I-PRICE",
    "B-LOC", "I-LOC",
    "B-DELIVERY_FEE", "I-DELIVERY_FEE",
    "B-CONTACT_INFO", "I-CONTACT_INFO"
]
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for i, label in enumerate(label_list)}

def read_conll_file(file_path):
    data = []
    current_words = []
    current_labels = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            if not line:
                if current_words:
                    data.append({"tokens": current_words, "ner_tags": current_labels})
                current_words = []
                current_labels = []
                continue

            if line.startswith("#"):
                continue

            parts = line.split()
            if len(parts) >= 2:
                current_words.append(parts[0])
                current_labels.append(parts[1])
            else:
                if len(parts) == 1:
                    current_words.append(parts[0])
                    current_labels.append("O")
                else:
                    continue

        if current_words:
            data.append({"tokens": current_words, "ner_tags": current_labels})
    return data

conll_data = read_conll_file(os.path.join(DATA_PATH, 'labeled_telegram_product_price_location.txt'))

if not conll_data:
    raise ValueError("No data loaded from CoNLL file. Please check file path and content.")
print(f"Successfully loaded {len(conll_data)} sequences from CoNLL file.")

train_data_list, eval_data_list = sk_train_test_split(conll_data, test_size=0.2, random_state=42)

features = Features({
    'tokens': Sequence(Value(dtype='string')),
    'ner_tags': Sequence(ClassLabel(names=label_list))
})

train_dataset = Dataset.from_list(train_data_list, features=features)
eval_dataset = Dataset.from_list(eval_data_list, features=features)

print(f"Total sequences: {len(conll_data)}")
print(f"Train sequences: {len(train_dataset)}")
print(f"Eval sequences: {len(eval_dataset)}")
print("\nExample from train dataset:")
print(train_dataset[0])

model_name = "Davlan/xlm-roberta-base-finetuned-amharic"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

print("\nModel's ID to Label mapping:", model.config.id2label)
print("Model's Label to ID mapping:", model.config.label2id)

notebook_login()

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

print("\nExample tokenized and aligned:")
print(tokenized_train_dataset[0])

print("\nChecking tokenized train dataset example labels:")
print(tokenized_train_dataset[0]["labels"])

print("\nChecking tokenized eval dataset example labels:")
print(tokenized_eval_dataset[0]["labels"])

all_train_labels = [label for sublist in tokenized_train_dataset["labels"] for label in sublist if label != -100]
all_eval_labels = [label for sublist in tokenized_eval_dataset["labels"] for label in sublist if label != -100]

print("\nTrain label distribution (excluding -100):")
print(collections.Counter([id_to_label[l] for l in all_train_labels]))

print("\nEval label distribution (excluding -100):")
print(collections.Counter([id_to_label[l] for l in all_eval_labels]))

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    report = classification_report(true_labels, true_predictions, output_dict=True, mode='strict', scheme=IOB2)

    micro_avg = report.get('micro avg', {})

    return {
        "precision": micro_avg.get("precision", 0.0),
        "recall": micro_avg.get("recall", 0.0),
        "f1": micro_avg.get("f1-score", 0.0),
        "accuracy": micro_avg.get("accuracy", 0.0),
    }

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Successfully loaded 8 sequences from CoNLL file.
Total sequences: 8
Train sequences: 6
Eval sequences: 2

Example from train dataset:
{'tokens': ['👋', 'BARDEFU', '2', 'IN', '1', 'Multi', 'purpose', 'juicer', '👉', 'ኳሊቲ', 'የሆነ', 'የጁስ', 'መፍጫ', '👉', 'የጀርመን', 'ቴክኖሎጂ', 'የሆነ', '👉', '3', 'ሌትር', 'ጁስ', 'የሚፈጭ', 'ጆግ', 'ያለው', '👉', 'የብና', 'እና', 'የቅመማ', 'ቅመም', 'መፍጫ', 'ያለው', '👉', '8000Watt', 'የሆነ', '👉', 'ምላጮቹ', 'ጠንካራ', 'የሆኑ', '👉', 'ለቤት', 'እንዲሁም', 'ለስራ', 'የሚሆን', 'አሪፍ', 'እቃ', '👉', 'ለአጠቃቀም', 'ቀላል', '👉', 'በረዶ', 'ይፈጫል', 'ዋጋ:-6800ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለን/', 'Limited', 'Stock', '🏢', 'አድራሻ', 'ቁ.1', '👉', 'መገናኛ', 'ታሜ', 'ጋስ', 'ህንፃ', 'ጎን', 'ስሪ', 'ኤም', 'ሲቲ', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ.', 'SL-05A(ከ', 'ሊፍቱ', 'ፊት', 'ለ', 'ፊት)', '📍ቁ.2', '👉ለቡ', 'መዳህኒዓለም', 'ቤተ/ክርስቲያን', 'ወደ', 'ሙዚቃ', 'ቤት', 'ከፍ', 'ብሎ', '2ኛ', 'ፎቅ', 'ቢሮ.ቁ', '214', '📲', '0909522840', '📲', '0923350054', '👍ለቡ', 'ቅርንጫፍ📲0973611819', '🔖', '💬', 'በTelegram', 'ለማዘዝ', '⤵️', 'ይጠቀሙ', '@shager_onlinestore', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን⤵️', 'https://t.me/Shagero

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-finetuned-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model's ID to Label mapping: {0: 'O', 1: 'B-PRODUCT', 2: 'I-PRODUCT', 3: 'B-PRICE', 4: 'I-PRICE', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-DELIVERY_FEE', 8: 'I-DELIVERY_FEE', 9: 'B-CONTACT_INFO', 10: 'I-CONTACT_INFO'}
Model's Label to ID mapping: {'O': 0, 'B-PRODUCT': 1, 'I-PRODUCT': 2, 'B-PRICE': 3, 'I-PRICE': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-DELIVERY_FEE': 7, 'I-DELIVERY_FEE': 8, 'B-CONTACT_INFO': 9, 'I-CONTACT_INFO': 10}


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]


Example tokenized and aligned:
{'tokens': ['👋', 'BARDEFU', '2', 'IN', '1', 'Multi', 'purpose', 'juicer', '👉', 'ኳሊቲ', 'የሆነ', 'የጁስ', 'መፍጫ', '👉', 'የጀርመን', 'ቴክኖሎጂ', 'የሆነ', '👉', '3', 'ሌትር', 'ጁስ', 'የሚፈጭ', 'ጆግ', 'ያለው', '👉', 'የብና', 'እና', 'የቅመማ', 'ቅመም', 'መፍጫ', 'ያለው', '👉', '8000Watt', 'የሆነ', '👉', 'ምላጮቹ', 'ጠንካራ', 'የሆኑ', '👉', 'ለቤት', 'እንዲሁም', 'ለስራ', 'የሚሆን', 'አሪፍ', 'እቃ', '👉', 'ለአጠቃቀም', 'ቀላል', '👉', 'በረዶ', 'ይፈጫል', 'ዋጋ:-6800ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለን/', 'Limited', 'Stock', '🏢', 'አድራሻ', 'ቁ.1', '👉', 'መገናኛ', 'ታሜ', 'ጋስ', 'ህንፃ', 'ጎን', 'ስሪ', 'ኤም', 'ሲቲ', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ.', 'SL-05A(ከ', 'ሊፍቱ', 'ፊት', 'ለ', 'ፊት)', '📍ቁ.2', '👉ለቡ', 'መዳህኒዓለም', 'ቤተ/ክርስቲያን', 'ወደ', 'ሙዚቃ', 'ቤት', 'ከፍ', 'ብሎ', '2ኛ', 'ፎቅ', 'ቢሮ.ቁ', '214', '📲', '0909522840', '📲', '0923350054', '👍ለቡ', 'ቅርንጫፍ📲0973611819', '🔖', '💬', 'በTelegram', 'ለማዘዝ', '⤵️', 'ይጠቀሙ', '@shager_onlinestore', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን⤵️', 'https://t.me/Shageronlinestore'], 'ner_tags': [0, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.94434,0.0,0.0,0.0,0.0
2,No log,1.736218,0.0,0.0,0.0,0.0
3,No log,1.592459,0.0,0.0,0.0,0.0
4,No log,1.501453,0.0,0.0,0.0,0.0
5,No log,1.457287,0.0,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=5, training_loss=1.7774482727050782, metrics={'train_runtime': 284.768, 'train_samples_per_second': 0.105, 'train_steps_per_second': 0.018, 'total_flos': 5129386877700.0, 'train_loss': 1.7774482727050782, 'epoch': 5.0})