## Fine Tune NER Model
To fine-tune a Named Entity Recognition (NER) model that extracts key entities (products, prices, and locations) from Amharic Telegram messages, we will follow the steps below.

 Step 1: Set Up Environment with GPU Support

Use Google Colab or another GPU-enabled environment. In Google Colab, make sure you have selected a runtime with a GPU:

Go to Runtime > Change runtime type > Select GPU.
Install Necessary Libraries

Run the following commands in a code cell to install the required libraries:

In [2]:
# Uncomment below line, and run the cell
!pip install pyarrow==10.0.1 datasets==2.4.0 seqeval

Collecting pyarrow==10.0.1
  Downloading pyarrow-10.0.1.tar.gz (994 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m994.1/994.1 kB[0m [31m450.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting datasets==2.4.0
  Downloading datasets-2.4.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting dill<0.3.6 (from datasets==2.4.0)
  Downloading dill-0.3.5.1-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting requests>=2.19.0 (from datasets==2.4.0)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting t

In [7]:
# %pip install seaborn pyarrow==14.0.0 seqeval transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import Dataset, Features, Sequence, Value
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Install missing packages

import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Function to load CoNLL formatted data
def load_conll(file_path):
    """Read a two-column CoNLL file into a DataFrame with one row per sentence.

    Each non-blank line must contain exactly two whitespace-separated fields,
    ``token label``. Blank lines separate sentences.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file.

    Returns:
        pandas.DataFrame with two columns:
        ``tokens`` (list of str per sentence) and
        ``labels`` (list of str per sentence, aligned with ``tokens``).

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    sentences = []
    labels = []
    sentence = []
    label = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if fields:  # Non-empty line: one token with its tag
                if len(fields) != 2:
                    # Fail with the location instead of an opaque unpacking error.
                    raise ValueError(
                        f"{file_path}:{line_no}: expected 'token label', "
                        f"got {line.rstrip()!r}"
                    )
                token, label_item = fields
                sentence.append(token)
                label.append(label_item)
            elif sentence:
                # Blank line closes the current sentence. Runs of blank lines
                # are ignored so they do not create empty sentence rows.
                sentences.append(sentence)
                labels.append(label)
                sentence = []
                label = []
    # Keep the final sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return pd.DataFrame({'tokens': sentences, 'labels': labels})

# Load the labelled CoNLL file into a DataFrame with one row per sentence
# ('tokens' and 'labels' columns). The path is relative, so it depends on
# the directory the notebook is run from.
df = load_conll('../labeled_data_conll.conll')

In [11]:

# Preview the first rows to sanity-check the parsed tokens and labels.
df.head()

Unnamed: 0,tokens,labels
0,"[3, 120, ዋጋ, 1100, ብር, ውስን, ፍሬ, ነው, ያለው, አድራሻ,...","[O, O, B-PRICE, I-PRICE, I-PRICE, O, O, O, O, ..."
1,"[3, 120, ዋጋ, 1100, ብር, ውስን, ፍሬ, ነው, ያለው, አድራሻ,...","[O, O, B-PRICE, I-PRICE, I-PRICE, O, O, O, O, ..."
2,"[12, ዋጋ, 800, ብር, ውስን, ፍሬ, ነው, ያለን, አድራሻ, ቁ1መገ...","[O, B-PRICE, I-PRICE, I-PRICE, O, O, O, O, O, ..."
3,"[12, ዋጋ, 800, ብር, ውስን, ፍሬ, ነው, ያለን, አድራሻ, ቁ1መገ...","[O, B-PRICE, I-PRICE, I-PRICE, O, O, O, O, O, ..."
4,"[ዋጋ, 900, ብር, ውስን, ፍሬ, ነው, ያለው, አድራሻ, ቁ1መገናኛ, ...","[B-PRICE, I-PRICE, I-PRICE, O, O, O, O, O, O, ..."


In [12]:
# Collect every distinct tag that appears in any sentence.
unique_labels = set(label for sublist in df['labels'] for label in sublist)
# Enumerate the tags in sorted order: iterating a set of strings directly can
# yield a different order on every interpreter start (string hash
# randomisation), which would give each run a different label -> id mapping.
label2id = {label: i for i, label in enumerate(sorted(unique_labels))}
# Inverse mapping, used to turn predicted ids back into tag strings.
id2label = {i: label for label, i in label2id.items()}

In [13]:
# Display the distinct tag set found in the data.
unique_labels

{'B-LOCATION', 'B-PRICE', 'B-PRODUCT', 'I-PRICE', 'O'}

In [14]:
# Swap each sentence's tag strings for their integer ids from label2id.
# NOTE: this overwrites the column in place, so the cell cannot be re-run
# as-is: the second time the values are ints, which are not keys of label2id.
df['labels'] = df['labels'].apply(
    lambda sentence_tags: [label2id[tag] for tag in sentence_tags]
)

In [15]:
# Declare the column types explicitly instead of letting them be inferred:
# every row holds a list of token strings and a parallel list of int32 label ids.
features = Features(
    {
        'tokens': Sequence(feature=Value(dtype='string')),
        'labels': Sequence(feature=Value(dtype='int32')),
    }
)

# Build the Hugging Face Dataset from just the two model-relevant columns.
dataset = Dataset.from_pandas(df[['tokens', 'labels']], features=features)

In [19]:
from transformers import XLMRobertaTokenizerFast

# XLM-RoBERTa: load the fast tokenizer for the "xlm-roberta-base" checkpoint.
# The label-alignment function below calls word_ids() on the tokenizer output.
tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    "xlm-roberta-base", clean_up_tokenization_spaces=True
)

# Define tokenizer function
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Uses the module-level XLM-RoBERTa ``tokenizer``. Every sequence is
    truncated/padded to 128 positions.

    Args:
        examples: Batch mapping with ``'tokens'`` (one list of words per
            sentence) and ``'labels'`` (one list of label ids per sentence,
            parallel to the words).

    Returns:
        The tokenizer output with an added ``'labels'`` entry: one list per
        sentence holding the word's label at the first sub-token of each word
        and -100 everywhere else.
    """
    encoding = tokenizer(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
        max_length=128,  # Adjust as needed
        padding="max_length",
    )

    aligned_labels = []
    for row, word_labels in enumerate(examples['labels']):
        word_ids = encoding.word_ids(batch_index=row)
        row_labels = []
        for pos, word_idx in enumerate(word_ids):
            prev_idx = word_ids[pos - 1] if pos else None
            # Only the first sub-token of a real word keeps the word's label.
            # Positions with no source word (word id None) and continuation
            # sub-tokens get -100 so they are ignored during training.
            starts_word = word_idx is not None and word_idx != prev_idx
            row_labels.append(word_labels[word_idx] if starts_word else -100)
        aligned_labels.append(row_labels)

    encoding['labels'] = aligned_labels
    return encoding

# Tokenize the dataset with the XLM-RoBERTa tokenizer. batched=True hands the
# function a batch of examples per call, which is what it iterates over.

tokenized_xlm_dataset = dataset.map(tokenize_and_align_labels, batched=True)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Map: 100%|██████████| 2419/2419 [00:00<00:00, 2516.97 examples/s]


In [21]:
from transformers import XLMRobertaForTokenClassification, AutoModelForTokenClassification, AutoTokenizer, Trainer
# mBERT: load the tokenizer for the 'bert-base-multilingual-cased' checkpoint.
tokenizer_mbert = AutoTokenizer.from_pretrained(
    'bert-base-multilingual-cased', clean_up_tokenization_spaces=True
)
# Define tokenizer function
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences with mBERT and align the labels.

    Uses the module-level ``tokenizer_mbert``. Every sequence is
    truncated/padded to 128 positions. This definition replaces any earlier
    function of the same name in the notebook.

    Args:
        examples: Batch mapping with ``'tokens'`` (one list of words per
            sentence) and ``'labels'`` (one list of label ids per sentence,
            parallel to the words).

    Returns:
        The tokenizer output with an added ``'labels'`` entry: one list per
        sentence holding the word's label at the first sub-token of each word
        and -100 everywhere else.
    """
    encoding = tokenizer_mbert(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
        max_length=128,  # Adjust as needed
        padding="max_length",
    )

    aligned_labels = []
    for row, word_labels in enumerate(examples['labels']):
        word_ids = encoding.word_ids(batch_index=row)
        row_labels = []
        for pos, word_idx in enumerate(word_ids):
            prev_idx = word_ids[pos - 1] if pos else None
            # Special tokens such as [CLS]/[SEP] (word id None) and
            # continuation sub-tokens get -100 so they are ignored during
            # training; only a word's first sub-token keeps its label.
            starts_word = word_idx is not None and word_idx != prev_idx
            row_labels.append(word_labels[word_idx] if starts_word else -100)
        aligned_labels.append(row_labels)

    encoding['labels'] = aligned_labels
    return encoding

# Tokenize the dataset with the mBERT tokenizer (tokenizer_mbert), processing
# the examples in batches.

tokenized_mbert_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 2419/2419 [00:01<00:00, 2201.55 examples/s]


In [22]:
# DistilBERT: load the tokenizer for the 'distilbert-base-multilingual-cased'
# checkpoint; it is used below to tokenize the dataset.
tokenizer_distilbert = AutoTokenizer.from_pretrained(
    'distilbert-base-multilingual-cased', clean_up_tokenization_spaces=True
)
# Define tokenizer function
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences with DistilBERT and align the labels.

    Uses the module-level ``tokenizer_distilbert``. Every sequence is
    truncated/padded to 128 positions. This definition replaces any earlier
    function of the same name in the notebook.

    Args:
        examples: Batch mapping with ``'tokens'`` (one list of words per
            sentence) and ``'labels'`` (one list of label ids per sentence,
            parallel to the words).

    Returns:
        The tokenizer output with an added ``'labels'`` entry: one list per
        sentence holding the word's label at the first sub-token of each word
        and -100 everywhere else.
    """
    encoding = tokenizer_distilbert(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
        max_length=128,  # Adjust as needed
        padding="max_length",
    )

    aligned_labels = []
    for row, word_labels in enumerate(examples['labels']):
        word_ids = encoding.word_ids(batch_index=row)
        row_labels = []
        for pos, word_idx in enumerate(word_ids):
            prev_idx = word_ids[pos - 1] if pos else None
            # Special tokens such as [CLS]/[SEP] (word id None) and
            # continuation sub-tokens get -100 so they are ignored during
            # training; only a word's first sub-token keeps its label.
            starts_word = word_idx is not None and word_idx != prev_idx
            row_labels.append(word_labels[word_idx] if starts_word else -100)
        aligned_labels.append(row_labels)

    encoding['labels'] = aligned_labels
    return encoding

# Tokenize the dataset with the DistilBERT tokenizer (tokenizer_distilbert),
# processing the examples in batches.
tokenized_distilbert_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 2419/2419 [00:00<00:00, 2567.84 examples/s]


In [23]:
# Split each tokenized dataset into 90% train / 10% validation.
# A fixed seed makes the shuffle reproducible across kernel restarts. All three
# datasets are mapped from the same `dataset`, so sharing one seed should also
# give every model the same validation sentences, keeping their scores comparable.
SPLIT_SEED = 42
train_test_split_xlm = tokenized_xlm_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_test_split_mbert = tokenized_mbert_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_test_split_distilbert = tokenized_distilbert_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)