1. Environment Setup (Google Colab)

In [6]:
# Install necessary libraries
!pip install transformers datasets evaluate
!pip install torch





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


2. Load the Dataset

In [7]:
import os
from pathlib import Path

import pandas as pd

# Location of the labelled CSV. The default is the original author's local path;
# set the LABELED_AMH_CSV environment variable to point at the file on any other
# machine instead of editing this cell.
DATA_PATH = Path(
    os.environ.get(
        "LABELED_AMH_CSV",
        r'C:\Users\HP\week 5\Amharic-Named-Entity-Recognition-\data\labeled_amh.csv',
    )
)

# Load the dataset from CSV
df = pd.read_csv(DATA_PATH)

# Check the structure
df.head()


Unnamed: 0,Token,Label
0,ðŸ“£,O
1,ðŸ” ðŸ” ðŸ” ðŸ” ðŸ” ðŸ” ðŸ” ðŸ” ðŸ” ðŸ”,O
2,ðŸ“Ž,O
3,á‹­áˆ„áŠ•áŠ•ðŸ‘‰,O
4,t.me/MerttEka,O


3. Tokenization & Dataset Preparation

In [10]:
import numpy as np
from transformers import AutoTokenizer

# Tokenizer for the "xlm-roberta-base" checkpoint. tokenize_and_align_labels
# calls it with is_split_into_words=True and then reads `.word_ids(...)` from
# the result to map sub-tokens back to their source words.
PRETRAINED_CHECKPOINT = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# Tag inventory and its integer encoding. The position of a tag in `label_list`
# is the id stored in `label_to_id`, so the order here is significant.
label_list = [
    "O",
    "B-Product", "I-Product",
    "B-PRICE", "I-PRICE",
    "B-LOC", "I-LOC",
]  # Add more as per your dataset
label_to_id = dict(zip(label_list, range(len(label_list))))

# Tokenize and align the labels
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and attach one label id per sub-token.

    `examples["Token"]` is handed to the tokenizer with
    `is_split_into_words=True`, and every entry of `examples["Label"]` is
    indexed by word position, so both must hold one sequence per example
    (the words and their tags, in the same order).

    Label assignment per sub-token position:
      * first sub-token of a word -> `label_to_id` entry for that word's tag,
        or -100 when the tag is not in `label_to_id`;
      * later sub-tokens of the same word -> -100;
      * positions whose word id is None -> -100.

    Returns the tokenizer output with an added "labels" entry (one list of
    ids per example).
    """
    ignore_id = -100  # Ignored by the model

    encoded = tokenizer(
        examples["Token"],
        padding="max_length",
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["Label"]):
        row_ids = []
        last_seen = None
        for current in encoded.word_ids(batch_index=row):
            # Only the first sub-token of a real word carries the word's tag.
            starts_new_word = current is not None and current != last_seen
            if starts_new_word:
                row_ids.append(label_to_id.get(word_tags[current], ignore_id))
            else:
                row_ids.append(ignore_id)
            last_seen = current
        aligned_labels.append(row_ids)

    encoded["labels"] = aligned_labels
    return encoded



4. Dataset Loading with Hugging Face's datasets

In [None]:
from datasets import Dataset

# Upper bound on words per training example. Chosen so that, after sub-word
# splitting, most examples still fit the tokenizer's truncation limit.
MAX_WORDS_PER_EXAMPLE = 64


def rows_to_sequences(frame, max_words=MAX_WORDS_PER_EXAMPLE):
    """Group a one-token-per-row frame into word/label sequences.

    `df.head()` shows one token and one label per row, whereas
    `tokenize_and_align_labels` needs, for each example, a list of words and
    a parallel list of labels. This helper builds those lists.

    A row whose Token is missing closes the current sequence; a sequence is
    also closed once it reaches `max_words` words.
    NOTE(review): the CSV shows no explicit sentence/message id, so the
    length cap can cut through an entity span. Replace this rule with the
    real message boundaries if they are available.

    Returns a dict with "Token" and "Label" keys, each a list of lists.
    """
    sequences = {"Token": [], "Label": []}
    words, tags = [], []

    def flush():
        # Emit the sequence collected so far (if any) and start a new one.
        if words:
            sequences["Token"].append(list(words))
            sequences["Label"].append(list(tags))
            words.clear()
            tags.clear()

    for token, tag in zip(frame["Token"], frame["Label"]):
        if pd.isna(token):
            flush()
            continue
        words.append(str(token))
        # Stringify so every label has the same type; a tag that is not in
        # label_to_id is mapped to -100 by tokenize_and_align_labels.
        tags.append(str(tag))
        if len(words) >= max_words:
            flush()
    flush()
    return sequences


# Build a Hugging Face Dataset with one word sequence per example
dataset = Dataset.from_dict(rows_to_sequences(df))

# Tokenize the dataset and align labels
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


5. Fine-Tuning the NER Model

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Share of examples held out for evaluation, and the seed that makes the
# split repeatable across runs.
EVAL_FRACTION = 0.1
SPLIT_SEED = 42

# Load the pre-trained model with a classification head sized to the tag set.
# num_labels must equal len(label_list): the label ids produced during
# tokenization range over every entry of label_to_id, so a smaller head would
# receive out-of-range targets.
id_to_label = {idx: label for label, idx in label_to_id.items()}
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Define training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# `tokenized_dataset` is a single Dataset (it has no "train"/"validation"
# keys), so carve out the evaluation split here.
splits = tokenized_dataset.train_test_split(test_size=EVAL_FRACTION, seed=SPLIT_SEED)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
)

# Fine-tune the model
trainer.train()

6. Save the Fine-Tuned Model

Task 4: Model Comparison & Selection

In [None]:
from evaluate import load

# Metric object used by compute_metrics, which forwards its predictions and
# references to `metric.compute(...)`.
METRIC_NAME = "accuracy"
metric = load(METRIC_NAME)

# Function to compute metrics
def compute_metrics(p):
    """Score token-level predictions, skipping positions labelled -100.

    Parameters
    ----------
    p : pair `(predictions, labels)`
        `predictions` holds per-position class scores (argmax is taken over
        axis 2); `labels` holds the reference ids with -100 at positions that
        must not be scored.

    Returns
    -------
    Whatever `metric.compute` returns for the flattened, filtered
    predictions and references.
    """
    predictions, labels = p
    predicted_ids = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Apply the same mask to both sides so predictions and references stay
    # position-aligned and end up as flat lists of equal length.
    scored = labels != -100
    return metric.compute(
        predictions=predicted_ids[scored].tolist(),
        references=labels[scored].tolist(),
    )