In [None]:
# Task 3: Fine Tune NER Model for Amharic Telegram Messages

import pandas as pd
from datasets import Dataset
from transformers import (
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    XLMRobertaForTokenClassification,
    XLMRobertaTokenizerFast,
)

# Step 2: Load the labeled dataset in CoNLL format
def load_conll_data(file_path):
    """Load a two-column CoNLL file as a list of sentences.

    Each non-blank line must hold exactly two whitespace-separated fields,
    ``token label``. One or more blank (or whitespace-only) lines separate
    sentences.

    Args:
        file_path: Path of the CoNLL file to read.

    Returns:
        A list of ``(tokens, labels)`` tuples, one per sentence, where
        ``tokens`` and ``labels`` are parallel lists of strings. Empty
        sentences are never emitted, even when the file starts with blank
        lines or contains several blank lines in a row.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
        OSError: If the file cannot be opened.
    """
    data = []
    sentence = []
    labels = []
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise become part of the first token.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Sentence boundary. Only flush when something was collected,
                # so repeated blank lines do not create empty sentences.
                if sentence:
                    data.append((sentence, labels))
                    sentence = []
                    labels = []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}: line {line_number}: expected 'token label', "
                    f"got {line.rstrip()!r}"
                )
            token, label = fields
            sentence.append(token)
            labels.append(label)
    if sentence:  # Last sentence when the file does not end with a blank line
        data.append((sentence, labels))
    return data

# Path to the labeled data in CoNLL format (relative to the working directory)
conll_file_path = '../output/labeled_telegram_data.conll'
# List of (tokens, labels) tuples, one per sentence, as returned by load_conll_data
data = load_conll_data(conll_file_path)

# Convert data into a DataFrame: one row per sentence, with the token list in
# the 'tokens' column and the parallel label list in the 'labels' column
df = pd.DataFrame(data, columns=['tokens', 'labels'])

# Step 3: Tokenize the data and align the labels with tokens
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-word tokens.

    Relies on the module-level ``tokenizer`` (a fast tokenizer, needed for
    ``word_ids``) and ``label_map`` (label name -> integer id).

    Args:
        examples: A batch with a ``'tokens'`` entry (list of word lists) and a
            ``'labels'`` entry (list of label-name lists parallel to the words).

    Returns:
        The tokenizer output for the batch with an added ``'labels'`` entry:
        one list of integer ids per sentence, the same length as that
        sentence's ``input_ids``. The first sub-word token of each word
        carries the word's label id; every other position is -100 so the
        loss ignores it. This covers special and padding tokens,
        continuation sub-words, and labels missing from ``label_map``.
    """
    tokenized_inputs = tokenizer(examples['tokens'], is_split_into_words=True, padding=True, truncation=True)

    labels = []
    for i, word_labels in enumerate(examples['labels']):
        # word_ids[k] is the index of the source word for token k, or None
        # for special/padding tokens.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id or word_id >= len(word_labels):
                # Special/padding token, continuation of the previous word, or
                # a word with no label: excluded from the loss.
                label_ids.append(-100)
            else:
                # First sub-word of a new word: use that word's label.
                label_ids.append(label_map.get(word_labels[word_id], -100))
            previous_word_id = word_id

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Load the tokenizer for the chosen checkpoint
model_name = "xlm-roberta-base"  # can be replaced with "bert-tiny-amharic" or "afroxlmr"
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)

# Map label names to IDs (sorted so the mapping is identical on every run)
label_list = sorted({label for labels in df['labels'] for label in labels})
label_map = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label_map.items()}

# Create a Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=['tokens', 'labels'])

# Hold out 10% of the sentences for evaluation so the reported metrics are not
# computed on the training data. The fixed seed keeps the split reproducible.
# A split needs at least two rows; with fewer, fall back to the full set.
if len(tokenized_dataset) >= 2:
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset['train']
    eval_dataset = split_dataset['test']
else:
    train_dataset = eval_dataset = tokenized_dataset

# Step 4: Set up training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='../results',            # output directory
    evaluation_strategy="epoch",       # evaluate every epoch
    learning_rate=2e-5,                # learning rate
    per_device_train_batch_size=16,    # batch size for training
    per_device_eval_batch_size=16,     # batch size for evaluation
    num_train_epochs=3,                 # total number of training epochs
    weight_decay=0.01,                  # strength of weight decay
)

# Step 5: Use Hugging Face's Trainer API to fine-tune the model
# Passing id2label/label2id stores the label names in the saved config, so
# predictions come back as label names instead of generic LABEL_<n> strings.
model = XLMRobertaForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label_map,
)

# Pads 'labels' (with -100) together with the inputs, so a batch may mix rows
# that were padded to different lengths during tokenization.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Step 6: Evaluate the fine-tuned model on the held-out validation set
results = trainer.evaluate()
print("Evaluation results:", results)

# Step 7: Save the fine-tuned model
model.save_pretrained('../models/fine_tuned_ner_model')
tokenizer.save_pretrained('../models/fine_tuned_ner_model')

print("Model and tokenizer saved successfully!")
