In [None]:
# Install required packages
# NOTE(review): no versions are pinned on this line, so a fresh environment gets
# whatever releases are current at install time — consider pinning for reproducibility.
%pip install torch transformers datasets tokenizers scikit-learn accelerate seqeval --quiet


In [None]:
import inspect
import json
import warnings

import numpy as np
import torch
from sklearn.model_selection import train_test_split

# Transformers and datasets
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report as seq_classification_report

warnings.filterwarnings('ignore')

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


In [None]:
def load_conll_data(file_path):
    """Load a two-column CoNLL file into parallel lists of tokens and labels.

    Each data row is expected to be ``token<TAB>label``. A blank line (after
    stripping whitespace) ends the current sentence. Any row that does not split
    into exactly two tab-separated fields is skipped; this covers comment lines
    such as ``# sent_id = 1`` as well as malformed rows.

    Rows are NOT skipped merely because they start with ``#``: a row like
    ``#sale<TAB>O`` is a legitimate token (e.g. a hashtag) and dropping it would
    silently remove tokens from the sentence.

    Args:
        file_path: Path to the UTF-8 encoded CoNLL file.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token lists
        and ``labels`` is a list of label lists; ``labels[i][j]`` is the label
        of ``sentences[i][j]``.

    Raises:
        OSError: If the file cannot be opened.
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            # Empty line indicates end of sentence
            if not line:
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                    current_sentence = []
                    current_labels = []
                continue

            # Only well-formed "token<TAB>label" rows are kept. Comment lines
            # carry no tab-separated label, so they are rejected by this same
            # check without discarding tokens that happen to begin with '#'.
            parts = line.split('\t')
            if len(parts) == 2:
                token, label = parts
                current_sentence.append(token)
                current_labels.append(label)

    # Add last sentence if file doesn't end with empty line
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels

# Load the data
conll_file = '../data/conll_labeled/amharic_ecommerce_conll.txt'
sentences, labels = load_conll_data(conll_file)

# Fail early with a clear message instead of an opaque IndexError on
# sentences[0] below when the file yields no parseable sentences.
if not sentences:
    raise ValueError(f"No labeled sentences could be parsed from {conll_file}")

print(f"Loaded {len(sentences)} sentences")
print(f"Example sentence: {sentences[0]}")
print(f"Example labels: {labels[0]}")


In [None]:
# Build the label vocabulary: every distinct tag seen in any sentence
unique_labels = set().union(*labels)

# Sorted so the tag -> id assignment is the same on every run
label_list = sorted(unique_labels)
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

print(f"Unique labels: {label_list}")
print(f"Number of labels: {len(label_list)}")

# Encode each sentence's string tags as integer ids
label_ids = [
    [label2id[tag] for tag in sentence_tags]
    for sentence_tags in labels
]

# 80/20 train/validation split with a fixed seed
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences,
    label_ids,
    test_size=0.2,
    random_state=42,
)

print(f"Training sentences: {len(train_sentences)}")
print(f"Validation sentences: {len(val_sentences)}")


In [None]:
# Wrap the two splits as Dataset objects with 'tokens' / 'ner_tags' columns
train_dataset = Dataset.from_dict(dict(tokens=train_sentences, ner_tags=train_labels))
val_dataset = Dataset.from_dict(dict(tokens=val_sentences, ner_tags=val_labels))

print("Datasets created successfully")
print(f"Train dataset: {train_dataset}")
print(f"Validation dataset: {val_dataset}")

# Show the first two training examples with their tags mapped back to strings
print(f"\nExample training data:")
for i in range(2):
    print(f"Sentence {i+1}: {train_sentences[i]}")
    print(f"Labels {i+1}: {[id2label[tag_id] for tag_id in train_labels[i]]}")
    print()


In [None]:
# Model name - using XLM-Roberta for multilingual support
model_name = "xlm-roberta-base"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Loaded tokenizer for {model_name}")
print(f"Vocab size: {tokenizer.vocab_size}")

# Load the model for token classification, with a classification head sized to
# the label vocabulary and the id <-> tag mappings stored in the model config
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

print(f"Model loaded with {len(label_list)} labels")

# Assign the result instead of leaving `model.to(device)` as the cell's last
# expression: a bare expression makes the notebook render the entire module
# repr as cell output. `Module.to` returns the module itself.
model = model.to(device)


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and build per-sub-token label rows.

    ``examples["tokens"]`` is passed to the module-level ``tokenizer`` as
    already-split words (truncated to 512 positions, no padding), and
    ``examples["ner_tags"]`` holds one label per word. For every position of
    the encoded sequence the label is:

    * the word's label, if the position is the first sub-token of that word;
    * ``-100`` otherwise (positions with no source word, and continuation
      sub-tokens of a word).

    Returns the tokenizer output with the aligned rows stored under ``"labels"``.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=False,
        max_length=512,
    )

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Only a position that maps to a word AND opens that word is labeled
            starts_new_word = word is not None and word != last_word
            row_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Run the alignment function over both splits in batched mode (train first, then validation)
train_tokenized, val_tokenized = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, val_dataset)
)

print("Tokenization completed")
print(f"Train tokenized: {train_tokenized}")
print(f"Validation tokenized: {val_tokenized}")


In [None]:
# Collator used by the Trainer to assemble batches; padding is enabled here
# because the tokenization step above was run with padding=False
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

# Evaluation metrics
def compute_metrics(eval_pred):
    """Compute precision, recall and F1 from a ``(predictions, labels)`` pair.

    The predicted class at each position is the argmax over axis 2 of the
    predictions. Positions whose gold label is ``-100`` are dropped from both
    the predicted and the gold sequences; the remaining ids are mapped to tag
    strings through the module-level ``id2label`` before scoring.

    Returns a dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    scores, gold_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept_pred = []
        kept_gold = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # Ignored position: excluded from scoring on both sides
                continue
            kept_pred.append(id2label[pred_id])
            kept_gold.append(id2label[gold_id])
        true_predictions.append(kept_pred)
        true_labels.append(kept_gold)

    return {
        'precision': precision_score(true_labels, true_predictions),
        'recall': recall_score(true_labels, true_predictions),
        'f1': f1_score(true_labels, true_predictions),
    }

print("Evaluation function defined")


In [None]:
# Training arguments
# The install cell does not pin transformers, and newer releases renamed
# `evaluation_strategy` to `eval_strategy`. Pick whichever keyword the installed
# version accepts so this cell runs on both old and new releases.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="../models/xlm-roberta-amharic-ner",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    # Evaluation and save strategies must match for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="../logs",
    logging_steps=10,
    save_total_limit=2,
    report_to="none",  # Disable wandb logging
    **{eval_strategy_kwarg: "epoch"},
)

# Initialize trainer
# Likewise, newer releases take the tokenizer through `processing_class`
# instead of `tokenizer`; choose the keyword from the installed signature.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

print("Training arguments configured")
print("Trainer initialized")


In [None]:
# Fine-tune the model
print("Starting training...")
trainer.train()
print("Training completed!")

# Score the trained model on the validation split and print every reported value
eval_results = trainer.evaluate()
print("Evaluation Results:")
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]:.4f}")

# Persist the model weights/config and the tokenizer side by side
model_save_path = "../models/xlm-roberta-amharic-ner-final"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Store both label mappings in one JSON file next to the model
label_mappings = dict(id2label=id2label, label2id=label2id)
mappings_file = f"{model_save_path}/label_mappings.json"

with open(mappings_file, 'w', encoding='utf-8') as f:
    json.dump(label_mappings, f, ensure_ascii=False, indent=2)

print(f"Model saved to {model_save_path}")
print(f"Label mappings saved to {mappings_file}")
