In [None]:
# Install required packages
%pip install torch transformers datasets tokenizers scikit-learn accelerate seqeval --quiet
import pandas as pd
import numpy as np
import torch
import time
from sklearn.model_selection import train_test_split

# Transformers and datasets
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

import warnings
warnings.filterwarnings('ignore')

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")


In [None]:
# Load the same data processing functions from Task 3
def load_conll_data(file_path):
    """Read a two-column, tab-separated CoNLL file into parallel token/label lists.

    Each data line must be ``token<TAB>label``. Blank lines end the current
    sentence. Any other line (``#`` comments, lines without exactly two
    tab-separated fields) is ignored.

    Args:
        file_path: Path of the CoNLL text file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is a list of label lists; ``labels[i][j]`` is the
        label of ``sentences[i][j]``.
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise end up glued to the first token of the file.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()

            if not line:
                # Blank line: close the sentence collected so far (if any).
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                    current_sentence = []
                    current_labels = []
                continue

            parts = line.split('\t')
            if len(parts) == 2:
                # Check the token/label shape BEFORE the comment check, so a
                # token that itself starts with '#' (e.g. a hashtag) is kept
                # instead of being mistaken for a comment line.
                token, label = parts
                current_sentence.append(token)
                current_labels.append(label)
                continue

            if line.startswith('#'):
                # Comment / metadata line.
                continue
            # Anything else is malformed for this format and is skipped.

    # The file may end without a trailing blank line.
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels

# Read the labelled corpus from disk
conll_file = '../data/conll_labeled/amharic_ecommerce_conll.txt'
sentences, labels = load_conll_data(conll_file)

# Build the label vocabulary: sorted so that the id assignment is deterministic
unique_labels = {tag for sentence_tags in labels for tag in sentence_tags}
label_list = sorted(unique_labels)
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

# Encode every sentence's tags as integer ids, then hold out 20% for validation
label_ids = [[label2id[tag] for tag in sentence_tags] for sentence_tags in labels]
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences,
    label_ids,
    test_size=0.2,
    random_state=42,
)

print(f"Data loaded: {len(sentences)} sentences, {len(label_list)} labels")
print(f"Train: {len(train_sentences)}, Validation: {len(val_sentences)}")


In [None]:
# Every checkpoint under consideration (display name -> Hugging Face model id)
models_to_compare = {
    'XLM-Roberta': 'xlm-roberta-base',
    'DistilBERT': 'distilbert-base-multilingual-cased',
    'mBERT': 'bert-base-multilingual-cased',
    'XLM-Roberta-Large': 'xlm-roberta-large'  # Optional: if resources allow
}

# The comparison runs on the smaller checkpoints only, i.e. everything
# except the large XLM-R variant.
quick_models = {
    display_name: checkpoint
    for display_name, checkpoint in models_to_compare.items()
    if display_name != 'XLM-Roberta-Large'
}

print("Models selected for comparison:")
for name, model_id in quick_models.items():
    print(f"- {name}: {model_id}")


In [None]:
# Model training and comparison function
def train_and_evaluate_model(model_name, model_id, train_dataset, val_dataset, id2label, label2id):
    """Fine-tune one checkpoint for token classification and score it on the validation set.

    Args:
        model_name: Display name; printed and (lower-cased) used in the output
            and logging directory names.
        model_id: Checkpoint identifier handed to ``from_pretrained``.
        train_dataset: Dataset with ``tokens`` and ``ner_tags`` columns used for training.
        val_dataset: Dataset with the same two columns used for evaluation.
        id2label: Mapping from integer label id to label string.
        label2id: Mapping from label string to integer label id.

    Returns:
        A dict with keys ``model_name``, ``model_id``, ``training_time``
        (seconds, measured from before the checkpoint is loaded until after
        the final evaluation), ``f1_score``, ``precision``, ``recall``,
        ``model`` and ``tokenizer``.
    """
    # Local import: only needed for the library-version checks further down.
    import inspect

    # Label id that marks positions excluded from loss and metrics
    # (special tokens and non-first sub-word pieces).
    IGNORE_INDEX = -100

    print(f"\n{'='*50}")
    print(f"Training {model_name} ({model_id})")
    print(f"{'='*50}")

    start_time = time.time()

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForTokenClassification.from_pretrained(
        model_id,
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id
    )

    def tokenize_and_align_labels(examples):
        """Tokenize pre-split words and project word-level tags onto sub-word tokens."""
        tokenized_inputs = tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True,
            padding=False,
            max_length=512
        )

        aligned_labels = []
        for batch_index, word_labels in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
            previous_word_idx = None
            token_labels = []

            for word_idx in word_ids:
                if word_idx is None or word_idx == previous_word_idx:
                    # No source word, or a continuation piece of the same word:
                    # only the first piece of each word carries the label.
                    token_labels.append(IGNORE_INDEX)
                else:
                    token_labels.append(word_labels[word_idx])
                previous_word_idx = word_idx

            aligned_labels.append(token_labels)

        tokenized_inputs["labels"] = aligned_labels
        return tokenized_inputs

    # Tokenize datasets
    train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
    val_tokenized = val_dataset.map(tokenize_and_align_labels, batched=True)

    # Pads inputs and labels per batch (tokenization above does no padding).
    data_collator = DataCollatorForTokenClassification(
        tokenizer=tokenizer,
        padding=True
    )

    def compute_metrics(eval_pred):
        """Compute entity-level precision/recall/F1 over the non-ignored positions."""
        logits, gold = eval_pred
        predicted = np.argmax(logits, axis=2)

        true_predictions = []
        true_labels = []
        for predicted_row, gold_row in zip(predicted, gold):
            kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != IGNORE_INDEX]
            true_predictions.append([id2label[p] for p, _ in kept])
            true_labels.append([id2label[g] for _, g in kept])

        return {
            'precision': precision_score(true_labels, true_predictions),
            'recall': recall_score(true_labels, true_predictions),
            'f1': f1_score(true_labels, true_predictions),
        }

    # Training arguments
    training_kwargs = dict(
        output_dir=f"../models/{model_name.lower()}-amharic-ner",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,  # Reduced for comparison speed
        weight_decay=0.01,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir=f"../logs/{model_name.lower()}",
        logging_steps=10,
        save_total_limit=1,
        report_to="none",
    )
    # Newer transformers releases call this argument `eval_strategy`; older
    # ones only know `evaluation_strategy`. Use whichever name the installed
    # version accepts so the notebook runs on both.
    training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in training_arg_names:
        training_kwargs["eval_strategy"] = "epoch"
    else:
        training_kwargs["evaluation_strategy"] = "epoch"
    training_args = TrainingArguments(**training_kwargs)

    # Same idea for Trainer: newer releases take the tokenizer as
    # `processing_class`, older ones as `tokenizer`.
    trainer_arg_names = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    # Train
    trainer.train()

    # Evaluate
    eval_results = trainer.evaluate()
    training_time = time.time() - start_time

    print(f"Training completed in {training_time:.2f} seconds")
    print(f"Results: F1={eval_results['eval_f1']:.4f}, P={eval_results['eval_precision']:.4f}, R={eval_results['eval_recall']:.4f}")

    return {
        'model_name': model_name,
        'model_id': model_id,
        'training_time': training_time,
        'f1_score': eval_results['eval_f1'],
        'precision': eval_results['eval_precision'],
        'recall': eval_results['eval_recall'],
        'model': model,
        'tokenizer': tokenizer
    }


In [None]:
# Create datasets
train_dataset = Dataset.from_dict({'tokens': train_sentences, 'ner_tags': train_labels})
val_dataset = Dataset.from_dict({'tokens': val_sentences, 'ner_tags': val_labels})

# Run comparison for all models
results = []

for model_name, model_id in quick_models.items():
    print(f"\n{'='*60}")
    print(f"Training and evaluating: {model_name}")
    print(f"{'='*60}")

    try:
        result = train_and_evaluate_model(
            model_name, model_id, train_dataset, val_dataset, id2label, label2id
        )
        results.append(result)

        # Save intermediate results after every model so a later failure does
        # not lose the scores collected so far. The 'model' and 'tokenizer'
        # entries are live objects; writing them to CSV would only store their
        # repr strings, so they are left out of the file (they stay in `results`).
        partial_df = pd.DataFrame(results).drop(columns=['model', 'tokenizer'], errors='ignore')
        partial_df.to_csv('../model_comparison_partial.csv', index=False)

    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"Error training {model_name}: {str(e)}")
        continue

print(f"\nCompleted training {len(results)} models successfully!")


In [None]:
# Generate comparison results and analysis
if results:
    # One row per trained model, best F1 first
    comparison_df = pd.DataFrame(results).sort_values('f1_score', ascending=False)

    print("\n" + "="*60)
    print("MODEL COMPARISON RESULTS")
    print("="*60)
    print(comparison_df[['model_name', 'f1_score', 'precision', 'recall', 'training_time']].to_string(index=False))

    # Save results. The 'model' and 'tokenizer' columns hold live objects whose
    # repr would be dumped into the CSV, so they are excluded from the file;
    # `comparison_df` itself still carries them.
    comparison_df.drop(columns=['model', 'tokenizer'], errors='ignore').to_csv(
        '../model_comparison_results.csv', index=False
    )

    # Best model analysis (first row after sorting by F1, descending)
    best_model = comparison_df.iloc[0]
    print(f"\n BEST MODEL: {best_model['model_name']}")
    print(f"F1 Score: {best_model['f1_score']:.4f}")
    print(f"Precision: {best_model['precision']:.4f}")
    print(f"Recall: {best_model['recall']:.4f}")
    print(f"Training Time: {best_model['training_time']:.2f} seconds")

    # Performance analysis
    # idxmin returns an index label, which .loc resolves correctly even though
    # the rows were reordered by the sort above.
    fastest_model_name = comparison_df.loc[comparison_df['training_time'].idxmin(), 'model_name']
    print("\n PERFORMANCE ANALYSIS:")
    print(f"• Best F1 Score: {comparison_df['f1_score'].max():.4f}")
    print(f"• Average F1 Score: {comparison_df['f1_score'].mean():.4f}")
    print(f"• Fastest Training: {fastest_model_name} ({comparison_df['training_time'].min():.2f}s)")
    print(f"• Average Training Time: {comparison_df['training_time'].mean():.2f} seconds")

    print("\n Task 4 completed! Results saved to model_comparison_results.csv")
else:
    print("No models were successfully trained. Please check the setup and try again.")
