In [None]:
!pip install  seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=432716500a73ebdda8c126e827f6ed8090163e991b1b52e73e384c6d86187816
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [4]:
from google.colab import drive
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import f1_score, precision_score, recall_score
import torch
import time
from pathlib import Path
import os
import gc
from transformers.trainer_utils import EvalPrediction

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Define paths and configurations
CONLL_FILE = "/content/drive/My Drive/kifiya5/labeled_data.conll"
MODEL_CONFIGS = {
    "xlm-roberta-base": {
        "output_dir": "./ner_model_xlmroberta",
        "gdrive_output": "/content/drive/My Drive/kifiya5/ner_model_xlmroberta",
        "training_args": {
            "learning_rate": 2e-5,
            "per_device_train_batch_size": 16,
            "per_device_eval_batch_size": 16,
            "num_train_epochs": 3,
            "weight_decay": 0.01
        }
    },
    "distilbert-base-multilingual-cased": {
        "output_dir": "./ner_model_distilbert",
        "gdrive_output": "/content/drive/My Drive/kifiya5/ner_model_distilbert",
        "training_args": {
            "learning_rate": 5e-5,
            "per_device_train_batch_size": 32,
            "per_device_eval_batch_size": 32,
            "num_train_epochs": 5,
            "weight_decay": 0.01
        }
    },
    "bert-base-multilingual-cased": {
        "output_dir": "./ner_model_mbert",
        "gdrive_output": "/content/drive/My Drive/kifiya5/ner_model_mbert",
        "training_args": {
            "learning_rate": 2e-5,
            "per_device_train_batch_size": 16,
            "per_device_eval_batch_size": 16,
            "num_train_epochs": 3,
            "weight_decay": 0.01
        }
    }
}
label_list = ["O", "B-PRODUCT", "I-PRODUCT", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE"]
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

def read_conll(file_path):
    """
    Read CoNLL file from Google Drive and convert to list of sentences with tokens and labels.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"CoNLL file not found at {file_path}")

    sentences = []
    current_sentence = {"tokens": [], "labels": []}

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    token, label = line.split()
                    current_sentence["tokens"].append(token)
                    current_sentence["labels"].append(label)
                except ValueError:
                    print(f"Skipping malformed line: {line}")
            else:
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "labels": []}

    if current_sentence["tokens"]:
        sentences.append(current_sentence)

    return sentences

def prepare_dataset(sentences):
    """
    Convert sentences to Hugging Face Dataset format.
    """
    data = {"tokens": [s["tokens"] for s in sentences], "ner_tags": [s["labels"] for s in sentences]}
    dataset = Dataset.from_dict(data)

    def convert_labels(example):
        example["ner_tags"] = [label2id[label] for label in example["ner_tags"]]
        return example

    dataset = dataset.map(convert_labels)
    return dataset

def tokenize_and_align_labels(examples, tokenizer):
    """
    Tokenize inputs and align labels with tokenized inputs.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_str = id2label[label[word_idx]]
                if label_str.startswith("B-"):
                    label_str = "I-" + label_str[2:]
                    if label_str not in label2id:
                        label_str = "O"
                    label_ids.append(label2id[label_str])
                else:
                    label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def compute_metrics(eval_pred: EvalPrediction):
    """
    Compute precision, recall, and F1-score for NER evaluation.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    predicted_labels = [[id2label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

    return {
        "precision": precision_score(true_labels, predicted_labels),
        "recall": recall_score(true_labels, predicted_labels),
        "f1": f1_score(true_labels, predicted_labels)
    }

def fine_tune_model(model_name, config):
    """
    Fine-tune a model for NER if not already fine-tuned.
    """
    try:
        # Check if model exists
        if not os.path.exists(config["gdrive_output"]) or not os.path.exists(os.path.join(config["gdrive_output"], "pytorch_model.bin")):
            print(f"Fine-tuning {model_name} as model not found at {config['gdrive_output']}")
            # Load dataset
            sentences = read_conll(CONLL_FILE)
            dataset = prepare_dataset(sentences)
            dataset = dataset.train_test_split(test_size=0.2, seed=42)
            train_dataset = dataset["train"]
            val_dataset = dataset["test"]

            # Initialize tokenizer and model
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForTokenClassification.from_pretrained(
                model_name,
                num_labels=len(label_list),
                id2label=id2label,
                label2id=label2id
            )

            # Tokenize dataset
            tokenized_train = train_dataset.map(
                lambda x: tokenize_and_align_labels(x, tokenizer),
                batched=True,
                remove_columns=["tokens", "ner_tags"]
            )
            tokenized_val = val_dataset.map(
                lambda x: tokenize_and_align_labels(x, tokenizer),
                batched=True,
                remove_columns=["tokens", "ner_tags"]
            )

            # Set up training arguments
            training_args = TrainingArguments(
                output_dir=config["output_dir"],
                eval_strategy="epoch",  # Updated to eval_strategy
                save_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                report_to="none",
                **config["training_args"]
            )

            # Initialize trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_val,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            # Train and save
            trainer.train()
            trainer.save_model(config["output_dir"])
            tokenizer.save_pretrained(config["output_dir"])
            trainer.save_model(config["gdrive_output"])
            tokenizer.save_pretrained(config["gdrive_output"])
            print(f"{model_name} fine-tuned and saved to {config['output_dir']} and {config['gdrive_output']}")
        else:
            print(f"Model {model_name} already fine-tuned at {config['gdrive_output']}")
    except Exception as e:
        print(f"Error fine-tuning {model_name}: {str(e)}")
    finally:
        # Clean up memory
        try:
            del model, tokenizer, trainer
        except:
            pass
        gc.collect()
        torch.cuda.empty_cache()

def measure_inference_speed(model, tokenizer, dataset, device, num_iterations=10):
    """
    Measure average inference speed for a model on the dataset.
    """
    model.eval()
    model.to(device)
    total_time = 0

    for _ in range(num_iterations):
        start_time = time.time()
        for sample in dataset:
            inputs = {
                "input_ids": torch.tensor([sample["input_ids"]], device=device),
                "attention_mask": torch.tensor([sample["attention_mask"]], device=device)
            }
            with torch.no_grad():
                model(**inputs)
        total_time += time.time() - start_time

    avg_time_per_sample = (total_time / num_iterations) / len(dataset)
    return avg_time_per_sample

def evaluate_model(model_name, model_path, dataset):
    """
    Evaluate a fine-tuned model on the validation dataset and measure inference speed.
    """
    try:
        print(f"Attempting to load model from: {model_path}")
        # Verify model path
        if not os.path.exists(model_path) or not os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
            raise FileNotFoundError(f"Model path does not exist or is incomplete: {model_path}")

        # Load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForTokenClassification.from_pretrained(model_path)

        # Tokenize dataset
        tokenized_dataset = dataset.map(
            lambda x: tokenize_and_align_labels(x, tokenizer),
            batched=True,
            remove_columns=["tokens", "ner_tags"]
        )

        # Set up Trainer for evaluation
        training_args = TrainingArguments(
            output_dir=f"./eval_{model_name}",
            per_device_eval_batch_size=16,
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            eval_dataset=tokenized_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        # Evaluate metrics
        metrics = trainer.evaluate()

        # Measure inference speed
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        avg_inference_time = measure_inference_speed(model, tokenizer, tokenized_dataset, device)
        metrics["avg_inference_time_per_sample"] = avg_inference_time

        print(f"Evaluation Results for {model_name}: {metrics}")
        return metrics

    except Exception as e:
        print(f"Error evaluating {model_name}: {str(e)}")
        return None
    finally:
        # Clean up memory
        try:
            del model, tokenizer, trainer
        except:
            pass
        gc.collect()
        torch.cuda.empty_cache()

def compare_and_select_model():
    """
    Compare models based on accuracy, speed, and robustness, and select the best for production.
    """
    # Load and prepare dataset
    sentences = read_conll(CONLL_FILE)
    dataset = prepare_dataset(sentences)
    dataset = dataset.train_test_split(test_size=0.2, seed=42)
    val_dataset = dataset["test"]

    # Fine-tune models if not already done
    for model_name, config in MODEL_CONFIGS.items():
        fine_tune_model(model_name, config)

    # Evaluate each model
    results = {}
    for model_name, config in MODEL_CONFIGS.items():
        print(f"Evaluating {model_name}...")
        metrics = evaluate_model(model_name, config["gdrive_output"], val_dataset)
        if metrics:
            results[model_name] = metrics

    # Compare models
    comparison = {
        "model": [],
        "f1_score": [],
        "precision": [],
        "recall": [],
        "inference_time": [],
        "robustness_notes": []
    }

    for model_name, metrics in results.items():
        comparison["model"].append(model_name)
        comparison["f1_score"].append(metrics["eval_f1"])
        comparison["precision"].append(metrics["eval_precision"])
        comparison["recall"].append(metrics["eval_recall"])
        comparison["inference_time"].append(metrics["avg_inference_time_per_sample"])

        # Qualitative robustness assessment
        if model_name == "xlm-roberta-base":
            robustness = "High robustness due to large multilingual pre-training; handles Amharic well but may require preprocessing for multi-modal data."
        elif model_name == "distilbert-base-multilingual-cased":
            robustness = "Moderate robustness; lightweight model suitable for text but less effective for complex Amharic entities or multi-modal data."
        elif model_name == "bert-base-multilingual-cased":
            robustness = "Good robustness for Amharic text; balanced performance but requires more compute than DistilBERT."

        comparison["robustness_notes"].append(robustness)

    # Create comparison DataFrame
    comparison_df = pd.DataFrame(comparison)

    # Handle empty DataFrame case
    if comparison_df.empty:
        print("\nModel Comparison: No models successfully evaluated.")
        recommendation = {
            "best_model": None,
            "reason": "No models could be evaluated due to errors in fine-tuning or missing model files."
        }
    else:
        print("\nModel Comparison:")
        print(comparison_df.to_string(index=False))

        # Select best model
        best_model = comparison_df.loc[comparison_df["f1_score"].idxmax(), "model"]
        recommendation = {
            "best_model": best_model,
            "reason": f"{best_model} achieved the highest F1-score ({comparison_df.loc[comparison_df['model'] == best_model, 'f1_score'].values[0]:.3f}), balancing accuracy and robustness for EthioMart's Amharic NER task. Inference speed ({comparison_df.loc[comparison_df['model'] == best_model, 'inference_time'].values[0]:.4f}s/sample) is suitable for production."
        }
        print("\nRecommendation:")
        print(f"Best Model: {recommendation['best_model']}")
        print(f"Reason: {recommendation['reason']}")

    return {"comparison": comparison_df, "recommendation": recommendation}

def main():
    """
    Main function to compare and select the best NER model.
    """
    results = compare_and_select_model()
    # Save comparison results to CSV
    results["comparison"].to_csv("/content/drive/My Drive/kifiya5/ner_model_comparison.csv", index=False, encoding="utf-8")
    print("\nComparison results saved to /content/drive/My Drive/kifiya5/ner_model_comparison.csv")

if __name__ == "__main__":
    main()

Mounted at /content/drive


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Fine-tuning xlm-roberta-base as model not found at /content/drive/My Drive/kifiya5/ner_model_xlmroberta


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.493701,0.0,0.0,0.0
2,No log,0.916465,0.0,0.0,0.0
3,No log,0.880866,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


xlm-roberta-base fine-tuned and saved to ./ner_model_xlmroberta and /content/drive/My Drive/kifiya5/ner_model_xlmroberta
Fine-tuning distilbert-base-multilingual-cased as model not found at /content/drive/My Drive/kifiya5/ner_model_distilbert


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.837449,0.0,0.0,0.0
2,No log,0.77118,0.0,0.0,0.0
3,No log,0.757549,0.0,0.0,0.0
4,No log,0.745189,0.0,0.0,0.0
5,No log,0.737507,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


distilbert-base-multilingual-cased fine-tuned and saved to ./ner_model_distilbert and /content/drive/My Drive/kifiya5/ner_model_distilbert
Fine-tuning bert-base-multilingual-cased as model not found at /content/drive/My Drive/kifiya5/ner_model_mbert


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.756164,0.0,0.0,0.0
2,No log,0.75918,0.0,0.0,0.0
3,No log,0.727626,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


bert-base-multilingual-cased fine-tuned and saved to ./ner_model_mbert and /content/drive/My Drive/kifiya5/ner_model_mbert
Evaluating xlm-roberta-base...
Attempting to load model from: /content/drive/My Drive/kifiya5/ner_model_xlmroberta
Error evaluating xlm-roberta-base: Model path does not exist or is incomplete: /content/drive/My Drive/kifiya5/ner_model_xlmroberta
Evaluating distilbert-base-multilingual-cased...
Attempting to load model from: /content/drive/My Drive/kifiya5/ner_model_distilbert
Error evaluating distilbert-base-multilingual-cased: Model path does not exist or is incomplete: /content/drive/My Drive/kifiya5/ner_model_distilbert
Evaluating bert-base-multilingual-cased...
Attempting to load model from: /content/drive/My Drive/kifiya5/ner_model_mbert
Error evaluating bert-base-multilingual-cased: Model path does not exist or is incomplete: /content/drive/My Drive/kifiya5/ner_model_mbert

Model Comparison: No models successfully evaluated.

Comparison results saved to /con