<a href="https://colab.research.google.com/github/aman3013/Fine-tuning-Amharic-NER/blob/Task-4/Model_Comparison_%26_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-learn


In [None]:
!pip install datasets

In [None]:
!pip install torch

In [None]:
!pip install evaluate


In [None]:
!pip install transformers datasets seqeval scikit-learn


In [13]:
from transformers import XLMRobertaTokenizer, DistilBertTokenizer, BertTokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
import evaluate  # Updated import statement
from sklearn.model_selection import train_test_split
import time
import torch
import pandas as pd

In [14]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
import numpy as np

In [28]:
# Step 4: Load the dataset from the specified path
data_path = '/content/drive/My Drive/merged_file.conll'  # Update this path to the correct location

try:
    # Load the data and ensure it's a DataFrame
    data = pd.read_csv(data_path, sep="\t", header=None, names=["word", "label"])
    if isinstance(data, pd.DataFrame):
        print("Data loaded successfully.")
    else:
        raise ValueError("Loaded data is not a DataFrame.")
except FileNotFoundError:
    print(f"File not found at path: {data_path}")
    data = None
except Exception as e:
    print(f"An error occurred: {e}")
    data = None

# Display the DataFrame
print(df.head())

Data loaded successfully.
   Word Label
0   ይህን     O
1   መፍጫ     O
2  ከሁሉም     O
3  የተሻለ     O
4    ሆኖ     O


In [33]:
# Check for None values in the 'label' column
none_count = data['label'].isnull().sum()
print(f"Number of None values in labels: {none_count}")


Number of None values in labels: 130150


In [34]:
# Remove rows with None labels
data = data[data['label'].notnull()]


In [35]:
# Replace None with a default label (e.g., "O")
data['label'] = data['label'].fillna('O')


In [36]:
# Display the DataFrame
print(df.head())

   Word Label
0   ይህን     O
1   መፍጫ     O
2  ከሁሉም     O
3  የተሻለ     O
4    ሆኖ     O


In [37]:
# Check for None values in the 'label' column
none_count = data['label'].isnull().sum()
print(f"Number of None values in labels: {none_count}")


Number of None values in labels: 0


In [38]:
# Proceed only if data is successfully loaded
if data is not None:
    # Step 5: Convert to Hugging Face Dataset
    dataset = Dataset.from_pandas(data)
    print("Dataset converted to Hugging Face format.")

Dataset converted to Hugging Face format.


In [39]:
    # Step 6: Define label mapping
    unique_labels = list(set(data['label'].values))
    label2id = {label: idx for idx, label in enumerate(unique_labels)}
    id2label = {idx: label for label, idx in label2id.items()}

In [40]:
 # Step 7: Tokenization function
def tokenize_and_align_labels(examples):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        tokenized_inputs = tokenizer(examples['word'], truncation=True, is_split_into_words=True)

        # Align labels with tokenized inputs
        labels = []
        for i, label in enumerate(examples['label']):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            label_ids = [-100 if idx is None else label2id[label] for idx in word_ids]
            labels.append(label_ids)

        tokenized_inputs['labels'] = labels
        return tokenized_inputs

In [41]:
from transformers import AutoTokenizer
 # Step 8: Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
tokenized_dataset

In [None]:
# Step 9: Split the dataset into training and validation sets
    train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
    train_dataset = train_test_split['train']
    val_dataset = train_test_split['test']

In [None]:
    # Step 10: Set up training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        evaluation_strategy="epoch",     # evaluation strategy
        learning_rate=2e-5,              # learning rate
        per_device_train_batch_size=16,  # batch size for training
        per_device_eval_batch_size=16,   # batch size for evaluation
        num_train_epochs=3,              # total number of training epochs
        weight_decay=0.01,               # strength of weight decay
    )

In [None]:
# Apply tokenization
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_score, recall_score, f1_score

# Define models to compare
models_to_compare = [
    "bert-base-multilingual-cased",  # mBERT
    "xlm-roberta-base",               # XLM-Roberta
    "distilbert-base-multilingual-cased",  # DistilBERT
]

results = {}

for model_name in models_to_compare:
    # Load the model
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(unique_labels), label2id=label2id)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,  # You may want to split this into train/val sets
        eval_dataset=tokenized_dataset,
        compute_metrics=lambda p: {
            'precision': precision_score(p.predictions, p.label_ids, average='weighted', zero_division=0),
            'recall': recall_score(p.predictions, p.label_ids, average='weighted', zero_division=0),
            'f1': f1_score(p.predictions, p.label_ids, average='weighted', zero_division=0),
        }
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()

    # Store the evaluation results for comparison
    results[model_name] = eval_results


In [None]:
# Display comparison results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Evaluation Results: {metrics}")
