In [None]:
!pip install transformers datasets torch
!pip install seqeval  # for evaluating NER performance


In [3]:
import pandas as pd
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification


In [4]:
import requests
from datasets import DatasetDict, Dataset
import pandas as pd

# Step 1: Download the file from Google Drive 1t64KPxLuOrsfaYbhT2I0l7IIYI8Es_gD
file_id = "1qpzy8eeqlSzkSN4g4yVLAa_ce0ZUovUh"  # Replace with your file ID
download_url = f"https://drive.google.com/uc?id={file_id}"
file_path = "dataset.conll"  # Path to save the file locally

response = requests.get(download_url)
with open(file_path, "wb") as file:
    file.write(response.content)

print(f"File downloaded and saved as {file_path}")


File downloaded and saved as dataset.conll


In [5]:
# Step 2: Load and process the data from .conll file
def load_conll_data(file_path):
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()  # Remove leading/trailing whitespaces
            if line:  # Non-empty line
                token, entity = line.split()  # Split token and label
                sentence.append(token)
                label.append(entity)
            else:  # Empty line (end of a sentence)
                if sentence:  # Only add non-empty sentences
                    sentences.append(sentence)
                    labels.append(label)
                sentence = []  # Reset sentence and label for next sentence
                label = []  # Reset for next sentence

        # Add the last sentence (if the file doesn't end with an empty line)
        if sentence:
            sentences.append(sentence)
            labels.append(label)

    return sentences, labels

# Load dataset from the downloaded .conll file
file_path = "dataset.conll"  # Ensure the correct file path
sentences, labels = load_conll_data(file_path)

In [6]:
# Step 3: Convert to Hugging Face Dataset format
data = {"tokens": sentences, "ner_tags": labels}
dataset = DatasetDict({
    "train": Dataset.from_dict(data)
})

# Check if data is loaded correctly
print(f"Loaded {len(sentences)} sentences with {len(labels)} labels.")

Loaded 1340 sentences with 1340 labels.


In [7]:
# Optional: Inspect a sample sentence and its corresponding labels
print(f"Sample sentence: {sentences[0]}")
print(f"Sample labels: {labels[0]}")

print(f"Loaded {len(sentences)} sentences with {len(labels)} labels.")

Sample sentence: ['ለኮንዶሚኒየም', 'ለጠባብ', 'ቤቶች', 'ገላግሌ', 'የሆነ', 'ከንፁህ', 'የሲልከን', 'ጥሬ', 'እቃ', 'የተሰራ', 'የልጆች', 'ማጠቢያ', 'ምስሉ', 'ላይ', 'እንደሚያዩት', 'መታጠፍ', 'መዘርጋት', 'የሚችል', '3350ብር', 'ይደውሉልን', 'እርሶ', 'መምጣት', 'ባይመቾ', 'እኛ', 'ያሉበት', 'ድረስ', 'እናደርስሎታለን', 'ስልክ', '0905707448', '0909003864', 'ሲና', 'የተመረጡና', 'ጥራታቸውን', 'የጠበቁ', 'የልጆች', 'እቃ', 'አስመጪ', '0909003864', '0905707448', 'እቃ', 'ለማዘዝ', 'ከስር', 'ያለውን', 'ሊንኮች', 'በመጫን', 'ማዘዝ', 'ትችላላቹ', '@', '@2', 'አድራሻ', 'ቁጥር', 'ገርጂ', 'ኢምፔሪያል', 'ከሳሚ', 'ህንፃ', 'ጎን', 'አልፎዝ', 'ፕላዛ', 'ግራውንድ', 'ላይ', 'እንደገቡ', 'ያገኙናል', '2ቁጥር2', '4ኪሎ', 'ቅድስት', 'ስላሴ', 'ህንፃ', 'ማለትም', 'ከብልፅግና', 'ዋናፅፈት', 'ቤት', 'ህንፃ', 'በስተ', 'ቀኝ', 'ባለው', 'አስፓልት', '20ሜትር', 'ዝቅ', 'እንዳሉ', 'ሀበሻ', 'ኮፊ', 'የሚገኝበት', 'ቀይ', 'ሸክላ', 'ህንፃ', '2ተኛ', 'ፎቅ', 'ላይ', 'ያገኙናል', '3ቁጥር3', 'ብስራተ', 'ገብርኤል', 'ላፍቶ', 'ሞል', 'መግቢያው', 'ፊት', 'ለፊት', 'የሚገኘው', 'የብስራተ', 'ገብርኤል', 'ቤተ', 'ክርስቲያን', 'ህንፃ', 'አንደኛ', 'ፎቅ', 'ላይ', 'ደረጃ', 'እንደወጣቹ', 'በስተግራ', 'በኩል', 'ሱቅ', 'ቁጥር', '-09', 'ክቡራን', 'ደምበኞቻችን', 'ገርጂ', 'አልፎዝ', 'ፕላዛ', 'ላይ', 'አራት', 'ኪሎ', 'ቅድስት', 'ስላሴ', 'እንዲሁም', '

In [8]:
# Preprocessing the Data
unique_labels = list(set(label for sublist in labels for label in sublist))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}


In [9]:
# Load tokenizer
model_checkpoint = "xlm-roberta-base"  # Use "bert-tiny-amharic" for Amharic or other model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
# Apply preprocessing
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1340 [00:00<?, ? examples/s]

In [12]:
# Split dataset into train and validation
train_size = int(0.8 * len(tokenized_datasets["train"]))
eval_size = len(tokenized_datasets["train"]) - train_size
train_dataset = tokenized_datasets["train"].select(range(train_size))
eval_dataset = tokenized_datasets["train"].select(range(train_size, len(tokenized_datasets["train"])))


In [13]:
# Load the model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(unique_labels), id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Define training arguments with save_strategy="epoch" to match eval_strategy
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Use eval_strategy instead of evaluation_strategy
    save_strategy="epoch",  # Ensure save strategy matches eval strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
)

In [15]:

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Add the evaluation dataset
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)  # Fix for deprecated tokenizer warning
)


  trainer = Trainer(


In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss


Evaluating the model

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = labels

    results = []
    for pred, true in zip(predictions, true_labels):
        temp_pred, temp_true = [], []
        for p, t in zip(pred, true):
            if t != -100:  # Skip padding tokens
                temp_pred.append(p)
                temp_true.append(t)
        results.append((temp_pred, temp_true))

    precision = precision_score(results)
    recall = recall_score(results)
    f1 = f1_score(results)
    return {"precision": precision, "recall": recall, "f1": f1}


In [None]:
trainer.evaluate()


In [None]:
# Save the Fine-Tuned Model
model.save_pretrained('fine_tuned_model')
tokenizer.save_pretrained('fine_tuned_model')
