In [1]:
!pip install transformers datasets torch
!pip install seqeval  # for evaluating NER performance


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
import pandas as pd
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification


In [3]:
import requests
from datasets import DatasetDict, Dataset
import pandas as pd

# Step 1: Download the file from Google Drive 1t64KPxLuOrsfaYbhT2I0l7IIYI8Es_gD
file_id = "1qpzy8eeqlSzkSN4g4yVLAa_ce0ZUovUh"  # Replace with your file ID
download_url = f"https://drive.google.com/uc?id={file_id}"
file_path = "dataset.conll"  # Path to save the file locally

response = requests.get(download_url)
with open(file_path, "wb") as file:
    file.write(response.content)

print(f"File downloaded and saved as {file_path}")


File downloaded and saved as dataset.conll


In [4]:
# Step 2: Load and process the data from .conll file
def load_conll_data(file_path):
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()  # Remove leading/trailing whitespaces
            if line:  # Non-empty line
                token, entity = line.split()  # Split token and label
                sentence.append(token)
                label.append(entity)
            else:  # Empty line (end of a sentence)
                if sentence:  # Only add non-empty sentences
                    sentences.append(sentence)
                    labels.append(label)
                sentence = []  # Reset sentence and label for next sentence
                label = []  # Reset for next sentence

        # Add the last sentence (if the file doesn't end with an empty line)
        if sentence:
            sentences.append(sentence)
            labels.append(label)

    return sentences, labels

# Load dataset from the downloaded .conll file
file_path = "dataset.conll"  # Ensure the correct file path
sentences, labels = load_conll_data(file_path)

In [5]:
# Step 3: Convert to Hugging Face Dataset format
data = {"tokens": sentences, "ner_tags": labels}
dataset = DatasetDict({
    "train": Dataset.from_dict(data)
})

# Check if data is loaded correctly
print(f"Loaded {len(sentences)} sentences with {len(labels)} labels.")

Loaded 1340 sentences with 1340 labels.


In [6]:
# Optional: Inspect a sample sentence and its corresponding labels
print(f"Sample sentence: {sentences[0]}")
print(f"Sample labels: {labels[0]}")

print(f"Loaded {len(sentences)} sentences with {len(labels)} labels.")

Sample sentence: ['ለኮንዶሚኒየም', 'ለጠባብ', 'ቤቶች', 'ገላግሌ', 'የሆነ', 'ከንፁህ', 'የሲልከን', 'ጥሬ', 'እቃ', 'የተሰራ', 'የልጆች', 'ማጠቢያ', 'ምስሉ', 'ላይ', 'እንደሚያዩት', 'መታጠፍ', 'መዘርጋት', 'የሚችል', '3350ብር', 'ይደውሉልን', 'እርሶ', 'መምጣት', 'ባይመቾ', 'እኛ', 'ያሉበት', 'ድረስ', 'እናደርስሎታለን', 'ስልክ', '0905707448', '0909003864', 'ሲና', 'የተመረጡና', 'ጥራታቸውን', 'የጠበቁ', 'የልጆች', 'እቃ', 'አስመጪ', '0909003864', '0905707448', 'እቃ', 'ለማዘዝ', 'ከስር', 'ያለውን', 'ሊንኮች', 'በመጫን', 'ማዘዝ', 'ትችላላቹ', '@', '@2', 'አድራሻ', 'ቁጥር', 'ገርጂ', 'ኢምፔሪያል', 'ከሳሚ', 'ህንፃ', 'ጎን', 'አልፎዝ', 'ፕላዛ', 'ግራውንድ', 'ላይ', 'እንደገቡ', 'ያገኙናል', '2ቁጥር2', '4ኪሎ', 'ቅድስት', 'ስላሴ', 'ህንፃ', 'ማለትም', 'ከብልፅግና', 'ዋናፅፈት', 'ቤት', 'ህንፃ', 'በስተ', 'ቀኝ', 'ባለው', 'አስፓልት', '20ሜትር', 'ዝቅ', 'እንዳሉ', 'ሀበሻ', 'ኮፊ', 'የሚገኝበት', 'ቀይ', 'ሸክላ', 'ህንፃ', '2ተኛ', 'ፎቅ', 'ላይ', 'ያገኙናል', '3ቁጥር3', 'ብስራተ', 'ገብርኤል', 'ላፍቶ', 'ሞል', 'መግቢያው', 'ፊት', 'ለፊት', 'የሚገኘው', 'የብስራተ', 'ገብርኤል', 'ቤተ', 'ክርስቲያን', 'ህንፃ', 'አንደኛ', 'ፎቅ', 'ላይ', 'ደረጃ', 'እንደወጣቹ', 'በስተግራ', 'በኩል', 'ሱቅ', 'ቁጥር', '-09', 'ክቡራን', 'ደምበኞቻችን', 'ገርጂ', 'አልፎዝ', 'ፕላዛ', 'ላይ', 'አራት', 'ኪሎ', 'ቅድስት', 'ስላሴ', 'እንዲሁም', '

In [7]:
# Preprocessing the Data
unique_labels = list(set(label for sublist in labels for label in sublist))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}


In [8]:
# Load tokenizer
model_checkpoint = "xlm-roberta-base"  # Use "bert-tiny-amharic" for Amharic or other model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
# Apply preprocessing
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1340 [00:00<?, ? examples/s]

In [11]:
# Split dataset into train and validation
train_size = int(0.8 * len(tokenized_datasets["train"]))
eval_size = len(tokenized_datasets["train"]) - train_size
train_dataset = tokenized_datasets["train"].select(range(train_size))
eval_dataset = tokenized_datasets["train"].select(range(train_size, len(tokenized_datasets["train"])))


In [12]:
# Load the model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(unique_labels), id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Define training arguments with save_strategy="epoch" to match eval_strategy
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Use eval_strategy instead of evaluation_strategy
    save_strategy="epoch",  # Ensure save strategy matches eval strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
)

In [34]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def compute_metrics(p):
    preds, labels = p
    # Get predictions by choosing the class with the highest probability
    preds = np.argmax(preds, axis=2)

    # Remove ignored index (usually -100) from the labels and predictions
    true_labels = [[label for label in label_seq if label != -100] for label_seq in labels]
    true_preds = [[pred for pred, label in zip(pred_seq, label_seq) if label != -100]
                  for pred_seq, label_seq in zip(preds, labels)]

    # Flatten the lists
    true_labels_flat = [item for sublist in true_labels for item in sublist]
    true_preds_flat = [item for sublist in true_preds for item in sublist]

    # Compute precision, recall, f1, and accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels_flat, true_preds_flat, average='macro')
    accuracy = accuracy_score(true_labels_flat, true_preds_flat)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [35]:

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Add the evaluation dataset
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),  # Fix for deprecated tokenizer warning
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [36]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0016,0.109147,0.985166,0.960049,0.92647,0.940361
2,0.0011,0.130257,0.982781,0.944726,0.927444,0.932332
3,0.0003,0.14449,0.979868,0.933434,0.923231,0.92415
4,0.0007,0.142594,0.982781,0.945317,0.925716,0.931423
5,0.0002,0.141167,0.979603,0.928558,0.938547,0.930084
6,0.0002,0.159464,0.982252,0.941862,0.931037,0.931025
7,0.0,0.12745,0.984636,0.952085,0.945392,0.947376
8,0.0002,0.119479,0.984901,0.954815,0.944248,0.948151
9,0.0003,0.119257,0.98543,0.956404,0.944335,0.948994
10,0.0002,0.120157,0.985695,0.957578,0.944378,0.949585


TrainOutput(global_step=1340, training_loss=0.000521552546206973, metrics={'train_runtime': 1071.7809, 'train_samples_per_second': 10.002, 'train_steps_per_second': 1.25, 'total_flos': 1707006390115488.0, 'train_loss': 0.000521552546206973, 'epoch': 10.0})

In [37]:
results = trainer.evaluate()

In [38]:
# Print results in a structured format
print("Evaluation Results:")
print(f"Loss: {results['eval_loss']:.4f}")
print(f"Accuracy: {results['eval_accuracy']:.4f}")
print(f"Precision: {results['eval_precision']:.4f}")
print(f"Recall: {results['eval_recall']:.4f}")
print(f"F1-Score: {results['eval_f1']:.4f}")
print(f"Runtime: {results['eval_runtime']:.4f} seconds")
print(f"Samples per second: {results['eval_samples_per_second']:.2f}")
print(f"Steps per second: {results['eval_steps_per_second']:.2f}")
print(f"Epoch: {results['epoch']:.1f}")

Evaluation Results:
Loss: 0.1091
Accuracy: 0.9852
Precision: 0.9600
Recall: 0.9265
F1-Score: 0.9404
Runtime: 1.3962 seconds
Samples per second: 191.95
Steps per second: 24.35
Epoch: 10.0


In [None]:
# Save the Fine-Tuned Model
model.save_pretrained('fine_tuned_model')
tokenizer.save_pretrained('fine_tuned_model')


('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/sentencepiece.bpe.model',
 'fine_tuned_model/added_tokens.json',
 'fine_tuned_model/tokenizer.json')