In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [5]:
import pandas as pd
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification


In [7]:
# Step 1: Download the file from Google Drive
file_id = "1qpzy8eeqlSzkSN4g4yVLAa_ce0ZUovUh"  # Replace with your file ID
download_url = f"https://drive.google.com/uc?id={file_id}"
file_path = "dataset.conll"  # Path to save the file locally

response = requests.get(download_url)
with open(file_path, "wb") as file:
    file.write(response.content)

print(f"File downloaded and saved as {file_path}")

File downloaded and saved as dataset.conll


In [9]:
# Step 2: Load and process the data from .conll file
def load_conll_data(file_path):
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()  # Remove leading/trailing whitespaces
            if line:  # Non-empty line
                token, entity = line.split()  # Split token and label
                sentence.append(token)
                label.append(entity)
            else:  # Empty line (end of a sentence)
                if sentence:  # Only add non-empty sentences
                    sentences.append(sentence)
                    labels.append(label)
                sentence = []  # Reset sentence and label for next sentence
                label = []  # Reset for next sentence

        # Add the last sentence (if the file doesn't end with an empty line)
        if sentence:
            sentences.append(sentence)
            labels.append(label)

    return sentences, labels

# Load dataset from the downloaded .conll file
sentences, labels = load_conll_data(file_path)

In [44]:
# Create label mapping
unique_labels = list(set(label for sublist in labels for label in sublist))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Apply label mapping to 'labels'
numerical_labels = [[label2id[label] for label in sublist] for sublist in labels]


In [50]:
# Step 3: Convert to Hugging Face Dataset format
# Updated to use numerical_labels instead of labels
data = {"tokens": sentences, "ner_tags": numerical_labels}
dataset = DatasetDict({
    "train": Dataset.from_dict(data)
})

# Check if data is loaded correctly
print(f"Loaded {len(sentences)} sentences with {len(labels)} labels.")

Loaded 1340 sentences with 1340 labels.


In [34]:
# Step 4: Load the BERT Tiny Amharic model and tokenizer
model_name = "rasyosef/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=5)  # Adjust `num_labels` to your data

Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
def tokenize_function(examples):
    return tokenizer(examples["tokens"], truncation=True, padding="max_length", is_split_into_words=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/1340 [00:00<?, ? examples/s]

In [36]:
# Step 5: Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["tokens"], truncation=True, padding="max_length", is_split_into_words=True, return_tensors="pt")

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("ner_tags", "labels")


Map:   0%|          | 0/1340 [00:00<?, ? examples/s]

In [37]:
# Step 6: Set up data collator for token classification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [38]:
# Split dataset into train and validation
train_size = int(0.8 * len(tokenized_datasets["train"]))
eval_size = len(tokenized_datasets["train"]) - train_size
train_dataset = tokenized_datasets["train"].select(range(train_size))
eval_dataset = tokenized_datasets["train"].select(range(train_size, len(tokenized_datasets["train"])))


In [52]:
# Step 8: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Use eval_strategy instead of evaluation_strategy
    save_strategy="epoch",  # Ensure save strategy matches eval strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
    label_names=["ner_tags"]  # Add label_names to training arguments
)

In [53]:
# Step 9: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Add the evaluation dataset
    tokenizer=tokenizer,
    data_collator=data_collator # Fix for deprecated tokenizer warning
)

  trainer = Trainer(


In [54]:
# Start training
trainer.train()

ValueError: too many dimensions 'str'