# In [1]:
import pandas as pd
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

import requests

# Step 1: Download the file from Google Drive
file_id = "1qpzy8eeqlSzkSN4g4yVLAa_ce0ZUovUh"  # Replace with your file ID
download_url = f"https://drive.google.com/uc?id={file_id}"
file_path = "dataset.conll"  # Path to save the file locally

# Bound the wait so an unresponsive server cannot hang the script forever.
response = requests.get(download_url, timeout=60)
# Stop here on an HTTP error (4xx/5xx) instead of saving the error body as the
# dataset and failing later, far from the cause, inside the CoNLL parser.
response.raise_for_status()
# NOTE(review): a 200 response is not proof that the payload is the dataset —
# presumably Drive can answer with an HTML page for some files; verify the
# saved file if parsing fails.
with open(file_path, "wb") as file:
    file.write(response.content)

print(f"File downloaded and saved as {file_path}")

# Step 2: Load and process the data from .conll file
def load_conll_data(file_path):
    """Read a CoNLL-style file into parallel token and label sequences.

    Each non-blank line holds whitespace-separated columns: the first column
    is taken as the token and the last column as its label, so both the
    two-column ``TOKEN LABEL`` layout and wider layouts with extra columns in
    between are accepted. Blank (or whitespace-only) lines separate
    sentences; consecutive blank lines do not produce empty sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is a list of label lists; ``labels[i][j]`` is
        the label of ``sentences[i][j]``.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. The
            message includes the file path and the 1-based line number.
    """
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            columns = line.split()

            if not columns:
                # Blank line: close the current sentence, if any tokens were read.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                continue

            if len(columns) < 2:
                # Report where the problem is instead of an opaque unpack error.
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'TOKEN ... LABEL', "
                    f"got {line.strip()!r}"
                )

            sentence.append(columns[0])
            label.append(columns[-1])

    # Add the last sentence (if the file doesn't end with an empty line)
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Parse the downloaded .conll file into parallel token / tag sequences
sentences, labels = load_conll_data(file_path)

# Step 3: Wrap the parsed columns in a Hugging Face DatasetDict that has a
# single "train" split (one row per sentence)
data = dict(tokens=sentences, ner_tags=labels)
dataset = DatasetDict()
dataset["train"] = Dataset.from_dict(data)

# Sanity check: both counts are numbers of sentences (one label list per sentence)
print(f"Loaded {len(sentences)} sentences with {len(labels)} labels.")

# Step 4: Load the BERT Tiny Amharic model and tokenizer
# The tag vocabulary is derived from the data so the classification head always
# has exactly one output per tag that actually occurs (sorted for a stable mapping).
label_list = sorted({tag for sentence_tags in labels for tag in sentence_tags})
label2id = {tag: index for index, tag in enumerate(label_list)}
id2label = {index: tag for tag, index in label2id.items()}

model_name = "olivertab/bert-tiny-amharic-uncased"  # Replace with the actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Step 5: Tokenize the dataset and align the word-level tags to sub-word tokens
def tokenize_function(examples):
    """Tokenize a batch of pre-split sentences and attach aligned label ids.

    Args:
        examples: A batch with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of string-tag lists, parallel to the words).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list of integer ids per sentence, the same length as its
        ``input_ids``. The first sub-word of every word carries the word's
        tag id; special tokens and the remaining sub-words carry ``-100``.
    """
    # No padding here: the data collator pads every batch to its longest sequence.
    tokenized = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_index, word_tags in enumerate(examples["ner_tags"]):
        # word_ids() maps each sub-word token back to the index of its source
        # word (None for special tokens); it needs a "fast" tokenizer.
        word_ids = tokenized.word_ids(batch_index=batch_index)
        previous_word_id = None
        label_ids = []
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                # -100 is the label id the collator uses for padding; it is
                # used here too for positions that must not contribute to the loss.
                label_ids.append(-100)
            else:
                label_ids.append(label2id[word_tags[word_id]])
            previous_word_id = word_id
        aligned_labels.append(label_ids)

    tokenized["labels"] = aligned_labels
    return tokenized

# Hold out 10% of the sentences so the per-epoch evaluation requested below has
# data to run on; the fixed seed keeps the split reproducible.
split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
tokenized_datasets = split_datasets.map(tokenize_function, batched=True)

# Step 6: Set up data collator for token classification
data_collator = DataCollatorForTokenClassification(tokenizer)

# Step 7: Define training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer` to `processing_class` — confirm
# against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Step 8: Train the model using Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Optionally: Save the model
trainer.save_model("bert_tiny_amharic_model")


# test
