# In [None]:
import requests
import pandas as pd
from datasets import DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)

# Step 1: Download the file
file_id = "1qpzy8eeqlSzkSN4g4yVLAa_ce0ZUovUh"
download_url = f"https://drive.google.com/uc?id={file_id}"
file_path = "dataset.conll"

# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() prevents an HTTP error page (4xx/5xx) from being
# written to disk and later parsed as if it were the dataset.
# NOTE(review): Google Drive may answer with an HTML confirmation page for
# large or restricted files; that case still returns 200 -- verify the
# downloaded file if parsing fails.
response = requests.get(download_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as file:
    file.write(response.content)

# Step 2: Load .conll data
def load_conll_data(file_path):
    """Parse a CoNLL-style file into parallel token and tag sequences.

    Every non-blank line holds whitespace-separated columns: the first column
    is the token and the last column is its tag, so both the two-column
    layout and wider layouts (e.g. token, POS, chunk, tag) are accepted.
    Blank or whitespace-only lines separate sentences.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is a list of tag lists; ``labels[i][j]`` is the
        tag of ``sentences[i][j]``.

    Raises:
        ValueError: If a non-blank line has fewer than two columns, i.e. a
            token without a tag.
    """
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            columns = line.split()
            if not columns:
                # Sentence boundary: flush the sentence collected so far.
                # Consecutive blank lines must not create empty sentences.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue
            if len(columns) < 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected a token and a tag, "
                    f"got {line.strip()!r}"
                )
            sentence.append(columns[0])
            label.append(columns[-1])
    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

sentences, labels = load_conll_data(file_path)
# Sort the tag set so the label <-> id mapping is the same on every run.
# Iterating a bare set of strings depends on per-process hash randomization,
# which would give each run (and each saved checkpoint) a different mapping.
unique_labels = sorted({label for sublist in labels for label in sublist})
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
# Replace every string tag by its integer id, keeping the sentence structure.
numerical_labels = [[label2id[label] for label in sublist] for sublist in labels]

# All sentences go into a single "train" split.
data = {"tokens": sentences, "ner_tags": numerical_labels}
dataset = DatasetDict({"train": Dataset.from_dict(data)})

# Step 3: Tokenize and align labels
# Checkpoint identifier; the same name is used for the tokenizer here and for
# the model loaded in step 4.
model_name = "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"  # Masakhane NER checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of pre-split sentences and align the tags to sub-tokens.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of integer tag ids per
            sentence, parallel to the words).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry.
        Every sub-token carries the tag id of the word it came from;
        positions that belong to no word (``word_ids`` yields ``None``)
        are set to -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
    )

    def align(batch_idx, word_tags):
        # Map each sub-token position back to the tag of its source word.
        return [
            -100 if word_id is None else word_tags[word_id]
            for word_id in encoded.word_ids(batch_index=batch_idx)
        ]

    encoded["labels"] = [
        align(batch_idx, word_tags)
        for batch_idx, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run the tokenizer over the dataset in batches, then return only the three
# model-input columns, formatted as torch tensors.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Step 4: Load model and set up trainer
num_labels = len(unique_labels)
# The checkpoint name indicates it is already fine-tuned for NER, so it ships
# a classification head sized for its own tag set. ignore_mismatched_sizes
# lets loading succeed when this dataset has a different number of tags (the
# head is then freshly initialised); when the sizes match it changes nothing.
# Passing the label maps stores the tag names in the model config, so
# predictions and saved checkpoints report real tags instead of LABEL_<n>.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Local import: only this compatibility check needs it.
import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword is eventually rejected. Use whichever name the
# installed version accepts so the script runs on both old and new versions.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # requires the two strategies to match.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},
)

# Hold out the last 20% of the examples for evaluation. The split is
# positional (no shuffling): the first 80% train, the remainder evaluates.
all_examples = tokenized_datasets["train"]
total_examples = len(all_examples)
train_size = int(0.8 * total_examples)
train_dataset = all_examples.select(range(train_size))
eval_dataset = all_examples.select(range(train_size, total_examples))

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Step 5: Train the model
trainer.train()

# Step 6: Evaluation
# Score the held-out split once more after training and print the result.
metrics = trainer.evaluate(eval_dataset=eval_dataset)
print("Evaluation Metrics:", metrics)
