In [None]:
!pip install transformers datasets torch
!pip install seqeval  # for evaluating NER performance


In [6]:
!pip install gdown




In [7]:
import gdown

# URL for Google Drive file
file_url = "https://drive.google.com/uc?export=download&id=1qpzy8eeqlSzkSN4g4yVLAa_ce0ZUovUh"
output = 'labeled_ner_data.conll'

# Download the file
gdown.download(file_url, output, quiet=False)


Downloading...
From: https://drive.google.com/uc?export=download&id=1qpzy8eeqlSzkSN4g4yVLAa_ce0ZUovUh
To: /content/labeled_ner_data.conll
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1.29M/1.29M [00:00<00:00, 112MB/s]


'labeled_ner_data.conll'

In [57]:
import pandas as pd

# Load your CoNLL file into a pandas dataframe
# Assuming 'data.conll' is your file, adjust accordingly
data = []

with open('labeled_ner_data.conll', 'r') as f:
    sentence = []
    labels = []
    for line in f:
        if line.strip():  # Not an empty line
            token, label = line.strip().split()  # Assuming token and label are space-separated
            sentence.append(token)
            labels.append(label)
        else:
            data.append((sentence, labels))
            sentence = []
            labels = []

# Create DataFrame
df = pd.DataFrame(data, columns=["tokens", "labels"])


In [79]:
from transformers import AutoTokenizer

# Load the tokenizer for BERT-tiny-Amharic
tokenizer = AutoTokenizer.from_pretrained("rasyosef/bert-tiny-amharic")


# Label to ID mapping (adjust according to your labels)
label2id = {
    "B-Product": 0,
    "I-Product": 1,
    "B-LOC": 2,
    "I-LOC": 3,
    "O": 4
}

# Tokenize the data and align the labels
def tokenize_and_align_labels(examples):
    # Tokenize the input tokens
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, padding=True, is_split_into_words=True)

    labels = examples['labels']
    new_labels = []

    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word ids for each tokenized example
        current_labels = []

        # Convert string labels to integers using the label2id mapping
        for word_id in word_ids:
            if word_id is None:
                current_labels.append(-100)  # Masked token (use -100 to ignore during loss calculation)
            else:
                current_labels.append(label2id.get(label[0], label2id["O"]))  # Default to "O" if unknown label

        new_labels.append(current_labels)

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs



from datasets import Dataset

# Convert the dataframe into Hugging Face Dataset format
dataset = Dataset.from_pandas(df)

# Tokenize and align labels
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/1369 [00:00<?, ? examples/s]

IndexError: list index out of range

In [75]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluate every epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)




In [77]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("rasyosef/bert-tiny-amharic")
model = AutoModelForMaskedLM.from_pretrained("rasyosef/bert-tiny-amharic")
# Assuming 3 labels: B-Product, I-Product, B-LOC, and O (Outside)


config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From ðŸ‘‰v4.50ðŸ‘ˆ onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


generation_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [78]:
from transformers import Trainer, TrainingArguments

# Define Trainer
trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments
    train_dataset=tokenized_dataset,     # training dataset
    eval_dataset=tokenized_dataset,      # evaluation dataset
    tokenizer=tokenizer,                 # tokenizer
)

# Train the model
trainer.train()


  trainer = Trainer(


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [67]:
from huggingface_hub import login
login()  # This will prompt you to enter your Hugging Face token


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [68]:
from transformers import AutoModelForTokenClassification

# For an alternative multilingual model (e.g., XLM-RoBERTa)
model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-base")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# Assuming 3 labels: B-Product, I-Product, B-LOC, and O (Outside)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
from datasets import Dataset

# Split dataset into training and validation sets (80/20 split)
train_dataset = tokenized_dataset.shuffle(seed=42).select([i for i in list(range(int(0.8 * len(tokenized_dataset))))])
eval_dataset = tokenized_dataset.shuffle(seed=42).select([i for i in list(range(int(0.8 * len(tokenized_dataset)), len(tokenized_dataset)))])

# Alternatively, you can split using train_test_split (from the 'datasets' library)
# train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
# train_dataset = train_test_split['train']
# eval_dataset = train_test_split['test']


In [72]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)  # Convert logits to predictions
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {"accuracy": accuracy_score(labels, predictions), "precision": precision, "recall": recall, "f1": f1}


In [73]:
from transformers import Trainer

trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    compute_metrics=compute_metrics,     # evaluation metrics (optional)
)

# Train the model
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 Â·Â·Â·Â·Â·Â·Â·Â·Â·Â·


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


ValueError: expected sequence of length 512 at dim 1 (got 145)

Evaluating the model

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = labels

    results = []
    for pred, true in zip(predictions, true_labels):
        temp_pred, temp_true = [], []
        for p, t in zip(pred, true):
            if t != -100:  # Skip padding tokens
                temp_pred.append(p)
                temp_true.append(t)
        results.append((temp_pred, temp_true))

    precision = precision_score(results)
    recall = recall_score(results)
    f1 = f1_score(results)
    return {"precision": precision, "recall": recall, "f1": f1}


In [None]:
trainer.evaluate()


In [None]:
# Save the Fine-Tuned Model
model.save_pretrained('fine_tuned_model')
tokenizer.save_pretrained('fine_tuned_model')
