## Introduction

This notebook demonstrates the complete workflow for fine-tuning a pre-trained BERT model on an Amharic news classification task. The dataset utilized for this task is an Amharic news corpus, which includes a variety of news articles labeled by category. Our goal is to create a robust classifier that can accurately categorize these articles based on their content.

In [108]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset

import warnings
warnings.filterwarnings("ignore")

In [None]:
import sys
sys.path.append('../')
from scripts.utils import preprocess_article, compute_metrics, calculate_length, tokenize_function, evaluate_model, generate_predictions

In [110]:
# Load dataset
data = load_dataset("csv", data_files="../Amharic_corpus_merged_2023-04-16.csv")
data

DatasetDict({
    train: Dataset({
        features: ['article', 'category'],
        num_rows: 61915
    })
})

In [111]:
# Preprocess and calculate length
data['train'] = data['train'].map(calculate_length, batched=False)
data['train'] = data['train'].map(preprocess_article, batched=False)

In [112]:
# Split dataset
raw_datasets = data['train'].train_test_split(train_size=0.8, seed=42)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['article', 'category', 'word_count'],
        num_rows: 49532
    })
    test: Dataset({
        features: ['article', 'category', 'word_count'],
        num_rows: 12383
    })
})

In [113]:
# Tokenize dataset
categories = list(set(data['train']['category']))
category_to_id = {cat: idx for idx, cat in enumerate(categories)}
id_to_category = {idx: cat for cat, idx in category_to_id.items()}

model_name = "rasyosef/bert-small-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_datasets = raw_datasets.map(lambda x: tokenize_function(x, tokenizer, category_to_id))

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

# Set format for datasets
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/12383 [00:00<?, ? examples/s]

In [114]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(categories),
    id2label = {i: lbl for i, lbl in enumerate(categories)},
    label2id = {lbl: i for i, lbl in enumerate(categories)},
    device_map="cuda"
)


embedding_layer = model.base_model.embeddings
print(f"Embedding layer: {embedding_layer}")
print(f"Embedding details: {embedding_layer.word_embeddings.weight.shape}")

print(f"Model configuration: {model.config}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at rasyosef/bert-small-amharic and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding layer: BertEmbeddings(
  (word_embeddings): Embedding(24576, 512, padding_idx=0)
  (position_embeddings): Embedding(512, 512)
  (token_type_embeddings): Embedding(2, 512)
  (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
Embedding details: torch.Size([24576, 512])
Model configuration: BertConfig {
  "_name_or_path": "rasyosef/bert-small-amharic",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "id2label": {
    "0": "Others",
    "1": "Local News",
    "2": "Sports",
    "3": "Entertainment",
    "4": "Business",
    "5": "International News",
    "6": "Politics"
  },
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "label2id": {
    "Business": 4,
    "Entertainment": 3,
    "International News": 5,
    "Local News": 1,
    "Others": 0,
    "Politics": 6,
 

## Model evaluation before finetuning 

In [115]:
before_finetuning_predictions = generate_predictions(model, tokenized_datasets, device="cuda", id_to_category=id_to_category,num_samples=5)
before_finetuning_predictions

Unnamed: 0,article,predicted_labels,true_labels,true_category,predicted_category
0,ለዘንድሮ የትንሳኤ በአል የ3ሺ በሬዎችና የ1ሺ በግና ፍየል እርድ እንደሚ...,0,tensor(1),Local News,Others
1,ደቡብ አፍሪካ የኮሮናቫይረስን መዛመት ለመቆጣጠር ስትል ለሶስት ወራት ያህ...,4,tensor(5),International News,Business
2,ሳላዲን ሰኢድ ከጥቂት ደጋፊዎች ጋር በተፈጠረ ግብግብ እጁ ላይ ጉዳት ደር...,0,tensor(2),Sports,Others
3,በሚድሮክ ቴክኖሎጂ ግሩፕ ስር የሚገኘው ሁዳ ሪል ስቴት በመሀል ፒያሳ መን...,3,tensor(4),Business,Entertainment
4,ዶር አብይ አህመድ አሜሪካን በመጭው ወር ሲጎበኙ በሚያደርጉላቸው አቀባበል...,3,tensor(5),International News,Entertainment


In [116]:
# Evaluate the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_before_metrics = evaluate_model(model, tokenized_datasets, data_collator, device,'test')
print(test_before_metrics)

{'accuracy': 0.09157716223855286, 'precision': 0.15273352229252438, 'recall': 0.09157716223855286, 'f1': 0.07619819080073575}


In [118]:
# Evaluate the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_before_metrics = evaluate_model(model, tokenized_datasets, data_collator, device, "train")
print(train_before_metrics)

{'accuracy': 0.09052733586368408, 'precision': 0.13258395920569163, 'recall': 0.09052733586368408, 'f1': 0.07701403982450998}


In [119]:
training_args = TrainingArguments(
    output_dir=model_name + "-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
    seed=42,
)

In [120]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6022,0.424701,0.83865,0.840363,0.83865,0.838162
2,0.3861,0.371743,0.862392,0.86436,0.862392,0.861875
3,0.3267,0.359084,0.866349,0.867202,0.866349,0.865774
4,0.2864,0.343543,0.874909,0.874195,0.874909,0.874351
5,0.2628,0.345294,0.875071,0.874773,0.875071,0.874665


TrainOutput(global_step=3870, training_loss=0.3728362327398256, metrics={'train_runtime': 2141.8686, 'train_samples_per_second': 115.628, 'train_steps_per_second': 1.807, 'total_flos': 9796823106416640.0, 'train_loss': 0.3728362327398256, 'epoch': 5.0})

## Model evaluation after finetuning 

In [121]:
after_finetuning_predictions = generate_predictions(model, tokenized_datasets, device="cuda", id_to_category=id_to_category,num_samples=5)
after_finetuning_predictions

Unnamed: 0,article,predicted_labels,true_labels,true_category,predicted_category
0,ለዘንድሮ የትንሳኤ በአል የ3ሺ በሬዎችና የ1ሺ በግና ፍየል እርድ እንደሚ...,4,tensor(1),Local News,Business
1,ደቡብ አፍሪካ የኮሮናቫይረስን መዛመት ለመቆጣጠር ስትል ለሶስት ወራት ያህ...,5,tensor(5),International News,International News
2,ሳላዲን ሰኢድ ከጥቂት ደጋፊዎች ጋር በተፈጠረ ግብግብ እጁ ላይ ጉዳት ደር...,2,tensor(2),Sports,Sports
3,በሚድሮክ ቴክኖሎጂ ግሩፕ ስር የሚገኘው ሁዳ ሪል ስቴት በመሀል ፒያሳ መን...,1,tensor(4),Business,Local News
4,ዶር አብይ አህመድ አሜሪካን በመጭው ወር ሲጎበኙ በሚያደርጉላቸው አቀባበል...,5,tensor(5),International News,International News
