In [None]:
import os
from random import randrange
from functools import partial
import torch
# `Dataset` is provided by the `datasets` package; `transformers` does not export it.
from datasets import Dataset, load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

#import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
#from trl import SFTTrainer
from contextlib import nullcontext
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    TrainerCallback,
    default_data_collator,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    TaskType,
    #prepare_model_for_int8_training,
    PeftModel
)

In [None]:
from transformers import pipeline
# Build a fill-mask pipeline on the Amharic BERT checkpoint and run it on one
# sentence containing a [MASK] token. The call is the cell's last expression,
# so the notebook displays its result.
unmasker = pipeline(
    task='fill-mask',
    model='rasyosef/bert-small-amharic',
)
unmasker(
    "ከሀገራቸው ከኢትዮጵያ ከወጡ ግማሽ ምዕተ [MASK] ተቆጥሯል።"
)

In [None]:
def load_model(model_path, device):
    """Load a LLaMA causal-LM checkpoint and its tokenizer.

    Args:
        model_path: Path or identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.
        device: Device the whole model should live on (for example
            ``"cuda:0"``, ``"cpu"`` or a ``torch.device``). When ``None``,
            placement is left to ``device_map="auto"``.

    Returns:
        A ``(model, tokenizer)`` tuple. The weights are loaded as float16 and
        the tokenizer's pad token is set to its EOS token.
    """
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
    # Reuse EOS as the padding token so batched inputs can be padded.
    tokenizer.pad_token = tokenizer.eos_token

    # Place the weights once, at load time. Asking for device_map="auto" and
    # then calling .to(device) on the result gives two conflicting placement
    # instructions, so the requested device is handed to from_pretrained
    # directly and "auto" is only used when no device was requested.
    model = LlamaForCausalLM.from_pretrained(
        model_path,
        device_map="auto" if device is None else device,
        torch_dtype=torch.float16,
    )
    # Keep the model config in step with the tokenizer's padding choice.
    model.config.pad_token_id = tokenizer.pad_token_id
    return model, tokenizer

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

def tokenize_function(tokenizer, examples, truncation=True, max_length=512, text_column="article"):
    """Tokenize one batch of examples, padding every sequence to ``max_length``.

    Args:
        tokenizer: Callable applied to the batch's texts; it receives the
            ``padding``, ``truncation`` and ``max_length`` keyword arguments.
        examples: Mapping from column name to the batch's values.
        truncation: Forwarded to the tokenizer as ``truncation``.
        max_length: Forwarded to the tokenizer as ``max_length``.
        text_column: Name of the column in ``examples`` that holds the text.
            Defaults to ``"article"``.

    Returns:
        Whatever the tokenizer returns for the selected column.
    """
    return tokenizer(
        examples[text_column],
        padding="max_length",
        truncation=truncation,
        max_length=max_length,
    )

def tokenize_dataset(dataset, tokenizer, batch_size=16, truncation=True, max_length=512):
  """
  Tokenize a dataset in batches using ``tokenize_function``.

  Args:
      dataset: The dataset to tokenize; it must provide a ``map`` method that
          accepts ``batched`` and ``batch_size`` (as Hugging Face Datasets do).
      tokenizer: The tokenizer to use for tokenization.
      batch_size: The batch size for tokenization (default: 16).
      truncation: Whether to truncate sequences to a maximum length (default: True).
      max_length: The maximum length for sequences (default: 512).

  Returns:
      The result of ``dataset.map``, i.e. the tokenized dataset.
  """

  # ``map`` calls its function with the batch only, so the tokenizer and the
  # tokenization options have to be bound before the function is handed over.
  tokenize_batch = partial(
      tokenize_function,
      tokenizer,
      truncation=truncation,
      max_length=max_length,
  )
  return dataset.map(tokenize_batch, batched=True, batch_size=batch_size)

In [None]:
import evaluate
import numpy as np

def compute_metrics(predict):
  """
  Score classification predictions with accuracy, precision, recall and F1.

  Args:
      predict: A ``(logits, labels)`` pair. The predicted class of each row is
          the argmax of ``logits`` over the last axis.

  Returns:
      A dict with the keys "accuracy", "precision", "recall" and "f1".
      Precision, recall and F1 are macro-averaged.
  """
  # All four metrics are loaded up front, in the order they are reported.
  scorers = {
      name: evaluate.load(name)
      for name in ("accuracy", "precision", "recall", "f1")
  }

  raw_scores, references = predict
  predicted_classes = np.argmax(raw_scores, axis=-1)

  results = {}
  for name, scorer in scorers.items():
    # Accuracy is computed without an averaging mode; the rest use 'macro'.
    extra = {} if name == "accuracy" else {"average": 'macro'}
    outcome = scorer.compute(
        predictions=predicted_classes,
        references=references,
        **extra,
    )
    results[name] = outcome[name]
  return results


In [None]:
def train_model(model, training_args, train_dataset, eval_dataset, compute_metrics):
  """
  Train a model using the Trainer class from Transformers.

  Args:
      model: The model to be trained.
      training_args: Training arguments (a Transformers TrainingArguments instance).
      train_dataset: Dataset passed to the Trainer as ``train_dataset``.
      eval_dataset: Dataset passed to the Trainer as ``eval_dataset``.
      compute_metrics: A function that computes evaluation metrics; passed to
          the Trainer as ``compute_metrics``.

  Returns:
      The Trainer instance after ``train()`` has finished, so the caller can
      keep using it (for example to evaluate or save the model).
  """

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=eval_dataset,
      compute_metrics=compute_metrics,
  )

  trainer.train()
  return trainer


In [None]:
dataset = load_dataset("csv", data_files="../data/Amharic_corpus_merged_2023-04-16.csv")

# load_dataset returns a DatasetDict; a single CSV passed via `data_files`
# lands in its "train" split. DatasetDict has no pandas-style `.iloc`, and
# len() of it counts splits rather than rows, so the split is taken on the
# underlying Dataset instead.
corpus = dataset["train"]
split_index = int(len(corpus) * 0.8)

# Rows are split in file order, without shuffling.
# NOTE(review): if the CSV is grouped by category or source, the two parts
# will have different label distributions; consider a shuffled split.
train_data = corpus.select(range(split_index))  # 80% for training
test_data = corpus.select(range(split_index, len(corpus)))

In [None]:
from transformers import AutoModelForSequenceClassification

model_name = "rasyosef/bert-small-amharic"

feature_name = "article"
label_name = "category"

# Map every category value to a contiguous integer id. Both parts of the data
# are scanned so that a category appearing only in the evaluation rows still
# gets an id.
label_values = set(train_data[label_name]) | set(test_data[label_name])
assert None not in label_values, f"rows with a missing '{label_name}' must be dropped first"
label_names = sorted(label_values)
label2id = {name: idx for idx, name in enumerate(label_names)}

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label={idx: str(name) for name, idx in label2id.items()},
    label2id={str(name): idx for name, idx in label2id.items()},
)


def encode_labels(examples):
    """Add an integer ``labels`` column derived from the category column."""
    return {"labels": [label2id[value] for value in examples[label_name]]}


# Preprocess and tokenize the training and test data
train_data_tokenized = tokenize_dataset(train_data, tokenizer)
test_data_tokenized = tokenize_dataset(test_data, tokenizer)

# Create training and validation datasets. The tokenizer output columns are
# kept under their own names and an integer `labels` column is added, because
# those are the names the model's forward pass looks for. With the default
# TrainingArguments(remove_unused_columns=True) the Trainer drops the raw
# text and category columns itself.
train_dataset = train_data_tokenized.map(encode_labels, batched=True)
eval_dataset = test_data_tokenized.map(encode_labels, batched=True)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_steps=500,
    evaluation_strategy="epoch",
)


train_model(model, training_args, train_dataset, eval_dataset, compute_metrics)