In [1]:

"""Loading accelerator for later use-case """
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
# Both the model and the optimizer state dicts are configured as "full" state
# dicts, offloaded to CPU and not restricted to rank 0 (rank0_only=False).
model_state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
optim_state_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=model_state_cfg,
    optim_state_dict_config=optim_state_cfg,
)

# The accelerator is created here and applied to the model later in the notebook.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

Load Base Model
Now, we load the Mistral 7B base model using 4-bit quantization.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
base_model_id = "mistralai/Mistral-7B-v0.1"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the base checkpoint with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
)

# Loading the dataset from the Hugging Face repository
1. train set: used for training the model
2. validation set: used for validating the model
3. test set: used for testing the model and checking its performance

The data is split into 70% for training and 30% for testing and validation; that 30% is further split into 50% test set and 50% validation set.


In [7]:
from datasets import load_dataset

# Fetch the 'Tvsybkzkmapab/Amharic_ad_generation' dataset from the Hub
dataset = load_dataset('Tvsybkzkmapab/Amharic_ad_generation')

# Pull out the three named splits
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

# Preview the first five rows of every split, in train / validation / test order
for split_title, split_rows in (
    ("Training", train_dataset),
    ("Validation", validation_dataset),
    ("Test", test_dataset),
):
    print(f"\n{split_title} Dataset:")
    print(split_rows[:5])


Training Dataset:
{'text': ['ADVERTISMENT አፊያ ሙሀመድ ከፍተኛ የባህል ሕክምና እና ዘመናዊ የዋግምት አገልግሎት የምንሰጣቸው የባህል ህክምናዎች ➢ ለውጭና ለውስጥ ኪንታሮት ➢ ለማድያት ➢ ለሱኳር በሽታ ➢ለጉበት(ለወፍ በሽታ) ➢ለጨጎራ ህመም ➢ለስፈተወሲብ ➢ለደም ግፊት ➢ለአስም ወይም ሳይነስ ➢ለሚጥል በሺታ ➢ ለእሪህና ቁርጥማት ➢ለራስ ህመም (ማይግሪን) ➢ለቺፌ ና ለጭርት ➢ለቋቁቻና ፎረፎር ➢ለእጢና ለእባጭ ➢ለወገብ ህመም ➢ለመካንነት ለወድም ለሴትም ➢ለጆሮና ለአይን ህመም ➢ለሆድ ህመም ➢ዘመናዊ የዋግምት አገልግሎት በተቋማችን እንሰጣለን። 👉ከኢትዮጵያ ባህላዊ ህክምና አዋቂዎች ማህበር በዘርፉ ህጋዊ የባህል ህክምና ፍቃድ ያለን ነን። አድራሻ:አዲስ አበባ አየር ጤና ስልክ ቁጥር 📲0927506650 📲0987133734 📲0939605455 ቴሌግራም ቻናላችን ntvE5NmM0', 'ከአዝናኝ ጨዋታዎች እና ሽልማቶች ጋር በያላችሁበት እየደረስን ነው፡፡ ዕድሉ አያምልጣችሁ፤ አረንጓዴው የሳፋሪኮም አውቶብሳችን ባለበት በኩል ስታልፉ ተቀላቀሉን በአብሮነት ቆንጆ ጊዜ እናሳልፍ!\n\n#Gursha\n#SafaricomEthiopia #FurtherAheadTogether', 'ይነበብ! ቴዲ አፍሮ ከአሜሪካ መልክት ለኢትዮጵያውያን አስተላላፊ። ዶክተር አብይን ሀገር ውስጥ ሆኜ ባለመደገፌ በተሰምቶኛል።', 'በሳፋሪኮም ምቹ የድምጽ ፣ የፅሁፍ መልዕክት እና ፈጣን 4G የኢንተርኔት ድህረ ክፍያ ጥቅሎች ያለሃሳብ እንስራ ፤ ንግዳችንን እናሳድግ!\n\nለአገልግሎቱ ለመመዝገብ የሳፋሪኮምን ድህረ - ገፅ እንጎብኝ፣ ወደ 0700 755 755 or 0700 700 755 እንደውል ወይም ወደ enterprisesupport@safaricom.et ኢሜይል እንላክ።\n\n#Safarico

Tokenization steps using a pretrained tokenizer from Hugging Face
(iocuydi/llama-2-amharic-3784m)

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the pretrained Amharic tokenizer from the Hugging Face Hub.
# NOTE(review): this tokenizer comes from a different checkpoint than the
# Mistral base model loaded earlier. If its vocabulary is larger than the
# model's embedding table, some token ids will be out of range — confirm the
# sizes match (or resize the model embeddings) before training.
tokenizer = AutoTokenizer.from_pretrained("iocuydi/llama-2-amharic-3784m")

tokenizer.model_max_length = 512  # Maximum sequence length
tokenizer.padding_side = "left"   # Pad on the left side of each sequence
tokenizer.add_eos_token = True    # Append the EOS token to every encoded sequence

# A pad token has to exist BEFORE the first padded call: tokenize() pads with
# padding="max_length", which raises if the tokenizer has no pad token.
# EOS is reused as the pad token, matching the assignment made again in the
# training cell.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
def tokenize(prompt):
    """Encode ``prompt`` with the notebook-level ``tokenizer`` and attach labels.

    The text is truncated and padded to exactly 512 tokens. ``labels`` is a
    copy of ``input_ids`` (causal-LM setup: the targets are the inputs).

    Returns the tokenizer output with the extra ``"labels"`` entry.
    """
    encoded = tokenizer(prompt, padding="max_length", truncation=True, max_length=512)
    # Copy so that later edits to the labels do not touch the input ids.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

Build the instruction prompt for the model: it contains the advertisement text,
the list of possible categories, and the target label, giving the model the context it needs.

In [None]:
def generate_and_tokenize_prompt(data_point):
    """Render one dataset row into the classification prompt and tokenize it.

    Reads ``data_point["text"]`` (the advertisement) and
    ``data_point["Human_Label"]`` (the target categories), fills them into the
    prompt template, and returns the result of ``tokenize`` on that prompt.
    """
    ad_text = data_point["text"]
    target_labels = data_point["Human_Label"]

    # The prompt lists the advertisement, the closed set of categories, and the answer.
    prompt = f"""Given an Amharic advertisement sentence:

"{ad_text}"

What are the main categories (tags) associated with this advertisement?

Possible categories are: ['healthcare &pharma', 'Telecom', 'Media', 'financial service', 'consumer products', 'computing device', 'Realstate', 'retail', 'training', 'Entertainment', 'software development', 'Other']

Target categories: "{target_labels}"
"""

    return tokenize(prompt)

In [None]:
# Render and tokenize every row of the training split, then of the validation split.
tokenized_train_dataset = train_dataset.map(function=generate_and_tokenize_prompt)
tokenized_val_dataset = validation_dataset.map(function=generate_and_tokenize_prompt)

Set Up LoRA
Now, we prepare the model for fine-tuning by applying LoRA adapters to the linear layers of the model.

In [None]:
from peft import prepare_model_for_kbit_training
# Turn on gradient checkpointing first, then hand the model to PEFT's k-bit
# preparation helper; the call order is kept exactly as written.
model.gradient_checkpointing_enable()
# `model` is rebound to whatever the helper returns.
# NOTE(review): presumably this readies the 4-bit quantized model for training
# (going by the helper's name) — confirm details against the PEFT docs.
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once, sums ``numel()`` over all
    parameters and over those with ``requires_grad`` set, and prints both
    totals plus the trainable percentage on a single line.
    """
    # Materialize (size, trainable?) pairs so the parameters are visited only once.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
from peft import LoraConfig, get_peft_model
# Names of the linear layers that receive LoRA adapters, plus the LM head.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "lm_head",
]

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the base model with the adapters and report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

Run Training
In this step, we start fine-tuning the model. You can adjust the training parameters according to your needs.

In [None]:
# When more than one GPU is visible, set both parallelism flags on the model.
has_multiple_gpus = torch.cuda.device_count() > 1
if has_multiple_gpus:
    model.is_parallelizable = model.model_parallel = True

In [None]:
import transformers
from datetime import datetime
import sentencepiece as spm



project = "Mistral_Amharic-finetuning"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

# EOS doubles as the padding token; the collator below reads it from the tokenizer.
tokenizer.pad_token = tokenizer.eos_token

training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # Timestamped name of the W&B run
    report_to="wandb",            # Comment this out if you don't want to use Weights & Biases
    logging_dir="./logs",         # Directory for storing logs
    # Optimisation
    max_steps=1000,
    warmup_steps=5,
    learning_rate=2.5e-5,         # Want about 10x smaller than the Mistral learning rate
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per device per optimizer step
    optim="paged_adamw_8bit",
    bf16=True,
    # Logging, checkpointing and evaluation all run on the same 50-step cadence
    logging_steps=50,
    save_strategy="steps",
    save_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    do_eval=True,                 # Run evaluation on eval_dataset
)

# mlm=False selects the causal-LM collation mode rather than masked-LM.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=causal_lm_collator,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()