### INKUBALM: FINETUNING AFRICA'S FIRST SMALL LANGUAGE MODEL

The Lelapa AI Buzuzu-Mavi Challenge is a challenge to make the SLM released by Lelapa AI (Inkuba LM) smaller and smarter. Participants are able to improve their score by improving the average model performance, by making the model smaller, or by doing both. Improving average model performance involves making the model perform better for one or two languages across one or more tasks. The languages in question are Swahili and Hausa, and the tasks include sentiment analysis, AfriXNLI (true and false) question answering ability, and machine translation from English to the language in question.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/train-data/inkuba_combined_RAW.csv
/kaggle/input/train-data/translation_train.csv
/kaggle/input/train-data/train_Mega_v2.csv
/kaggle/input/train-data/augmented_sentiment.csv
/kaggle/input/train-data/augmented_translation.csv
/kaggle/input/train-data/nli_train.csv
/kaggle/input/train-data/inkuba_instruction_tuned_RAW.csv
/kaggle/input/train-data/combined_aug_train.csv
/kaggle/input/train-data/sentiment_train.csv
/kaggle/input/train-data/augmented_qa.csv


In [21]:
! pip install -U peft bitsandbytes accelerate -q

In [22]:
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import torch

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("HF_TOKEN")

In [None]:
import pandas as pd
from transformers import pipeline

# read in the datasets
base_dir = "/kaggle/input/train-data/"
sentiment_df  = pd.read_csv(base_dir + "sentiment_train.csv")
qa_df = pd.read_csv(base_dir + "nli_train.csv")
translation_df = pd.read_csv(base_dir + "translation_train.csv")


In [24]:
class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.task_templates = {
            "sentiment": (
                "{instruction}\n"
                "{inputs}\n"
                "Output:"
            ),
            "mmt": (
                "{instruction}\n"
                "{inputs}\n"
                "Output:"
            ),
            "nli": (
                "{premise}\n"
                "{instruction}\n"
                "{inputs}\n"
                "Output:"
            )
}

    def __len__(self):
        return len(self.data)

    def build_prompt(self, example):
        template = self.task_templates[example["task"]]
        return template.format(**example)

    def __getitem__(self, idx):
        example = self.data.iloc[idx]
        prompt = self.build_prompt(example)
        # Use 'targets' instead of 'label' since that's what's in the dataframe
        full_text = prompt + " " + example["targets"]

        # Tokenize everything
        tokenized = self.tokenizer(
            full_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        input_ids = tokenized["input_ids"][0]
        attention_mask = tokenized["attention_mask"][0]

        # Create labels and apply loss masking
        labels = input_ids.clone()

        # Mask everything before the target
        output_start = full_text.index("Output: ") + len("Output: ")
        output_token_start = self.tokenizer(full_text[:output_start], return_tensors="pt")["input_ids"][0]
        prefix_len = len(output_token_start)

        labels[:prefix_len] = -100  # Ignore prompt in loss

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }



In [25]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import pandas as pd

model_name = "lelapa/InkubaLM-0.4B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=token)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    token=token
)

In [26]:
# # Define task-specific configs
# tasks = ["sentiment", "mmt", "translation"]

# data_paths = {
#     "sentiment": "/kaggle/input/train-data/augmented_sentiment.csv",
#     "nli": "/kaggle/input/train-data/augmented_qa.csv",
#     "mmt": "/kaggle/input/train-data/augmented_translation.csv"
# }

# for task in tasks:
#     print(f"\n🔧 Training LoRA adapter for: {task}")

#     # Create a fresh copy of the base model
#     model = get_peft_model(base_model, LoraConfig(
#         r=8,
#         lora_alpha=16,
#         target_modules=["q_proj", "v_proj"],
#         lora_dropout=0.05,
#         bias="none",
#         task_type="CAUSAL_LM"
#     ))
#     model.print_trainable_parameters()

#     # Load and tokenize task-specific dataset
#     df = pd.read_csv(data_paths[task])
    
#     dataset = InstructionDataset(df, tokenizer)

#     training_args = TrainingArguments(
#         output_dir=f"/kaggle/working/outputs/lora_{task}_adapter",
#         per_device_train_batch_size=4,
#         gradient_accumulation_steps=4,
#         num_train_epochs=10,
#         learning_rate=3e-4,
#         fp16=True,
#         logging_steps=20,
#         save_strategy="no",
#         report_to="none"
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=dataset
#     )
#     trainer.train()

#     # Save task-specific adapter
#     model.save_pretrained(f"/kaggle/outputs/lora_{task}_adapter")


In [27]:
print(f"🚀 Detected {torch.cuda.device_count()} GPU(s)")


🚀 Detected 2 GPU(s)


In [None]:
# 
tasks = ["sentiment", "mmt", "nli"]

data_paths = {
    "sentiment": "/kaggle/input/train-data/augmented_sentiment.csv",
    "nli": "/kaggle/input/train-data/augmented_qa.csv",
    "mmt": "/kaggle/input/train-data/augmented_translation.csv"
}

import torch

print(f"🚀 Detected {torch.cuda.device_count()} GPU(s)")

for task in tasks:
    print(f"\n🔧 Training LoRA adapter for: {task}")

    # Create a fresh LoRA-wrapped model
    model = get_peft_model(base_model, LoraConfig(
        r=32,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    ))

    model.print_trainable_parameters()

    # Load and tokenize dataset
    df = pd.read_csv(data_paths[task])
    dataset = InstructionDataset(df, tokenizer)

    training_args = TrainingArguments(
        output_dir=f"/kaggle/working/outputs/lora_{task}_adapter",
        per_device_train_batch_size=3,
        gradient_accumulation_steps=2,
        num_train_epochs=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=20,
        save_strategy="no",
        report_to="none",
        dataloader_pin_memory=True,
        dataloader_num_workers=2,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset
    )

    # 
    trainer.train()

    # Save adapter weight(s)
    model.save_pretrained(f"/kaggle/working/lora_{task}_adapter")


🚀 Detected 2 GPU(s)

🔧 Training LoRA adapter for: sentiment
trainable params: 2,097,152 || all params: 424,036,352 || trainable%: 0.4946


Step,Training Loss
20,7.7287
40,5.1352
60,4.4692
80,4.2971
100,4.0825
120,3.8491
140,3.7339
160,3.7297
180,3.2551
200,3.5104



Cannot access gated repo for url https://huggingface.co/lelapa/InkubaLM-0.4B/resolve/main/config.json.
Access to model lelapa/InkubaLM-0.4B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in lelapa/InkubaLM-0.4B.



🔧 Training LoRA adapter for: mmt
trainable params: 2,097,152 || all params: 424,036,352 || trainable%: 0.4946


Step,Training Loss
20,9.4963
40,8.3429
60,7.6061
80,7.3421
100,7.2955
120,7.2806
140,7.109
160,7.1887
180,7.1291
200,6.9422



Cannot access gated repo for url https://huggingface.co/lelapa/InkubaLM-0.4B/resolve/main/config.json.
Access to model lelapa/InkubaLM-0.4B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in lelapa/InkubaLM-0.4B.



🔧 Training LoRA adapter for: nli
trainable params: 2,097,152 || all params: 424,036,352 || trainable%: 0.4946


Step,Training Loss
20,9.092
40,6.7068
60,6.1333
80,5.9604
100,5.908
120,5.5616
140,5.6493
160,5.3619
180,5.3847
200,5.3174



Cannot access gated repo for url https://huggingface.co/lelapa/InkubaLM-0.4B/resolve/main/config.json.
Access to model lelapa/InkubaLM-0.4B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in lelapa/InkubaLM-0.4B.


In [None]:
from peft import PeftModel

# Load base model
base = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=token)

# Load adapter for a specific task
task = "sentiment"  # or "sentiment", "translation"
model = PeftModel.from_pretrained(base, f"/kaggle/working/lora_{task}_adapter")
model.eval().to("cuda")

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): VulavulaLlamaForCausalLM(
      (model): VulavulaLlamaModel(
        (embed_tokens): Embedding(61788, 2048)
        (layers): ModuleList(
          (0-7): 8 x VulavulaLlamaDecoderLayer(
            (self_attn): VulavulaLlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
      

In [34]:
import pandas as pd
test_sent = pd.read_parquet("hf://datasets/lelapa/SentimentTest/data/train-00000-of-00001.parquet")
test_nli = pd.read_parquet("hf://datasets/lelapa/XNLITest/data/train-00000-of-00001.parquet")
test_mmt = pd.read_parquet("hf://datasets/lelapa/MTTest/data/train-00000-of-00001.parquet")

test_nli["task"] = "nli"
print(f"Sentiment Columns {test_sent.columns}")
print(f"NLI Columns {test_nli.columns}")
print(f"mmt Columns {test_mmt.columns}")

test_data = pd.concat([test_sent, test_nli, test_mmt])
test_data.head()

Sentiment Columns Index(['ID', 'task', 'langs', 'data_source', 'instruction', 'inputs',
       'targets'],
      dtype='object')
NLI Columns Index(['ID', 'langs', 'premise', 'inputs', 'instruction', 'targets', 'task'], dtype='object')
mmt Columns Index(['ID', 'task', 'langs', 'data_source', 'instruction', 'inputs',
       'targets'],
      dtype='object')


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,ID,task,langs,data_source,instruction,inputs,targets,premise
0,ID_f3c74c7b_sentiment_test__hausa,sentiment,hausa,afrisenti,Gano ra'ayin da aka bayyana a cikin wannan rub...,@user ynxu fha da kanada kudi shikenan duk kay...,,
1,ID_aad19dbf_sentiment_test__hausa,sentiment,hausa,naijasenti,Za ka iya tantance yanayin wannan rubutu? Bi w...,@user alhamdulillah babu abinda zamuce sai god...,,
2,ID_f6de0381_sentiment_test__hausa,sentiment,hausa,afrisenti,Za ka iya tantance yanayin wannan rubutu? Bi w...,@user ke ina ruwan ki 😬 ba harkar film bane ba,,
3,ID_cbec84fe_sentiment_test__swahili,sentiment,swahili,afrisenti,Changanua mawazo ya matini yanayofuata na uain...,matokeo chanya liverais magufuli katika uzindu...,,
4,ID_885caf5c_sentiment_test__hausa,sentiment,hausa,naijasenti,Tantance ra’ayin wannan rubutu kuma a rarraba ...,@user 🤣 akwai lauje cikin nadi gaskiya.,,


In [None]:
import torch

def generate_response(prompt, max_new_tokens=128):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # Use greedy first
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,
        top_p=0.9
    )
    # Slice out the generated part only
    generated = output_ids[0][input_ids.shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


def build_prompt(row):
    sent_nli_prompt = f"Below is an instruction to complete a task. Make sure to produce a one word response or label. Do not give explanations or preamble."
    if row["task"] == "sentiment":
        return sent_nli_prompt + f"{row['instruction']}\n{row['inputs']}\nOutput:"
    elif row["task"] == "mmt":
        return f"{row['instruction']}\n{row['inputs']}\nOutput:"
    elif row["task"] == "nli":
        return sent_nli_prompt + f"{row['premise']}\n{row['instruction']}\n{row['inputs']}\nOutput:"
    else:
        raise ValueError(f"Unknown task: {row['task']}")


row = test_data.iloc[40]
prompt = build_prompt(row)
print("Prompt:", prompt)
print("generated_response", generate_response(prompt))

Prompt: Below is an instruction to complete a task. Make sure to produce a one word response or label. Do not give explanations or preamble.Tafadhali tambua mawazo yaliyoonyeshwa kwenye matini haya kwa kutegemea miongozo ifuatayo: Chanya: iwapo matini yanadokeza mawazo, mtazamo na hali chanya ya kihisia. Hasi: iwapo matini yanadokeza mawazo au hisia hasi. Wastani: iwapo matini hayadokezi lugha chanya au hasi kwa njia ya moja kwa moja au isiyo ya moja kwa moja.
naomba kusema wazi kuwa 1 hedhi si jambo la ziada ambalo mwanamke anajiamulia iwe sehemu ya maisha yake au l
Output:
generated_response Wastani: Wastani: wanastani hichoacho ameutimizwa katika maji ya serengeti ili tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tatizo la maji tat


In [None]:
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=token)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
tokenizer.pad_token = tokenizer.eos_token

responses = []

for _, row in test_data.iterrows():
    task = row['task']
    model = PeftModel.from_pretrained(base, f"/kaggle/working/lora_{task}_adapter")
    model.eval().to("cuda")
    prompt = build_prompt(row)
    response = generate_response(prompt)
    responses.append({
        "ID": row["ID"],
        "task": row["task"],
        "Response": response
    })


ss = pd.DataFrame(responses)
ss.head()

In [None]:
def map_hausa_sent(response):
    response = response.lower()
    if "kyakkyawa" in response:
        return 0
    if "tsaka" in response:
        return 1
    if "korau" in response:
        return 2
        
def map_swahili_sent(response):
    response = response.lower()
    if "chanya" in response:
        return 0
    if "wastani" in response:
        return 1
    if "hasi" in response:
        return 2
    
def map_xnli(response):
    response = response.lower()
    if "true" in response:
        return 0
    if "neutral" in response:
        return 1
    if "false" in response:
        return 2
    if "neither" in response:
        return 1

In [None]:
clean_labels = []
for _, row in ss.iterrows():
    response = row['Response']
    task = row['task']
    if task == "sentiment":
        if "swahili" in row["ID"]:
            predicted_label = map_swahili_sent(response)
            clean_labels.append(predicted_label)
        if "hausa" in row["ID"]:
            predicted_label = map_hausa_sent(response)
            clean_labels.append(predicted_label)
    if task == "nli":
        predicted_label = map_xnli(response)
        clean_labels.append(predicted_label)
    if task == "mmt":
        clean_labels.append(response)

In [None]:
clean_ss = pd.DataFrame({
    "ID": ss['ID'].values,
    "Response": clean_labels
})
