In [1]:
!pip install -U trl bitsandbytes

Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading trl-0.17.0-py3-none-any.whl (348 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.0/348.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: trl, bitsandbytes
Successfully installed bitsandbytes-0.45.5 trl-0.17.0


In [2]:
import os
import warnings
warnings.filterwarnings("ignore")
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, concatenate_datasets, Dataset, Value
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("HFtoken")

# Define reusable functions for dataset loading and processing

In [4]:
def load_dataset_by_tag(dataset_type, tag, split='train'):
    return load_dataset(f"{dataset_type}{tag}", split=split)

def load_and_combine_datasets(tag, split='train'):
    """
    Load and combine multiple datasets with all unique columns (union).
    Empty strings are used for missing values.
    
    Args:
        tag (str): Tag for the datasets (Train, Test)
        split (str): Split to load (train, test)
        
    Returns:
        Dataset: Combined dataset with all unique columns
    """
    se_dataset = load_dataset_by_tag("lelapa/Sentiment", tag, split)
    mt_dataset = load_dataset_by_tag("lelapa/MT", tag, split)
    xn_dataset = load_dataset_by_tag("lelapa/XNLI", tag, split)

    # Identify all unique columns (union)
    all_columns = list(set(se_dataset.column_names) | 
                      set(mt_dataset.column_names) | 
                      set(xn_dataset.column_names))
    print(f"All Columns: {all_columns}")

    # Function to ensure dataset has all columns, filling missing ones with empty strings
    def ensure_all_columns(dataset, all_cols):
        # Add each missing column one by one
        for col in all_cols:
            if col not in dataset.column_names:
                # Create array of empty strings with the same length as the dataset
                empty_column = [""] * len(dataset)
                dataset = dataset.add_column(col, empty_column)
        
        return dataset

    # Ensure all datasets have all columns
    se_dataset = ensure_all_columns(se_dataset, all_columns)
    mt_dataset = ensure_all_columns(mt_dataset, all_columns)
    xn_dataset = ensure_all_columns(xn_dataset, all_columns)

    # Make sure 'targets' column is string type if it exists in all datasets
    if "targets" in all_columns:
        se_dataset = se_dataset.cast_column("targets", Value("string"))
        mt_dataset = mt_dataset.cast_column("targets", Value("string"))
        xn_dataset = xn_dataset.cast_column("targets", Value("string"))

    # Concatenate datasets
    combined_dataset = concatenate_datasets([se_dataset, mt_dataset, xn_dataset])

    return combined_dataset


In [5]:
train = load_and_combine_datasets('train')
pd.DataFrame(train)

README.md:   0%|          | 0.00/485 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/39.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/485 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/72.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/600 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/447 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/35.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

All Columns: ['task', 'data_source', 'instruction', 'targets', 'premise', 'inputs', 'langs', 'ID']


Casting the dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Unnamed: 0,ID,task,langs,data_source,instruction,inputs,targets,premise
0,ID_6aba33a1_sentiment_ dev_hausa,sentiment,hausa,afrisenti,Za ka iya tantance yanayin wannan rubutu? Bi w...,@user @user allah ya tsayyaba yar uwa 🎂 😍,Kyakkyawa,
1,ID_ce64d307_sentiment_ dev_hausa,sentiment,hausa,naijasenti,Da fatan za a gano ra'ayin da ke cikin wannan ...,@user intenet a masallachi😭😭😭 wani salo ne na ...,Tsaka-tsaki,
2,ID_dfb02831_sentiment_ dev_swahili,sentiment,swahili,swahili_tweet,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,picha mbunge wa kilombero peter lijualikali ak...,Wastani,
3,ID_2efc9515_sentiment_ dev_hausa,sentiment,hausa,afrisenti,Gano ra'ayin da aka bayyana a cikin wannan rub...,@user @user @user @user @user hhh amma rahama ...,Tsaka-tsaki,
4,ID_ad1d9888_sentiment_ dev_swahili,sentiment,swahili,afrisenti,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,swali zuri sana nawatafuta wajuzi wa mambo wat...,Wastani,
...,...,...,...,...,...,...,...,...
1395,ID_085354e1_dev_afrixnli_swa,,swa,,"Is the following question True, False or Neither?",Maduka ya habari ya kitaifa hufanya maeneo yet...,1,tahadhari kuhusu jinsi habari za kitaifa zinav...
1396,ID_586e104a_dev_afrixnli_swa,,swa,,"Is the following question True, False or Neither?",Kurasa zilihusisha wanachama na maafisa wa kaw...,0,Uanachama ulijumuisha kati ya wanaume wazima ...
1397,ID_b871ea53_dev_afrixnli_hau,,hau,,"Is the following question True, False or Neither?",Ban damu ba da abinda labarun ƙasa ke nuni cik...,2,Ka damu da yadda labarun ƙasa ke shafar unguwa...
1398,ID_70aae970_dev_afrixnli_hau,,hau,,"Is the following question True, False or Neither?",Ya zabi ƙin kama hannayen sa saboda anyi musu ...,0,Kuma mani rashin mutunci dan bazan goyi banyan...


In [6]:
def extract_task_from_id(id_string):
    #Extract task type from ID string.
    task = id_string.split('_')[3]
    # Handle special case for sentiment task
    return 'sentiment' if task == ' dev' else task

def balance_target_lengths(df, task_column='task', reference_task='mt', repetition_factor=11):
    """
    Balance target sequence lengths by repeating shorter targets.
    
    Args:
        df (DataFrame): DataFrame containing task and targets columns
        task_column (str): Name of the task column
        reference_task (str): Task with longer sequences to use as reference
        repetition_factor (int): Number of times to repeat shorter sequences
        
    Returns:
        DataFrame: DataFrame with balanced target lengths
    """
    df_balanced = df.copy()
    
    for task in df_balanced[task_column].unique():
        if task != reference_task:
            mask = df_balanced[task_column] == task
            df_balanced.loc[mask, 'targets'] = df_balanced.loc[mask, 'targets'].apply(
                lambda x: ' '.join([x] * repetition_factor)
            )
    
    return df_balanced



#Format examples for instruction tuning.
def formatting_prompts_func(example):
    premise = example['premise']
    premise = premise+'\n' if len(premise) else ''
    if example['targets'] is not None:
        return f"### Instruction: {example['instruction']}\n### Input: {premise}{example['inputs']}\n### Response: {example['targets']}"
    return f"### Instruction: {example['instruction']}\n### Input: {premise}{example['inputs']}\n### Response:"


def setup_model_and_tokenizer(model_name, use_4bit=True):
    
    #Set up model and tokenizer for QLoRA fine-tuning if argument use_4bit = True.

    # Define BitsAndBytes config for quantization
    if use_4bit:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
    else:
        bnb_config = None
    
    # Load model with quantization config
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        token=token,
    )
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
    tokenizer.pad_token = tokenizer.eos_token
    
    return model, tokenizer, bnb_config


def apply_lora_adapters(model, r=8, lora_alpha=16, dropout=0.05):
    
    # Define LoRA Config
    lora_config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=dropout,
        bias="none",
        task_type="CAUSAL_LM"
    )
    
    # Apply LoRA adapters to model
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    
    return model


def setup_trainer(model, dataset, tokenizer, output_dir, num_epochs=6):
    """
    Set up SFTTrainer for direct fine-tuning.
    
    Args:
        model: Model to fine-tune
        dataset: Training dataset
        tokenizer: Tokenizer
        output_dir (str): Output directory for checkpoints
        num_epochs (int): Number of training epochs
        
    Returns:
        SFTTrainer: Trainer object
    """
    # Define response template for proper label masking
    response_template_with_context = "\n### Response:"
    response_template_ids = tokenizer.encode(response_template_with_context, add_special_tokens=False)[2:]
    
    # Data collator for masked LM training
    collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)
    
    # Training arguments
    train_args = SFTConfig(
        output_dir=output_dir,
        max_seq_length=256,
        num_train_epochs=num_epochs,
        save_strategy="epoch",
        optim = 'adamw_bnb_8bit',
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        logging_steps=10,
        save_total_limit=2,
        report_to=[],  # Disable wandb
    )
    
    # Trainer setup
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        args=train_args,
        formatting_func=formatting_prompts_func,
        data_collator=collator,
    )
    
    return trainer


def generate_response(model, tokenizer, prompt, max_new_tokens=20):
    """
    Generate response using fine-tuned model.
    
    Args:
        model: Fine-tuned model
        tokenizer: Tokenizer
        prompt (str): Input prompt
        max_new_tokens (int): Maximum number of tokens to generate
        
    Returns:
        str: Generated response
    """
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = generated_text[len(prompt):].strip()
    
    return response


def encode_sentiment_label(label):
    """
    Encode sentiment label to integer.
    
    Args:
        label (str): Sentiment label
        
    Returns:
        int: Encoded label
    """
    for c, i in enumerate(["Chanya", "Wastani", "Hasi"]):
        if label == i:
            return c
    for c, i in enumerate(["Kyakkyawa", "Tsaka-tsaki", "Korau"]):
        if label == i:
            return c
    return 0


def apply_inference_to_test_data(model, tokenizer, test_dataset):

    df = pd.DataFrame(test_dataset)
    model.eval()
    
    # Apply inference with tqdm progress bar
    tqdm.pandas(desc="Generating Responses")
    df['generated'] = df.progress_apply(
        lambda row: generate_response(model, tokenizer, formatting_prompts_func(row)), 
        axis=1
    )
    
    # Process responses based on task type
    df['Response'] = ''
    
    # Sentiment task
    mask = df.ID.apply(lambda x: 'sentiment' in x)
    df.loc[mask, 'Response'] = df.loc[mask, 'generated'].apply(
        lambda x: encode_sentiment_label(x.strip().split()[0])
    )
    
    # XNLI task
    mask = df.ID.apply(lambda x: 'afrixnli' in x)
    df.loc[mask, 'Response'] = df.loc[mask, 'generated'].apply(
        lambda x: int(x.strip().split()[0])%3 if x.strip().split()[0].isdigit() else 0
    )
    
    # MT task
    mask = df.ID.apply(lambda x: 'mt_' in x)
    df.loc[mask, 'Response'] = df.loc[mask, 'generated']
    
    return df

def display_formatted_examples(df, num_examples=2):
    """
    Display formatted examples for each task.
    
    Args:
        df (DataFrame): DataFrame containing the examples
        num_examples (int): Number of examples to display per task
    """
    for task in df.task.unique():
        print(f"\n\n{'='*40}\nTask: {task}\n{'='*40}")
        mask = df.task == task
        for i, (_, row) in enumerate(df[mask].iterrows()):
            if i >= num_examples:
                break
                
            print(f"\nExample {i+1}:")
            print("-" * 40)
            formatted = formatting_prompts_func(row)
            print(formatted)
            print("-" * 40)

# Load and explore the datasets

In [7]:
print("# Loading datasets")
train_dataset = load_and_combine_datasets("Train")
test_dataset = load_and_combine_datasets("Test")

print("\n# Example from training dataset:")
print(train_dataset[0])

print("\n# Example from test dataset:")
print(test_dataset[0])

# Loading datasets


README.md:   0%|          | 0.00/485 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/39.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/485 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/72.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/600 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/447 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/35.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

All Columns: ['task', 'data_source', 'instruction', 'targets', 'premise', 'inputs', 'langs', 'ID']


Casting the dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/486 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/33.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/300 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/484 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/22.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/300 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/447 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/28.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/300 [00:00<?, ? examples/s]

All Columns: ['task', 'data_source', 'instruction', 'targets', 'premise', 'inputs', 'langs', 'ID']


Casting the dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/300 [00:00<?, ? examples/s]


# Example from training dataset:
{'ID': 'ID_6aba33a1_sentiment_ dev_hausa', 'task': 'sentiment', 'langs': 'hausa', 'data_source': 'afrisenti', 'instruction': 'Za ka iya tantance yanayin wannan rubutu? Bi waɗannan jagororin sharhi: Kyakkyawa: idan rubutu na nuna kyakkyawan tunani, hali, da yanayi. Korau: idan rubutu yana nuna mummunar tunani ko yanayi. Neutral: idan rubutu baya nuna kyakkyawar magana ko mara kyau kai tsaye ko a kaikaice.', 'inputs': '@user @user allah ya tsayyaba yar uwa 🎂 😍', 'targets': 'Kyakkyawa', 'premise': ''}

# Example from test dataset:
{'ID': 'ID_f3c74c7b_sentiment_test__hausa', 'task': 'sentiment', 'langs': 'hausa', 'data_source': 'afrisenti', 'instruction': "Gano ra'ayin da aka bayyana a cikin wannan rubutu. Bin waɗannan jagororin, kyakkyawa yana na rubutu na nufin kyakkyawan tunani, ɗabi'a, da motsin rai. Korau na nuna rubutu na nufin mummunan tunani ko motsin rai. Tsaka-tsaki na nuna rubutu baya nufin magana mai kyau ko mara kyau kai tsaye ko a kaikaice.",

Convert dataset to DataFrame for easier processing

In [8]:
print("# Converting to DataFrame and extracting task types")
train_df = train_dataset.to_pandas()
train_df['task'] = train_df.ID.apply(extract_task_from_id)

print("\n# Dataset distribution by task:")
print(train_df.task.value_counts())

# Converting to DataFrame and extracting task types

# Dataset distribution by task:
task
mt           600
sentiment    400
afrixnli     400
Name: count, dtype: int64


In [9]:
display_formatted_examples(train_df)



Task: sentiment

Example 1:
----------------------------------------
### Instruction: Za ka iya tantance yanayin wannan rubutu? Bi waɗannan jagororin sharhi: Kyakkyawa: idan rubutu na nuna kyakkyawan tunani, hali, da yanayi. Korau: idan rubutu yana nuna mummunar tunani ko yanayi. Neutral: idan rubutu baya nuna kyakkyawar magana ko mara kyau kai tsaye ko a kaikaice.
### Input: @user @user allah ya tsayyaba yar uwa 🎂 😍
### Response: Kyakkyawa
----------------------------------------

Example 2:
----------------------------------------
### Instruction: Da fatan za a gano ra'ayin da ke cikin wannan rubutu bisa ga jagorori masu zuwa: Kyakkyawa: idan rubutu na nuna kyakkyawan tunani, hali, da yanayi. Korau: idan rubutu yana nuna mummunar tunani ko yanayi. Neutral: idan rubutu baya nuna kyakkyawar magana ko mara kyau kai tsaye ko a kaikaice.
### Input: @user intenet a masallachi😭😭😭 wani salo ne na karkatar da masu ibada zuwa wani abu daban amma a raayina bai da mahimmanchi
### Response: Tsa

In [10]:
print("\n# Applying target length balancing fix")
balanced_df = balance_target_lengths(train_df)
display_formatted_examples(balanced_df)


# Applying target length balancing fix


Task: sentiment

Example 1:
----------------------------------------
### Instruction: Za ka iya tantance yanayin wannan rubutu? Bi waɗannan jagororin sharhi: Kyakkyawa: idan rubutu na nuna kyakkyawan tunani, hali, da yanayi. Korau: idan rubutu yana nuna mummunar tunani ko yanayi. Neutral: idan rubutu baya nuna kyakkyawar magana ko mara kyau kai tsaye ko a kaikaice.
### Input: @user @user allah ya tsayyaba yar uwa 🎂 😍
### Response: Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa
----------------------------------------

Example 2:
----------------------------------------
### Instruction: Da fatan za a gano ra'ayin da ke cikin wannan rubutu bisa ga jagorori masu zuwa: Kyakkyawa: idan rubutu na nuna kyakkyawan tunani, hali, da yanayi. Korau: idan rubutu yana nuna mummunar tunani ko yanayi. Neutral: idan rubutu baya nuna kyakkyawar magana ko mara kyau kai tsaye ko a kaikaice.
### Input: 

In [11]:
balanced_dataset = Dataset.from_pandas(balanced_df.reset_index(drop=True))

In [12]:
model_name = "lelapa/InkubaLM-0.4B"
# turn off qlora: use_4bit=False 
model, tokenizer, bnb_config = setup_model_and_tokenizer(model_name, use_4bit=False)

config.json:   0%|          | 0.00/763 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/960 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/991k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.95M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

# Train with balanced dataset

In [13]:
balanced_trainer = setup_trainer(
    model=model, 
    dataset=balanced_dataset,
    tokenizer=tokenizer,
    output_dir="./sft_model/balanced"
)
balanced_trainer.train()

Applying formatting function to train dataset:   0%|          | 0/1400 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/1400 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1400 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1400 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1400 [00:00<?, ? examples/s]

Step,Training Loss
10,18.3465
20,18.6905
30,16.2721
40,17.9818
50,15.1368
60,13.0814
70,17.7133
80,15.4582
90,14.5252
100,9.629


TrainOutput(global_step=522, training_loss=5.922442252836921, metrics={'train_runtime': 1412.798, 'train_samples_per_second': 5.946, 'train_steps_per_second': 0.369, 'total_flos': 3286876516245504.0, 'train_loss': 5.922442252836921})

# Inference

In [15]:
ckpt_path = "sft_model/balanced/checkpoint-440/"
inference_model = AutoModelForCausalLM.from_pretrained(ckpt_path,
                                                        device_map="auto")
inference_model.eval()
results_df = apply_inference_to_test_data(inference_model, tokenizer, test_dataset)
results_df[['ID', 'Response']].to_csv('submission_full_finetune.csv', index=False)
results_df[['ID', 'generated','Response']].head()

Generating Responses:   0%|          | 0/900 [00:00<?, ?it/s]

Unnamed: 0,ID,generated,Response
0,ID_f3c74c7b_sentiment_test__hausa,Tsaka-tsaki Tsaka-tsaki Tsaka-tsaki Tsaka-tsaki,1
1,ID_aad19dbf_sentiment_test__hausa,Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa Kyakkyawa,0
2,ID_f6de0381_sentiment_test__hausa,Korau Korau Korau Korau Korau Korau Kora,2
3,ID_cbec84fe_sentiment_test__swahili,Wastani Wastani Wastani Wastani Wastani Wastan...,1
4,ID_885caf5c_sentiment_test__hausa,Korau Korau Korau Korau Korau Korau Kora,2
