###### Installing Packages

In [None]:
!pip install -qU huggingface_hub datasets

###### Loading the HF Dataset

In [None]:
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset

# Function to split a dataset into train and validation sets
def split_dataset(dataset_name, split_ratios=(0.95, 0.05), seed=None):
    """Load the 'train' split of a Hub dataset and divide it into train and validation sets.

    Args:
    - dataset_name: Dataset identifier passed to `load_dataset`.
    - split_ratios: Pair (train_fraction, validation_fraction). Only the validation
      fraction is forwarded to `train_test_split`; the train split receives the remaining rows.
    - seed: Optional seed forwarded to `train_test_split`. The default (None) keeps the
      previous behaviour of an unseeded shuffle.

    Returns:
    A DatasetDict with the keys 'train' and 'validation'.
    """
    dataset = load_dataset(dataset_name, split='train')

    # Unpacking still requires exactly two ratios, as before; the first one is informational only.
    _, valid_size = split_ratios

    train_valid = dataset.train_test_split(test_size=valid_size, seed=seed)

    # `train_test_split` names the held-out part 'test'; expose it as 'validation'.
    return DatasetDict({
        'train': train_valid['train'],
        'validation': train_valid['test']
    })

In [None]:
# Downloading Tiny Stories
tiny_stories_yoruba = split_dataset("ccibeekeoc42/TinyStories_yoruba")
tiny_stories_igbo = split_dataset("ccibeekeoc42/TinyStories_igbo")

# Downloading Dolly_HHRLHF
dolly_hhrlhf_yoruba = split_dataset("ccibeekeoc42/DollyHHRLHF_yoruba")
dolly_hhrlhf_igbo = split_dataset("ccibeekeoc42/DollyHHRLHF_igbo")

# Downloading English-Igbo Multilingual (capped at 2500 train rows and 250 validation rows)
english_to_igbo = load_dataset("ccibeekeoc42/english_to_igbo")
# min() keeps select() from raising if a split holds fewer rows than the cap.
english_to_igbo["train"] = english_to_igbo["train"].select(range(min(2500, english_to_igbo["train"].num_rows)))
english_to_igbo["validation"] = english_to_igbo["test"].select(range(min(250, english_to_igbo["test"].num_rows)))
# Separate container for the reversed direction. This is a shallow copy (the split
# objects are shared, not deep-copied); the splits of the copy are reassigned later in the notebook.
english_to_igbo_reverse = DatasetDict(english_to_igbo)

# Downloading guanaco-llama2 Multilingual
guanaco_llama2 = DatasetDict()
guanaco_llama2["train"] = load_dataset("mlabonne/guanaco-llama2-1k", split="train")  #.select(range(1500)) # mlabonne/guanaco-llama2
guanaco_llama2["validation"] = load_dataset("mlabonne/guanaco-llama2", split="test").select(range(100))

###### Prompt Formatting (Tiny Stories)

```
[INST] <<SYS>> You are an expert {tgt_lng} translator! <</SYS>> Translate the {src_lng} input text into {tgt_lng}. {input} [/INST] {response} </s>
```

In [None]:
def create_prompt(sample, mode="train", src_lng="English", tgt_lng="Igbo", use_ai=False):
    """
    Build one "[INST] <<SYS>> ... <</SYS>> ... [/INST] ..." translation prompt from a sample.

    Args:
    - sample: A dictionary containing the sample data. The source text is read from the
      key `src_lng.capitalize()`; the target text from `tgt_lng.capitalize()`, with an
      "_AI" suffix when `use_ai` is set. Note that `str.capitalize()` lower-cases every
      character after the first, so a name such as "English_AI" is looked up as "English_ai".
    - mode: The mode of operation. Only "train" appends the response and the end-of-sequence token.
    - src_lng: The source language (interpolated verbatim into the prompt text).
    - tgt_lng: The target language (interpolated verbatim into the prompt text).
    - use_ai: A flag indicating whether to use AI translations.

    Returns:
    A dictionary with a single key-value pair. The key is 'prompt_response' for human translations
    and 'prompt_response_ai' for AI translations.

    Raises:
    KeyError if the source column is missing from `sample`.
    """
    system_message = f"You are an expert {tgt_lng} translator!"

    # Instruction followed by the stripped source text.
    source_text = sample[src_lng.capitalize()].strip()
    input_text = f"Translate the {src_lng} input text into {tgt_lng}. " + source_text

    # Pick the human or the AI translation column.
    response_key = tgt_lng.capitalize() + ("_AI" if use_ai else "")
    # A missing column and a null (None) cell are both treated as an empty response;
    # previously a None value made `.strip()` raise AttributeError.
    response = (sample.get(response_key) or "").strip()

    # Define the end-of-sequence token.
    eos_token = " </s>"

    # Construct the full prompt.
    full_prompt = "[INST] <<SYS>> " + system_message + " <</SYS>> "
    full_prompt += input_text + " [/INST] "
    if mode == "train":
        full_prompt += response + eos_token

    # Return the constructed prompt with the appropriate key based on the use_ai flag.
    return_key = 'prompt_response_ai' if use_ai else 'prompt_response'
    return {return_key: full_prompt}


In [None]:
# Preview the formatted training prompt (human translation) for the first Yoruba TinyStories training sample
print(create_prompt(tiny_stories_yoruba['train'][0], mode="train", src_lng="English", tgt_lng="yoruba")['prompt_response'])

[INST] <<SYS>> You are an expert yoruba translator! <</SYS>> Translate the English input text into yoruba. Once upon a time, there was a little boy named Timmy. Timmy loved animals and wanted to go to the zoo. One day, his mother took him to the zoo. They saw lots of animals like lions, tigers, and monkeys. Timmy was very happy.
 
 As they were walking around, Timmy saw a sign that said "Do not feed the animals." Timmy's mom told him it was hard for the animals to eat food that wasn't good for them. Timmy promised to not feed the animals.
 
 As they were leaving the zoo, Timmy saw a little girl feeding the animals. He told her it was hard for the animals to eat that food. The little girl said she didn't know and promised to not do it again. Timmy felt happy that he helped the animals and arrived home with a big smile on his face. [/INST] Ní àkókò kán sẹyìn, ọmọdékùnrin kán wà tí á npè ní Tímì. Tímì nífẹ àwọn ẹrankó ó sí fẹ láti lọ sí ọgbà àwọn ẹrankó. Lọjọ kán, ìyá rẹ mú lọ sí ọgbà àwọ

In [None]:
# Mapping the prompts and adding new entries to the datasets:
# 'prompt_response' (human translation) and 'prompt_response_ai' (AI translation).
# The target language is spelled the same way for every split: create_prompt puts the
# name verbatim into the prompt text, so mixing 'igbo' and 'Igbo' would give the
# validation prompts a different wording than the training prompts.
for tiny_stories, target_language in ((tiny_stories_yoruba, 'Yoruba'), (tiny_stories_igbo, 'Igbo')):
    for use_ai in (False, True):
        for split in ('train', 'validation'):
            tiny_stories[split] = tiny_stories[split].map(
                create_prompt,
                fn_kwargs={'mode': 'train', 'src_lng': 'English', 'tgt_lng': target_language, 'use_ai': use_ai},
            )

Map:   0%|          | 0/1311 [00:00<?, ? examples/s]

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

Map:   0%|          | 0/1311 [00:00<?, ? examples/s]

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

Map:   0%|          | 0/2554 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

Map:   0%|          | 0/2554 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

In [None]:
# Combining the Yoruba and Igbo Datasets
def concatenate_prompt_responses(dataset_yoruba, dataset_igbo, split):
    """Stack the Yoruba and Igbo rows of one split after dropping their raw text columns.

    Note: this function name is redefined later in the notebook with a different signature.
    """
    sources = (
        (dataset_yoruba, ["English", "Yoruba_AI", "Yoruba"]),
        (dataset_igbo, ["English", "Igbo_AI", "Igbo"]),
    )
    # Yoruba rows first, then Igbo rows.
    return concatenate_datasets([source[split].remove_columns(raw_columns) for source, raw_columns in sources])

# Merge Yoruba and Igbo TinyStories per split
combined_train = concatenate_prompt_responses(tiny_stories_yoruba, tiny_stories_igbo, "train")
combined_validation = concatenate_prompt_responses(tiny_stories_yoruba, tiny_stories_igbo, "validation")

# Wrap both merged splits in a single DatasetDict
combined_tiny_stories_igbo_yoruba = DatasetDict(train=combined_train, validation=combined_validation)

In [None]:
def combine_columns_extend_dataset(dataset_dict):
    """Fold 'prompt_response_ai' into 'prompt_response' by appending the AI prompts as extra rows.

    For every split the result contains the original rows followed by one copy of each row
    whose 'prompt_response' is replaced by its 'prompt_response_ai'. Rows whose AI prompt is
    None are not copied, so the result never gains rows with an empty 'prompt_response'.
    The 'prompt_response_ai' column is removed from the result.

    Args:
    - dataset_dict: Mapping of split name to a dataset that has the columns
      'prompt_response' and 'prompt_response_ai'.

    Returns:
    A new DatasetDict with the same split names.
    """
    updated_dataset_dict = DatasetDict()

    for split, original_dataset in dataset_dict.items():
        # Only rows that really carry an AI prompt are turned into additional examples.
        ai_rows = original_dataset.filter(lambda example: example['prompt_response_ai'] is not None)
        ai_rows = ai_rows.map(lambda example: {'prompt_response': example['prompt_response_ai']})

        # Original rows first, AI-based rows after them.
        combined_dataset = concatenate_datasets([original_dataset, ai_rows])
        updated_dataset_dict[split] = combined_dataset.remove_columns(["prompt_response_ai"])
    return updated_dataset_dict

# Combine and extend the dataset: the AI prompts become additional 'prompt_response' rows.
# The input variable is overwritten with a result that no longer has 'prompt_response_ai',
# so this cell cannot be re-run on its own output.
combined_tiny_stories_igbo_yoruba = combine_columns_extend_dataset(combined_tiny_stories_igbo_yoruba)

Flattening the indices:   0%|          | 0/3865 [00:00<?, ? examples/s]

Map:   0%|          | 0/3865 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/204 [00:00<?, ? examples/s]

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

###### Prompt Formatting (English_to_Igbo HF)

In [None]:
# Prompt arguments for the two translation directions
forward_kwargs = {'mode': 'train', 'src_lng': 'English', 'tgt_lng': 'Igbo', 'use_ai':False}
reverse_kwargs = {'mode': 'train', 'src_lng': 'Igbo', 'tgt_lng': 'English', 'use_ai':False}

for split_name in ('train', 'validation'):
    # English -> Igbo: adds the 'prompt_response' column
    english_to_igbo[split_name] = english_to_igbo[split_name].map(create_prompt, fn_kwargs=forward_kwargs)
    # Reversing The Language Order: mapping the already formatted forward split
    # overwrites its 'prompt_response' with the Igbo -> English prompt
    english_to_igbo_reverse[split_name] = english_to_igbo[split_name].map(create_prompt, fn_kwargs=reverse_kwargs)

In [None]:
# Combining the Tiny Stories and HF Igbo Datasets
def concatenate_prompt_responses(hf_igbo, hf_igbo_reverse, tiny_stories, split):
    """Stack, for one split, the forward and reversed HF Igbo rows and the Tiny Stories rows.

    The "English" and "Igbo" columns are dropped from both HF Igbo datasets; the
    Tiny Stories split is appended unchanged. This definition replaces the earlier
    three-argument function of the same name.
    """
    raw_columns = ["English", "Igbo"]
    parts = [source[split].remove_columns(raw_columns) for source in (hf_igbo, hf_igbo_reverse)]
    parts.append(tiny_stories[split])
    return concatenate_datasets(parts)

# Merge HF Igbo (both directions) with the combined Tiny Stories data per split
combined_train = concatenate_prompt_responses(english_to_igbo, english_to_igbo_reverse, combined_tiny_stories_igbo_yoruba, "train")
combined_validation = concatenate_prompt_responses(english_to_igbo, english_to_igbo_reverse, combined_tiny_stories_igbo_yoruba, "validation")

# Updating the previous combined_tiny_stories_igbo_yoruba to now include this new dataset
combined_tiny_stories_igbo_yoruba = DatasetDict(train=combined_train, validation=combined_validation)

###### Prompt Formatting (BBC News)

In [None]:
import gspread
from google.colab import auth
from google.auth import default

# Helps Colab identify and authenticate the current user
auth.authenticate_user()
# Get the credentials used to access Google's services and APIs
creds, _ = default()
# gspread client authorized with those credentials; used below to open spreadsheets
gc = gspread.authorize(creds)

def get_list_from_sheet(workbook_name, sheet_name):
    """Read the first three columns of a worksheet into a Dataset, skipping incomplete rows.

    Column 1 is stored under 'Igbo', column 2 under 'English_AI' and column 3 under
    'English'. The first (header) cell of every column is ignored and a row is kept only
    if none of its three cells is blank after stripping. An empty Dataset with the same
    three columns is returned when no row qualifies.
    """
    worksheet = gc.open(workbook_name).worksheet(sheet_name)

    # One list per column, header cell removed
    columns = [worksheet.col_values(position)[1:] for position in (1, 2, 3)]

    # zip() pairs the cells row by row and stops at the shortest column
    complete_rows = [row for row in zip(*columns) if all(cell.strip() != '' for cell in row)]

    if not complete_rows:
        return Dataset.from_dict({'Igbo': [], 'English_AI': [], 'English': []})

    # Transpose the kept rows back into one tuple per column
    first_column, english_ai_column, english_column = zip(*complete_rows)
    return Dataset.from_dict({
        'Igbo': first_column,
        'English_AI': english_ai_column,
        'English': english_column,
    })

# Load both BBC News worksheets. get_list_from_sheet always names the first column 'Igbo',
# so the Yoruba sheet's first column is renamed to 'Yoruba' in a later cell.
bbc_news_igbo = get_list_from_sheet('BBC_News', 'Igbo')
bbc_news_yoruba = get_list_from_sheet('BBC_News', 'Yoruba')

In [None]:
# Renaming the columns for Yoruba
def rename_columns_in_dataset(dataset):
    """Return `dataset` with its "Igbo" column relabelled as "Yoruba"."""
    return dataset.rename_column("Igbo", "Yoruba")

# The Yoruba sheet was loaded with its first column named 'Igbo'; give it its real name
bbc_news_yoruba = rename_columns_in_dataset(bbc_news_yoruba)

In [None]:
def split_train_validation(dataset, train_ratio=0.95, seed=None):
    """Splits the dataset into train and validation sets based on the specified ratio.

    Args:
    - dataset: Object exposing `train_test_split(test_size=..., seed=...)`.
    - train_ratio: Fraction of rows kept for training; `1 - train_ratio` is held out.
    - seed: Optional seed forwarded to `train_test_split` for a reproducible split.
      The default (None) keeps the previous unseeded behaviour.

    Returns:
    A tuple (train_dataset, validation_dataset).
    """
    split_datasets = dataset.train_test_split(test_size=1 - train_ratio, seed=seed)

    # `train_test_split` names the held-out part 'test'; it is returned as the validation set.
    return split_datasets['train'], split_datasets['test']

# Each call returns a (train, validation) tuple
bbc_news_igbo_splits = split_train_validation(bbc_news_igbo)
bbc_news_yoruba_splits = split_train_validation(bbc_news_yoruba)

# Pair the tuple entries with their split names to build the DatasetDicts
split_names = ('train', 'validation')
bbc_news_igbo_dict = DatasetDict(zip(split_names, bbc_news_igbo_splits))
bbc_news_yoruba_dict = DatasetDict(zip(split_names, bbc_news_yoruba_splits))

In [None]:
# Separate DatasetDict containers for the reversed translation direction. They are built
# from the same split objects as the forward dicts (these are not deep copies), so both
# directions use identical train/validation rows.
bbc_news_igbo_dict_reverse = DatasetDict({'train': bbc_news_igbo_splits[0], 'validation': bbc_news_igbo_splits[1]})
bbc_news_yoruba_dict_reverse = DatasetDict({'train': bbc_news_yoruba_splits[0], 'validation': bbc_news_yoruba_splits[1]})

In [None]:
# Forward direction (Yoruba/Igbo -> English): every split gets 'prompt_response'
# (human English) and 'prompt_response_ai' (AI English).
for forward_dict, source_language in ((bbc_news_yoruba_dict, 'Yoruba'), (bbc_news_igbo_dict, 'Igbo')):
    for use_ai in (False, True):
        for split in ('train', 'validation'):
            forward_dict[split] = forward_dict[split].map(
                create_prompt,
                fn_kwargs={'mode': 'train', 'src_lng': source_language, 'tgt_lng': 'English', 'use_ai': use_ai},
            )

# Reversal (English -> Yoruba/Igbo): only the human 'English' column is used as the source,
# so the reverse dicts get 'prompt_response' but no 'prompt_response_ai' column.
for reverse_dict, target_language in ((bbc_news_yoruba_dict_reverse, 'Yoruba'), (bbc_news_igbo_dict_reverse, 'Igbo')):
    for split in ('train', 'validation'):
        reverse_dict[split] = reverse_dict[split].map(
            create_prompt,
            fn_kwargs={'mode': 'train', 'src_lng': 'English', 'tgt_lng': target_language, 'use_ai': False},
        )

Map:   0%|          | 0/706 [00:00<?, ? examples/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/706 [00:00<?, ? examples/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/661 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/661 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/706 [00:00<?, ? examples/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/661 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

In [None]:
# Spot-check one reversed (English -> Yoruba) training example
bbc_news_yoruba_dict_reverse['train'][6]

{'Yoruba': 'Àmọ́ ìdí tí wọ́n ṣe gbe Facebook kalẹ̀ kò yípadà: láti so àwọn èèyàn pọ̀ káàkiri àgbáyé àti láti pa owó látara ìpolówó ọjà.',
 'English_AI': 'But the reason they set up Facebook has not changed: to connect people around the world and to make money through advertising. ',
 'English': "But the reason why Facebook was launched didn't change: to connect people all over the world and to make money through advertisement.\n",
 'prompt_response': "[INST] <<SYS>> You are an expert Yoruba translator! <</SYS>> Translate the English input text into Yoruba. But the reason why Facebook was launched didn't change: to connect people all over the world and to make money through advertisement. [/INST] Àmọ́ ìdí tí wọ́n ṣe gbe Facebook kalẹ̀ kò yípadà: láti so àwọn èèyàn pọ̀ káàkiri àgbáyé àti láti pa owó látara ìpolówó ọjà. </s>"}

In [None]:
# Spot-check one forward (Igbo -> English) training example
bbc_news_igbo_dict['train'][6]

{'Igbo': "A churu Gasset oge mba Equatorial Gini gbara ha 3-0 bu okpu goolu e jiri machie ha nti n'ulo ha. Agbanyeghi na ha abanyela ogbo 16, a churu Gasset ebe Ivory Coast ga-ezutazi mba Senegal ozo.",
 'English_AI': 'Gasset was sent off when Equatorial Guinea beat them 3-0 with a hat-trick to beat them at home. Although they have entered the round of 16, Gasset was kicked out as Ivory Coast will now meet Senegal again.',
 'English': 'Gasset was sacked when Equatorial Guinea scored them 3-0 which was a slap on their face in their home. Despite not qualifying for round 16, Gasset was sacked while Ivory Coast will meet Senegal next.',
 'prompt_response': "[INST] <<SYS>> You are an expert English translator! <</SYS>> Translate the Igbo input text into English. A churu Gasset oge mba Equatorial Gini gbara ha 3-0 bu okpu goolu e jiri machie ha nti n'ulo ha. Agbanyeghi na ha abanyela ogbo 16, a churu Gasset ebe Ivory Coast ga-ezutazi mba Senegal ozo. [/INST] Gasset was sacked when Equatoria

In [None]:
# Format the same Igbo sample in the opposite direction (English -> Igbo); the result is only displayed, not stored
create_prompt(bbc_news_igbo_dict['train'][6], src_lng="English", tgt_lng="Igbo")

{'prompt_response': "[INST] <<SYS>> You are an expert Igbo translator! <</SYS>> Translate the English input text into Igbo. Gasset was sacked when Equatorial Guinea scored them 3-0 which was a slap on their face in their home. Despite not qualifying for round 16, Gasset was sacked while Ivory Coast will meet Senegal next. [/INST] A churu Gasset oge mba Equatorial Gini gbara ha 3-0 bu okpu goolu e jiri machie ha nti n'ulo ha. Agbanyeghi na ha abanyela ogbo 16, a churu Gasset ebe Ivory Coast ga-ezutazi mba Senegal ozo. </s>"}

In [None]:
# Combining the Yoruba and Igbo Datasets
def concatenate_prompt_responses(dataset_yoruba, dataset_igbo, dataset_yoruba_reverse, dataset_igbo_reverse, split):
    """Stack the forward and reversed BBC News rows of one split without their raw text columns.

    Row order in the result: Yoruba forward, Igbo forward, Yoruba reversed, Igbo reversed.
    This definition replaces the earlier functions of the same name.
    """
    yoruba_raw_columns = ["Yoruba", "English_AI",  "English"]
    igbo_raw_columns = ["Igbo", "English_AI",  "English"]
    sources = (
        (dataset_yoruba, yoruba_raw_columns),
        (dataset_igbo, igbo_raw_columns),
        (dataset_yoruba_reverse, yoruba_raw_columns),
        (dataset_igbo_reverse, igbo_raw_columns),
    )
    return concatenate_datasets([source[split].remove_columns(raw_columns) for source, raw_columns in sources])

# Merge forward and reversed BBC News prompts of both languages per split
bbc_news_sources = (bbc_news_yoruba_dict, bbc_news_igbo_dict, bbc_news_yoruba_dict_reverse, bbc_news_igbo_dict_reverse)
combined_train = concatenate_prompt_responses(*bbc_news_sources, "train")
combined_validation = concatenate_prompt_responses(*bbc_news_sources, "validation")

# Wrap both merged splits in a single DatasetDict
combined_bbc_news_igbo_yoruba = DatasetDict(train=combined_train, validation=combined_validation)

In [None]:
def combine_columns_extend_dataset(dataset_dict):
    """Fold 'prompt_response_ai' into 'prompt_response' by appending the AI prompts as extra rows.

    For every split the result contains the original rows followed by one copy of each row
    whose 'prompt_response' is replaced by its 'prompt_response_ai'. Rows whose AI prompt is
    None are not copied. The reversed BBC News datasets are mapped without an AI variant,
    so their rows have no 'prompt_response_ai' value once concatenated with the forward
    rows; copying them would add examples with an empty 'prompt_response'.
    The 'prompt_response_ai' column is removed from the result.

    Args:
    - dataset_dict: Mapping of split name to a dataset that has the columns
      'prompt_response' and 'prompt_response_ai'.

    Returns:
    A new DatasetDict with the same split names.
    """
    updated_dataset_dict = DatasetDict()

    for split, original_dataset in dataset_dict.items():
        # Only rows that really carry an AI prompt are turned into additional examples.
        ai_rows = original_dataset.filter(lambda example: example['prompt_response_ai'] is not None)
        ai_rows = ai_rows.map(lambda example: {'prompt_response': example['prompt_response_ai']})

        # Original rows first, AI-based rows after them.
        combined_dataset = concatenate_datasets([original_dataset, ai_rows])
        updated_dataset_dict[split] = combined_dataset.remove_columns(["prompt_response_ai"])
    return updated_dataset_dict

# Combine and extend the dataset: the AI prompts become additional 'prompt_response' rows.
# The input variable is overwritten with a result that no longer has 'prompt_response_ai',
# so this cell cannot be re-run on its own output.
combined_bbc_news_igbo_yoruba = combine_columns_extend_dataset(combined_bbc_news_igbo_yoruba)

Flattening the indices:   0%|          | 0/2734 [00:00<?, ? examples/s]

Map:   0%|          | 0/2734 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/146 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

In [None]:
# Show the splits, columns and row counts of the combined BBC News dataset
combined_bbc_news_igbo_yoruba

DatasetDict({
    train: Dataset({
        features: ['prompt_response'],
        num_rows: 5468
    })
    validation: Dataset({
        features: ['prompt_response'],
        num_rows: 292
    })
})

###### Prompt Formatting (guanaco_llama2)

In [None]:
# Renaming the column from "text" to "prompt_response" to match
def rename_columns_in_dataset(dataset):
    """Return `dataset` with its "text" column relabelled as "prompt_response".

    This definition replaces the earlier Igbo -> Yoruba helper of the same name.
    """
    source_name, target_name = "text", "prompt_response"
    return dataset.rename_column(source_name, target_name)

# Applied to the whole DatasetDict, i.e. to both the 'train' and the 'validation' split
guanaco_llama2 = rename_columns_in_dataset(guanaco_llama2)

In [None]:
from datasets import DatasetDict

def insert_text_in_dataset(dataset_dict, text_to_insert):
    """
    Modifies each entry in the dataset by inserting a specific text right after the first
    <s>[INST] tag and dropping that tag's leading <s>.

    Entries that do not contain the tag are returned unchanged. Previously such entries were
    silently corrupted, which also made running this function twice on the same data unsafe
    (the first run removes the <s> the tag search relies on).

    Args:
    - dataset_dict (DatasetDict): The DatasetDict containing the datasets. Its 'train' and
      'validation' entries are replaced in place.
    - text_to_insert (str): The text to insert after the <s>[INST] tag.

    Returns:
    - DatasetDict: The updated dataset with modifications applied.
    """
    tag = '<s>[INST]'
    bos = '<s>'

    def insert_text(entry):
        text = entry['prompt_response']
        tag_start = text.find(tag)
        if tag_start == -1:
            # No tag to anchor on: leave the entry as it is.
            return entry
        tag_end = tag_start + len(tag)
        # Keep whatever precedes the tag, emit the tag without its <s>, then the inserted text.
        entry['prompt_response'] = text[:tag_start] + tag[len(bos):] + text_to_insert + text[tag_end:]
        return entry

    # Apply the function to both the train and validation datasets
    for split in ('train', 'validation'):
        dataset_dict[split] = dataset_dict[split].map(insert_text, batched=False)

    return dataset_dict

# System block inserted into every guanaco_llama2 example ('train' and 'validation' splits)
DEFAULT_SYSTEM_MESSAGE = "".join([
    " <<SYS>> You are a helpful, respectful and honest multilingual assistant. Always answer as helpfully as possible. ",
    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. ",
    "Please ensure that your responses are socially unbiased and positive in nature. ",
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. ",
    "If you don't know the answer to a question, please don't share false information.",
    " <</SYS>> ",
])

# Insert the system block into the dataset
guanaco_llama2 = insert_text_in_dataset(guanaco_llama2, DEFAULT_SYSTEM_MESSAGE)

In [None]:
# Preview the first guanaco training example after the system message was inserted
print(guanaco_llama2['train'][0]['prompt_response'])

[INST] <<SYS>> You are a helpful, respectful and honest multilingual assistant. Always answer as helpfully as possible. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>  Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? [/INST] Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar 

###### Prompt Formatting (Dolly_HHRLHF)

```
<s>[INST] <<SYS>>Below is an instruction that describes a task. Write a response that appropriately completes the request.<</SYS>>

{input} [/INST] {response} </s>
```

In [None]:
def create_prompt_dolly(sample, mode="train", lng="English", use_ai=False):
    """
    Build a "[INST] <<SYS>> ... <</SYS>> ... [/INST] ..." prompt from one Dolly/HH-RLHF sample.

    The prompt column must contain at least two "###" markers. The text before the first
    marker becomes the system message. The text between the first and the last marker,
    without its first line, becomes the instruction (a literal "###" inside it is kept).
    Whatever follows the last marker is discarded.

    Args:
    - sample: A dictionary containing the sample data.
    - mode: The mode of operation. Only "train" appends the response and the end-of-sequence token.
    - lng: The language of the prompt and response. "Igbo" and "Yoruba" (any casing)
      select the matching columns; every other value selects the English columns.
    - use_ai: A flag indicating whether to use AI translations. The English columns have
      no AI variant, but the returned key still receives the "_ai" suffix.

    Returns:
    A dictionary with the single key 'prompt_response_<lng lower-cased>' (plus '_ai' when
    `use_ai` is set) holding the formatted prompt.

    Raises:
    ValueError if the prompt column contains fewer than two "###" markers.
    """
    # Determine columns based on language and translation type
    column_suffix = "_AI" if use_ai else ""
    language = lng.capitalize()
    if language in ("Igbo", "Yoruba"):
        prompt_column = f"Prompt_{language}{column_suffix}"
        response_column = f"Response_{language}{column_suffix}"
    else:
        # Assuming no AI version for English
        prompt_column, response_column = "Prompt_English", "Response_English"

    sections = sample[prompt_column].split("###")
    if len(sections) < 3:
        raise ValueError(
            f"Expected at least two '###' markers in column {prompt_column!r}, found {len(sections) - 1}"
        )
    system_message = sections[0]
    # Re-join the middle sections so a "###" inside the instruction no longer breaks the parsing.
    instruction_section = "###".join(sections[1:-1])
    # Drop the first line of the section (presumably the section header - confirm against the data).
    prompt = "\n".join(instruction_section.split("\n")[1:])
    response = sample[response_column].strip()

    # Define the end-of-sequence token.
    eos_token = " </s>"

    # Construct the full prompt
    full_prompt = f"[INST] <<SYS>> {system_message.strip()} <</SYS>> "
    full_prompt += f"{prompt.strip()} [/INST] "
    if mode == "train":
        full_prompt += f"{response}{eos_token}"

    # Adjust the return key based on whether AI translations are used
    key_suffix = "_ai" if use_ai else ""
    return {f'prompt_response_{lng.lower()}{key_suffix}': full_prompt}

In [None]:
# Mapping the prompts and adding new entries to the datasets:
# 'prompt_response_<language>' (human translation) and 'prompt_response_<language>_ai' (AI translation).
# The language name is spelled the same way for every split.
for dolly_dataset, language in ((dolly_hhrlhf_yoruba, 'Yoruba'), (dolly_hhrlhf_igbo, 'Igbo')):
    for use_ai in (False, True):
        for split in ('train', 'validation'):
            dolly_dataset[split] = dolly_dataset[split].map(
                create_prompt_dolly,
                fn_kwargs={'mode': 'train', 'lng': language, 'use_ai': use_ai},
            )

# Same for English (using the Igbo dataset as baseline because it has more English rows)
for split in ('train', 'validation'):
    dolly_hhrlhf_igbo[split] = dolly_hhrlhf_igbo[split].map(create_prompt_dolly, fn_kwargs={'mode': 'train', 'lng': 'English'})

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

In [None]:
def merge_responses(dataset_yoruba, dataset_igbo, split):
    """Stack the per-language prompt columns of one split into a single dataset.

    For the requested ``split``, five mapped copies are built, each copying one
    source column into a common ``prompt_response`` column:

    * from ``dataset_yoruba``: ``prompt_response_yoruba`` and ``prompt_response_yoruba_ai``
    * from ``dataset_igbo``: ``prompt_response_igbo``, ``prompt_response_igbo_ai``
      and ``prompt_response_english``

    Args:
        dataset_yoruba: mapping of split name -> dataset holding the Yoruba prompt columns.
        dataset_igbo: mapping of split name -> dataset holding the Igbo and English prompt columns.
        split: name of the split to merge (e.g. "train" or "validation").

    Returns:
        The concatenation of the five mapped copies, in the order listed above.
    """
    def with_prompt_response(source, source_column):
        # Copy `source_column` into 'prompt_response'; no columns are removed here.
        return source[split].map(lambda example: {'prompt_response': example[source_column]})

    pieces = [
        with_prompt_response(dataset_yoruba, 'prompt_response_yoruba'),
        with_prompt_response(dataset_yoruba, 'prompt_response_yoruba_ai'),
        with_prompt_response(dataset_igbo, 'prompt_response_igbo'),
        with_prompt_response(dataset_igbo, 'prompt_response_igbo_ai'),
        with_prompt_response(dataset_igbo, 'prompt_response_english'),
    ]
    return concatenate_datasets(pieces)

# Merge the Yoruba and Igbo Dolly-HHRLHF data split by split ('train' first, then 'validation')
combined_train, combined_validation = (
    merge_responses(dolly_hhrlhf_yoruba, dolly_hhrlhf_igbo, split_name)
    for split_name in ("train", "validation")
)

# Bundle the two merged splits into one DatasetDict
combined_dolly_hhrlhf_igbo_yoruba_english = DatasetDict(
    {"train": combined_train, "validation": combined_validation}
)

# Function to select only the 'prompt_response' feature
def select_prompt_response(example):
    """Return a new dict holding only the example's 'prompt_response' value."""
    prompt_response = example['prompt_response']
    return dict(prompt_response=prompt_response)

# Every column except 'prompt_response' is dropped; the list is taken from the 'train' split
columns_to_drop = [
    column_name
    for column_name in combined_dolly_hhrlhf_igbo_yoruba_english.column_names['train']
    if column_name != 'prompt_response'
]
combined_dolly_hhrlhf_igbo_yoruba_english = combined_dolly_hhrlhf_igbo_yoruba_english.map(
    select_prompt_response,
    remove_columns=columns_to_drop,
)

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Map:   0%|          | 0/2339 [00:00<?, ? examples/s]

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

###### Prompt Formatting (Combining Tiny Stories + Dolly_HHRLHF + BBC News + guanaco_llama2)

In [None]:
# Datasets to merge, in concatenation order
sources_to_merge = [
    combined_tiny_stories_igbo_yoruba,
    combined_dolly_hhrlhf_igbo_yoruba_english,
    combined_bbc_news_igbo_yoruba,
    guanaco_llama2,
]

# Stack the matching split of every source: 'train' first, then 'validation'
combined_train = concatenate_datasets([source['train'] for source in sources_to_merge])
combined_validation = concatenate_datasets([source['validation'] for source in sources_to_merge])

# Wrap both merged splits in a single DatasetDict
combined_all_dataset = DatasetDict({'train': combined_train, 'validation': combined_validation})

In [None]:
# Shuffle both splits ('train', then 'validation') with the same fixed seed
seed = 42
shuffled_train, shuffled_validation = (
    combined_all_dataset[split_name].shuffle(seed=seed)
    for split_name in ('train', 'validation')
)

# Collect the shuffled splits in a new DatasetDict; combined_all_dataset itself is not reassigned
combined_all_dataset_shuffled = DatasetDict(
    {'train': shuffled_train, 'validation': shuffled_validation}
)

###### Saving as JSONL files for Hugging Face Upload

In [None]:
# Export both shuffled splits as JSON Lines files in the current working directory
train_split = combined_all_dataset_shuffled['train']
validation_split = combined_all_dataset_shuffled['validation']

train_split.to_json('train.jsonl')
# Kept as the cell's last expression so its return value is still displayed
validation_split.to_json('validation.jsonl')

Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

1657564

In [None]:
import json
import os

def clean_and_overwrite_jsonl(input_file):
    """Rewrite a JSON Lines file in place, dropping records without a 'prompt_response'.

    Each non-blank line is parsed as a JSON object. A record is kept only when it
    has a 'prompt_response' key whose value is not None. Kept records are
    re-serialized one per line into ``input_file + '.tmp'``, which then replaces
    the original file.

    Args:
        input_file: path of the JSON Lines file to clean; it is overwritten.

    Raises:
        OSError: if the file cannot be opened, written, or replaced.
        json.JSONDecodeError: if a non-blank line is not valid JSON. The
            original file is left untouched and the temporary file is removed.
    """
    temp_file = input_file + '.tmp'  # Written next to the original so os.replace stays on one filesystem
    try:
        # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
        with open(input_file, 'r', encoding='utf-8') as infile, \
                open(temp_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                if not line.strip():
                    continue  # Blank lines are not records; json.loads would raise on them
                data = json.loads(line)
                # .get() treats a missing key like a null value instead of raising KeyError
                if data.get('prompt_response') is not None:
                    json.dump(data, outfile)
                    outfile.write('\n')
    except BaseException:
        # Do not leave a half-written temporary file behind; re-raise the original error.
        if os.path.exists(temp_file):
            os.remove(temp_file)
        raise

    # Replace the original file with the cleaned temporary file
    os.replace(temp_file, input_file)

# Clean both exported files in place: 'train.jsonl' first, then 'validation.jsonl'
for jsonl_path in ('train.jsonl', 'validation.jsonl'):
    clean_and_overwrite_jsonl(jsonl_path)

###### Testing Pulling Final Dataset From HF

In [None]:
# low_resource_dataset = load_dataset("ccibeekeoc42/low_resource_multilingual_sft")
# low_resource_dataset