#### Install firectl on Linux

In [None]:
!wget -O firectl.gz https://storage.googleapis.com/fireworks-public/firectl/stable/linux-amd64.gz
!gunzip firectl.gz
!sudo install -o root -g root -m 0755 firectl /usr/local/bin/firectl

#### Download and process dataset

In [1]:
from datasets import load_dataset

In [43]:
# Load the `malhajar/alpaca-gpt4-ar` dataset; the result is indexed by split name below.
dataset = load_dataset("malhajar/alpaca-gpt4-ar")

In [44]:
# Keep only the rows whose English `input` field is present (not None).
dataset = dataset.filter(lambda row: row["input"] is not None)

In [45]:
# Number of rows in the train split after dropping rows with a None `input`.
len(dataset['train'])

20658

In [46]:
# Prompt that asks for an Arabic answer: a system turn, a user turn, then an open
# assistant header so the answer can follow directly. Placeholders: {instruction}
# (appended to the system message) and {input} (the user message).
# The backslash at the end of the system line is a line continuation inside the
# string, so {instruction} follows "Answer in Arabic only. " on the same line.
LLAMA_PROMPT_TEMPLATE_AR = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant. Answer in Arabic only. \
{instruction}<|eot_id|><|start_header_id|>user<|end_header_id|>

{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""


# Same layout as the Arabic template, without the "Answer in Arabic only." sentence
# in the system message.
LLAMA_PROMPT_TEMPLATE_EN = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant. \
{instruction}<|eot_id|><|start_header_id|>user<|end_header_id|>

{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

In [50]:
# Llama 3 end-of-turn marker. It closes a finished message, so it belongs after
# the assistant's answer and must NOT follow the open assistant header that ends
# each prompt template. Kept defined for any cell that needs to terminate a
# completion.
EOS_TOKEN = "<|eot_id|>"

def formatting_prompts_func(examples):
    """Build the three prompt variants for a batch of rows.

    Every variant uses the English instruction. They differ in which input
    column is used and in whether the system message requests an Arabic answer.

    Args:
        examples: batch mapping of column name -> list of values. The columns
            "instruction", "input" and "input-arabic" are read.

    Returns:
        dict of three lists of prompt strings, one entry per row:
        - "en_instruction_en_input_en_prompts": English input, English template.
        - "en_instruction_ar_input_ar_prompts": Arabic input, Arabic template.
        - "en_instruction_en_input_ar_prompts": English input, Arabic template.

    Raises:
        KeyError: if one of the three columns read is missing from the batch.
    """
    instructions = examples["instruction"]
    inputs       = examples["input"]
    inputs_ar    = examples["input-arabic"]

    def _render(template, batch_inputs):
        # The prompt ends with the open assistant header from the template.
        # No EOS token is appended: an end-of-turn marker here would close the
        # assistant turn before the answer that is concatenated after it.
        return [
            template.format(instruction=instruction_, input=input_)
            for instruction_, input_ in zip(instructions, batch_inputs)
        ]

    return {
        "en_instruction_en_input_en_prompts": _render(LLAMA_PROMPT_TEMPLATE_EN, inputs),
        "en_instruction_ar_input_ar_prompts": _render(LLAMA_PROMPT_TEMPLATE_AR, inputs_ar),
        "en_instruction_en_input_ar_prompts": _render(LLAMA_PROMPT_TEMPLATE_AR, inputs),
    }

In [51]:
# Add the three prompt columns to every row. With batched=True the function
# receives lists of column values rather than a single row.
dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/20658 [00:00<?, ? examples/s]

##### Check the samples with the largest input size — the `input` field plays a role similar to the context.

In [52]:
from tqdm import tqdm
import numpy as np

In [31]:
# len() of every train row's `input`, in dataset order (tqdm shows progress).
input_length = [len(sample['input']) for sample in tqdm(dataset['train'])]

100%|██████████| 20658/20658 [00:07<00:00, 2619.82it/s]


In [55]:
# 100th-largest input length: index -100 of the ascending sort.
sorted(input_length)[-100]

490

#### I'll keep roughly the top 5,000 samples by input length and test my luck

In [54]:
# Display the dataset summary (features and row count) to check the prompt columns exist.
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text', 'instruction-arabic', 'input-arabic', 'output-arabic', 'text-arabic', 'en_instruction_en_input_en_prompts', 'en_instruction_ar_input_ar_prompts', 'en_instruction_en_input_ar_prompts'],
        num_rows: 20658
    })
})

In [59]:
# Minimum len() of `input` for a row to be kept in the fine-tuning subset.
MIN_INPUT_LENGTH = 65
final_dataset = dataset.filter(lambda row: len(row["input"]) >= MIN_INPUT_LENGTH)

Filter:   0%|          | 0/20658 [00:00<?, ? examples/s]

In [61]:
# Number of train rows that passed the input-length filter.
len(final_dataset['train'])

5096

### Create jsonl file

In [64]:
import json

split_data = final_dataset["train"]

# (prompt column, answer column) for the three records written per source row,
# in the order they appear in the file.
RECORD_COLUMNS = (
    ("en_instruction_en_input_en_prompts", "output"),
    ("en_instruction_ar_input_ar_prompts", "output-arabic"),
    ("en_instruction_en_input_ar_prompts", "output-arabic"),
)

counter = 0
with open("llama3-8b-alpaca-ar.jsonl", "w") as jsonl_file:
    for row in split_data:
        # Build all three records before writing, so a row is emitted whole or not at all.
        records = [
            {"input": row[prompt_column], "output": row[answer_column]}
            for prompt_column, answer_column in RECORD_COLUMNS
        ]
        for record in records:
            # One JSON object per line.
            jsonl_file.write(json.dumps(record))
            jsonl_file.write("\n")
            counter += 1

print(f"{counter} lines converted")

15288 lines converted


#### Start fine-tuning job

In [None]:
# Launch the fine-tuning job through the firectl CLI.
# NOTE(review): --settings-file is given the JSONL dataset written by this notebook.
# Presumably firectl expects a job-settings file here, with the dataset referenced
# from it — confirm against the firectl documentation before running.
!firectl create fine-tuning-job --settings-file llama3-8b-alpaca-ar.jsonl --display-name "Llama3 8b Alpaca Ar Finetune"