# Fine Tuning a Base LLM with Instructional Tuning

In [2]:
import itertools
import jsonlines

import creds
import pandas as pd
import datasets

from pprint import pprint
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [4]:
dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")
pd.DataFrame(dataset)

Unnamed: 0,instruction,input,output
0,Create a function that takes a specific input ...,,"def f(x):\n """"""\n Takes a specific input..."
1,Generate a unique 8 character string that cont...,,import string\nimport random\n\ndef random_pas...
2,"Given a course consisting of 5 lessons, write ...","Lessons = [""Python Basics"", ""Javascript Basics...",def display_first_lesson(lessons):\n print(...
3,Create an algorithm to encourage work balance ...,,One algorithm to encourage work balance and pr...
4,Write a JavaScript that changes the text of a ...,,"document.getElementById(""myParagraph"").innerHT..."
...,...,...,...
20017,Write a script in Python to remove the charact...,"string=""!!He##ll%o W..or##ld!!""","new_string = """"\nfor char in string:\n if c..."
20018,Write a code in Python to count the number of ...,,# Count the number of words in the sentence\ns...
20019,Implement a method in JavaScript that checks i...,"String = ""racecar""",function checkPalindrome(str){\n let revStr...
20020,Write a JavaScript code snippet to test if a g...,,if (Array.isArray(variable)) {\n console.log(...


In [5]:
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

In [6]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

In [8]:
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [9]:
trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)


  return torch._C._cuda_getDeviceCount() > 0
Map: 100%|██████████| 20022/20022 [00:00<00:00, 21663.52 examples/s]


In [10]:
trainer.train()

Step,Training Loss


: 

## 1. Data Preparation

In [2]:
## load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('google/gemma-7b',token = creds.HUGGINGFACE_TOKEN)

In [3]:
## sample of Tokenizer 
text = 'Hi, how are you?'
encoded_text = tokenizer(text)
print(encoded_text)

decoded_text = tokenizer.decode(encoded_text['input_ids'])
print(decoded_text)

{'input_ids': [2, 2151, 235269, 1368, 708, 692, 235336], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
<bos>Hi, how are you?


In [4]:
list_texts = ["Hi, how are you?", "I'm good", "Yes"]
encoded_texts = tokenizer(list_texts)
print("Encoded several texts: ", encoded_texts["input_ids"])

Encoded several texts:  [[2, 2151, 235269, 1368, 708, 692, 235336], [2, 235285, 235303, 235262, 1426], [2, 3553]]


In [5]:
tokenizer.pad_token = tokenizer.eos_token 
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])

Using padding:  [[2, 2151, 235269, 1368, 708, 692, 235336], [1, 1, 2, 235285, 235303, 235262, 1426], [1, 1, 1, 1, 1, 2, 3553]]


In [6]:
## load the model
foundational_model = AutoModelForCausalLM.from_pretrained('google/gemma-7b', token = creds.HUGGINGFACE_TOKEN)

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.44s/it]


In [7]:
## load the dataset for instructional tuning
instruction_tuned_dataset = load_dataset('kotzeje/lamini_docs.jsonl',
                                         token=creds.HUGGINGFACE_TOKEN,
                                         split='train',
                                         streaming=True)


In [8]:
m = 5
print("Instruction-tuned dataset:")
top_m = list(itertools.islice(instruction_tuned_dataset, m))
for j in top_m:
  print(j)

Instruction-tuned dataset:
{'question': 'How can I evaluate the performance and quality of the generated text from Lamini models?', 'answer': "There are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance."}
{'question': "Can I find information about the code's approach to handling long-running tasks and background jobs?", 'answer': 'Yes, the code includes methods for submitting jobs, checking job status, and retrieving job results. It also includes a met

In [9]:
examples = pd.DataFrame(instruction_tuned_dataset).to_dict()

In [10]:
# examples = instruction_tuned_dataset.to_dict()

if "question" in examples and "answer" in examples:
  text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
  text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
  text = examples["input"][0] + examples["output"][0]
else:
  text = examples["text"][0]

prompt_template = """### Question:
{question}

### Answer:"""

num_examples = len(examples["question"])
finetuning_dataset = []
for i in range(num_examples):
  question = examples["question"][i]
  answer = examples["answer"][i]
  text_with_prompt_template = prompt_template.format(question=question)
  finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})

from pprint import pprint
print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

One datapoint in the finetuning dataset:
{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'question': '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '\n'
             '### Answer:'}


In [11]:
text = finetuning_dataset[1]["question"] + finetuning_dataset[1]["answer"]
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    padding=True
)
print(tokenized_inputs["input_ids"])

[[     2   6176  10825 235292    108   3611    590   1717   2113   1105
     573   3409 235303 235256   5688    577  13404   1497 235290  23655
   13333    578   3824  11088 235336    109   6176  10358 235292   3553
  235269    573   3409   6566   5358    604  51945  11088 235269  18604
    3356   4938 235269    578 115227   3356   3190 235265   1165   1170
    6566    476   2370    604 192927  11088 235265  32788 235269   1104
     603    476   2370    604  21100   6733  30854    774    476   2091
  235269    948   1538    614   7275    604   1497 235290  23655  13333
  235265]]


In [12]:
max_length = 2048
max_length = min(
    tokenized_inputs["input_ids"].shape[1],
    max_length,
)

In [13]:
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=max_length
)

In [14]:
tokenized_inputs["input_ids"]

array([[     2,   6176,  10825, 235292,    108,   3611,    590,   1717,
          2113,   1105,    573,   3409, 235303, 235256,   5688,    577,
         13404,   1497, 235290,  23655,  13333,    578,   3824,  11088,
        235336,    109,   6176,  10358, 235292,   3553, 235269,    573,
          3409,   6566,   5358,    604,  51945,  11088, 235269,  18604,
          3356,   4938, 235269,    578, 115227,   3356,   3190, 235265,
          1165,   1170,   6566,    476,   2370,    604, 192927,  11088,
        235265,  32788, 235269,   1104,    603,    476,   2370,    604,
         21100,   6733,  30854,    774,    476,   2091, 235269,    948,
          1538,    614,   7275,    604,   1497, 235290,  23655,  13333,
        235265]])

In [15]:
def tokenize_function(examples):
    if "question" in examples and "answer" in examples:
      text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
      text = examples["input"][0] + examples["output"][0]
    else:
      text = examples["text"][0]

    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [16]:
finetuning_dataset_loaded = datasets.load_dataset('kotzeje/lamini_docs.jsonl', split="train")

tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 1400
})


In [17]:
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [18]:
pd.DataFrame(tokenized_dataset)

Unnamed: 0,question,answer,input_ids,attention_mask,labels
0,How can I evaluate the performance and quality...,There are several metrics that can be used to ...,"[2, 2299, 798, 590, 18739, 573, 4665, 578, 361...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2299, 798, 590, 18739, 573, 4665, 578, 361..."
1,Can I find information about the code's approa...,"Yes, the code includes methods for submitting ...","[2, 3611, 590, 1717, 2113, 1105, 573, 3409, 23...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 3611, 590, 1717, 2113, 1105, 573, 3409, 23..."
2,How does Lamini AI handle requests for generat...,Lamini AI offers features for generating text ...,"[2, 2299, 1721, 15583, 1904, 16481, 6589, 1255...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2299, 1721, 15583, 1904, 16481, 6589, 1255..."
3,Does the `submit_job()` function expose any ad...,It is unclear which `submit_job()` function is...,"[2, 11227, 573, 4103, 12480, 235298, 9860, 114...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 11227, 573, 4103, 12480, 235298, 9860, 114..."
4,Does the `add_data()` function support differe...,"No, the `add_data()` function does not support...","[2, 11227, 573, 4103, 1254, 235298, 1259, 1149...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 11227, 573, 4103, 1254, 235298, 1259, 1149..."
...,...,...,...,...,...
1395,Does Lamini have the ability to understand and...,"Yes, Lamini has the ability to understand and ...","[2, 11227, 15583, 1904, 791, 573, 7374, 577, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 11227, 15583, 1904, 791, 573, 7374, 577, 3..."
1396,Can I fine-tune the pre-trained models provide...,"Yes, you can fine-tune the pre-trained models ...","[2, 3611, 590, 4948, 235290, 19052, 573, 953, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 3611, 590, 4948, 235290, 19052, 573, 953, ..."
1397,Can Lamini generate text that is suitable for ...,"Yes, Lamini can generate text that is suitable...","[2, 3611, 15583, 1904, 11941, 2793, 674, 603, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 3611, 15583, 1904, 11941, 2793, 674, 603, ..."
1398,Does the documentation have a secret code that...,I wish! This documentation only talks about La...,"[2, 11227, 573, 18312, 791, 476, 7401, 3409, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 11227, 573, 18312, 791, 476, 7401, 3409, 6..."


In [19]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


In [20]:
split_dataset['train']

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})

In [21]:
input_text = "What is the capital of France"
input_ids = tokenizer(input_text, return_tensors="pt")

In [22]:
input_ids

{'input_ids': tensor([[   2, 1841,  603,  573, 6037,  576, 6081]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [23]:
outputs = foundational_model.generate(**input_ids)



In [24]:
print(tokenizer.decode(outputs[0]))



<bos>What is the capital of France?

What is the capital of Germany?

What is the


In [21]:
from trl import SFTTrainer

In [23]:
def formatting_prompts_func(example):
    text = f"### Question: {example['question']}\n ### Answer: {example['answer']}"
    return text

In [24]:
pd.DataFrame(tokenized_dataset)

Unnamed: 0,question,answer,input_ids,attention_mask,labels
0,How can I evaluate the performance and quality...,There are several metrics that can be used to ...,"[2, 2299, 798, 590, 18739, 573, 4665, 578, 361...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2299, 798, 590, 18739, 573, 4665, 578, 361..."
1,Can I find information about the code's approa...,"Yes, the code includes methods for submitting ...","[2, 3611, 590, 1717, 2113, 1105, 573, 3409, 23...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 3611, 590, 1717, 2113, 1105, 573, 3409, 23..."
2,How does Lamini AI handle requests for generat...,Lamini AI offers features for generating text ...,"[2, 2299, 1721, 15583, 1904, 16481, 6589, 1255...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2299, 1721, 15583, 1904, 16481, 6589, 1255..."
3,Does the `submit_job()` function expose any ad...,It is unclear which `submit_job()` function is...,"[2, 11227, 573, 4103, 12480, 235298, 9860, 114...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 11227, 573, 4103, 12480, 235298, 9860, 114..."
4,Does the `add_data()` function support differe...,"No, the `add_data()` function does not support...","[2, 11227, 573, 4103, 1254, 235298, 1259, 1149...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 11227, 573, 4103, 1254, 235298, 1259, 1149..."
...,...,...,...,...,...
1395,Does Lamini have the ability to understand and...,"Yes, Lamini has the ability to understand and ...","[2, 11227, 15583, 1904, 791, 573, 7374, 577, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 11227, 15583, 1904, 791, 573, 7374, 577, 3..."
1396,Can I fine-tune the pre-trained models provide...,"Yes, you can fine-tune the pre-trained models ...","[2, 3611, 590, 4948, 235290, 19052, 573, 953, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 3611, 590, 4948, 235290, 19052, 573, 953, ..."
1397,Can Lamini generate text that is suitable for ...,"Yes, Lamini can generate text that is suitable...","[2, 3611, 15583, 1904, 11941, 2793, 674, 603, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 3611, 15583, 1904, 11941, 2793, 674, 603, ..."
1398,Does the documentation have a secret code that...,I wish! This documentation only talks about La...,"[2, 11227, 573, 18312, 791, 476, 7401, 3409, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 11227, 573, 18312, 791, 476, 7401, 3409, 6..."


In [25]:
trainer = SFTTrainer(
    'facebook/opt-350m',
    train_dataset = tokenized_dataset,
    formatting_func=formatting_prompts_func,
    packing = True
)

Generating train split: 118 examples [00:00, 652.76 examples/s]


In [27]:
split_dataset['train']

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})

In [26]:
trainer.train()

: 

In [29]:
# load jsonl dataset
# dataset = load_dataset("json", data_files="path/to/dataset.jsonl", split="train")
# load dataset from the HuggingFace Hub
dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train")

Downloading readme: 100%|██████████| 523/523 [00:00<00:00, 1.96MB/s]
Downloading data: 100%|██████████| 7.24M/7.24M [00:00<00:00, 8.90MB/s]
Generating train split: 100%|██████████| 15011/15011 [00:00<00:00, 291600.81 examples/s]


In [37]:
pd.DataFrame(dataset)['messages'][0]

[{'content': "When did Virgin Australia start operating?\nVirgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.",
  'role': 'user'},
 {'content': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.',
  'role': 'assistant'}]

In [39]:
trainer = SFTTrainer(
    "facebook/opt-350m",
    # args=training_args,
    train_dataset=dataset,
    packing=True,
)

  return torch._C._cuda_getDeviceCount() > 0
Generating train split: 0 examples [00:00, ? examples/s]
No chat template is defined for this tokenizer - using the default template for the GPT2TokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

Generating train split: 2590 examples [00:02, 1287.39 examples/s]


In [40]:
trainer.train()

KeyboardInterrupt: 