# Launch SFT jobs in the Notebook.
<a target="_blank" href="https://colab.research.google.com/github/ai-hero/llm-research-orchestration/blob/main/notebooks/fine_tuning_research.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
!pip uninstall aihero-research-finetuning -y
!pip uninstall aihero-research-config -y
!pip install -q git+https://github.com/ai-hero/llm-research-fine-tuning.git@main#egg=aihero-research-finetuning
!pip install numpy==1.25.2 # Bug in collab - https://github.com/numpy/numpy/issues/25150

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [19]:
# Set all important env variables needed for the application to work

## NOTE: It's best practice don't set the here, set them in your secrets.

## wandb
%env WANDB_DISABLED=true
## OR
# %env WANDB_API_KEY=
# %env WANDB_USERNAME=
## NOTE: Enabling WANDB also starts reporting progress to slower -
##  we run progressive predictions for 100 test samples. And prediting
##  while training is very slow on a notebook

## huggingface
%env HF_TOKEN=

env: WANDB_DISABLED=true
env: HF_TOKEN=


## Preparing the dataset for Fine-Tuning.
In this example, we'll prepare some of the dataset for fine-tuning

In [3]:
from datasets import DatasetDict, load_dataset

In [4]:
dolly_dataset = load_dataset("databricks/databricks-dolly-15k")

Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
dolly_dataset["train"].to_pandas().head()

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification
2,Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa


Let's first build our prompt/completion dataset

In [6]:
def build_training_example(row):
    """Generate a training example from a row in the dataset in prompt+completion format."""
    prompt = f"## Instruction: {row.get('instruction')}\n"
    if row.get("context", ""):
        prompt = f"{prompt}## Context: {row['context']}\n"
    prompt = f"{prompt}## Response:"

    completion = row["response"]
    return {"prompt": prompt, "completion": completion}


extracted_dataset = dolly_dataset.map(build_training_example).remove_columns(
    ["instruction", "context", "response", "category"]
)
extracted_dataset["train"].to_pandas().head()

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Unnamed: 0,prompt,completion
0,## Instruction: When did Virgin Australia star...,Virgin Australia commenced services on 31 Augu...
1,## Instruction: Which is a species of fish? To...,Tope
2,## Instruction: Why can camels survive for lon...,Camels use the fat in their humps to keep them...
3,## Instruction: Alice's parents have three dau...,The name of the third daughter is Alice
4,## Instruction: When was Tomoaki Komorida born...,"Tomoaki Komorida was born on July 10,1981."


Next, let's split the data into train/val/test split

In [7]:
def build_dataset(
    dataset, train_split=0.8, val_split=0.1, test_split=0.1, train_size=None, val_size=None, test_size=None
):
    """Build a train, val and test dataset from a dataset."""
    train_testvalid = dataset.train_test_split(train_size=train_split)
    test_valid = train_testvalid["test"].train_test_split(test_size=test_split / (test_split + val_split))

    if train_size and len(train_testvalid["train"]) > train_size:
        train_testvalid["train"] = train_testvalid["train"].select(range(train_size))

    if val_size and len(test_valid["train"]) > val_size:
        test_valid["train"] = test_valid["train"].select(range(val_size))

    if test_size and len(test_valid["test"]) > train_size:
        test_valid["test"] = test_valid["test"].select(range(train_size))

    return DatasetDict(
        {
            "train": train_testvalid["train"],
            "val": test_valid["train"],
            "test": test_valid["test"],
        }
    )


train_size = 250
val_size = 50
test_size = 50
new_dataset = build_dataset(extracted_dataset["train"], train_size=train_size, val_size=val_size, test_size=test_size)

In [8]:
import os
import shutil
from pathlib import Path


def save_dataset(dataset_name, new_dataset):
    """Save a dataset to disk."""
    current_directory = Path(".")
    shutil.rmtree(current_directory / dataset_name, ignore_errors=True)
    os.mkdir(current_directory / dataset_name)
    dataset_path = (current_directory / dataset_name).as_posix()
    new_dataset.save_to_disk(dataset_path)
    return dataset_path


dataset_name = "dolly-15k"
dataset_path = save_dataset(dataset_name, new_dataset)

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

## Running the fine tuning

In [9]:
def build_config(dataset_name, dataset_path):
    """Build a config file for the dataset."""
    current_directory = Path(".")
    config_path = (current_directory / f"{dataset_name}.yaml").as_posix()
    config_yaml = f"""
project:
  name: "{dataset_name}"

task: "completion"

dataset:
  name: "{dataset_name}"
  type: "local"
  task: "completion"
  path: "{dataset_path}"

base:
  name: "meta-llama/Llama-2-7b-hf"
  type: "huggingface"

trainer:
  packing: false
  max_seq_length: 512

sft:
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  learning_rate: 0.0002
  lr_scheduler_type: "cosine"
  optim: "paged_adamw_8bit"
  warmup_ratio: 0.1
  max_steps: {train_size}
  gradient_accumulation_steps: 4
  gradient_checkpointing: True
  gradient_checkpointing_kwargs:
    use_reentrant: False
  logging_strategy: "steps"
  logging_steps: 5
  evaluation_strategy: "steps"
  eval_steps: 50
peft:
  r: 64  # the rank of the LoRA matrices
  lora_alpha: 16 # the weight
  lora_dropout: 0.1 # dropout to add to the LoRA layers
  bias: "none" # add bias to the nn.Linear layers?
  task_type: "CAUSAL_LM"
  target_modules:  # the name of the layers to add LoRA
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
    -  "lm_head"
quantized: true
"""
    with open(config_path, "w", encoding="utf-8") as f:
        f.write(config_yaml)

    return config_path


config_path = build_config(dataset_name, dataset_path)

In [10]:
# Load the training Job (validates schema)
from aihero.research.config.schema import TrainingJob

training_config = TrainingJob.load(config_path)

In [11]:
from aihero.research.finetuning.train import TrainingJobRunner

training_job_runner = TrainingJobRunner(training_config)
training_job_runner.run()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Loading model


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading dataset
Loading dataset locally:  ['val', 'dataset_dict.json', 'train', 'test']


Generating train split: 0 examples [00:00, ? examples/s]

250 rows in train split


Generating train split: 0 examples [00:00, ? examples/s]

50 rows in val split


Generating train split: 0 examples [00:00, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


250 rows in test split
Starting training
trainable params: 162,218,048 || all params: 6,900,641,856 || trainable%: 2.350767528370625


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Starting training


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,1.4853,1.426832
100,1.282,1.412787
150,1.0277,1.482571
200,1.1391,1.515042
250,0.866,1.547399


Saving model..


## Test out the code

In [18]:
import random

import torch

# Take one row from our test dataset
eval_prompt = random.choice(new_dataset["test"])["prompt"]

ft_model = training_job_runner.model
ft_tokenizer = training_job_runner.tokenizer

model_input = ft_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    print(ft_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

## Instruction: You and your friends are discussing the best tier ratings of common London-based football teams. You are trying to determine whether Chelsea F.C., Arsenal F.C., Tottenham Hotspur F.C., West Ham United F.C., Fulham F.C. and Crystal Palace F.C. are considered by fans to be the 'best', 'ok' and 'worst'.
## Response:The best team is Chelsea F.C. The ok team is Arsenal F.C. The worst team is Crystal Palace F.C.
