### Instruction Dataset

In [None]:
import os 
from datasets import load_dataset
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import json 

# Load LIMA dataset
lima_dataset = load_dataset("GAIR/lima", cache_dir="~/storage/hf-datasets")
lima_dataset

In [None]:
# Export dataset to JSON for inspection

lima_dataset["train"].to_json("lima.json")
source_max_conversations = {}
sources_seen = set()
instructions = {}
for item in lima_dataset['train']:
    source = item['source']
    conv_length = len(item['conversations'])
    source_max_conversations[source] = max(source_max_conversations.get(source, 0), conv_length)
    
    if source not in sources_seen:
        instructions[source] = item
        sources_seen.add(source)

# Display the maximum number of conversation turns by source
print("Maximum number of conversation turns by source:")
for source, max_turns in sorted(source_max_conversations.items()):
    print(f"{source}: {max_turns}")
# Get one example from each unique source
print("\nExample item from each source:")
# Display one example from each source
for source, seq in instructions.items():
    print(f"\n--- {source} ---")
    print(json.dumps(seq, indent=2))

In [None]:
# Prepare instruction pairs
def process_lima_dataset_to_json(dataset):
  processed_data = []

  for item in dataset:
    messages = [{"role": "system", "content": "You are a helpful AI chatbot."}]
    conversation = item["conversations"]
    source = item["source"]
    
    for i, message in enumerate(conversation):
      if i % 2 == 0:
        messages.append({"role": "user", "content": message})
      else:
        messages.append({"role": "assistant", "content": message})
    processed_data.append({"messages": messages, "source": item["source"]})
  return processed_data

# Process training and test datasets
train_processed = process_lima_dataset_to_json(lima_dataset["train"])
test_processed = process_lima_dataset_to_json(lima_dataset["test"])

# Print an example to verify
print("Example processed conversation:")
print(json.dumps(train_processed[-1], indent=2))
print(f"Processed {len(train_processed)} training examples and {len(test_processed)} test examples")


In [None]:
from transformers import AutoTokenizer
from datasets import Dataset
import numpy as np

# Tokenize and prepare dataset for training
model_name = "meta-llama/Llama-3.1-8B"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Function to tokenize conversations
def tokenize_function(instructions):
    texts = []
    for seq in instructions:
        text = ""
        for message in seq["messages"]:
            if message["role"] == "system":
                text += f"<|system|>\n{message['content']}\n"
            elif message["role"] == "user":
                text += f"<|user|>\n{message['content']}\n"
            elif message["role"] == "assistant":
                text += f"<|assistant|>\n{message['content']}\n"
        texts.append(text + tokenizer.eos_token)
    tokenized_inputs = tokenizer(texts, truncation=True, max_length=2048)
    return tokenized_inputs

train_dataset = Dataset.from_list(train_processed)
test_dataset = Dataset.from_list(test_processed) if test_processed else None
# Split train into train and validation
train_val_split = train_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

# Create tokenized datasets
train_dataset = {}
train_dataset["train"] = train_dataset.map(
    lambda examples: tokenize_function([examples]),
    batched=False,
    remove_columns=train_dataset.column_names
)
train_dataset["validation"] = val_dataset.map(
    lambda examples: tokenize_function([examples]),
    batched=False,
    remove_columns=val_dataset.column_names
)

print(f"Training samples: {len(train_dataset['train'])}")
print(f"Validation samples: {len(train_dataset['validation'])}")

In [None]:
# Checkout the tokenized dataset correctness
print(f"\nVocabulary size: {tokenizer.vocab_size}")
print(f"Model max length: {tokenizer.model_max_length}\n")
print("\nTokenized dataset example:")

input_ids = train_dataset['train'][0]['input_ids']

# Decode the first tokenized sequence
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=False)
print("\nOriginal tokenized sequence length:", input_ids)
print("\nDecoded text:\n")
print(decoded_text)


### Huggingface Training (training script for single GPU)

In [1]:
from datasets import load_dataset

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Environment Setup
!export TOKENIZERS_PARALLELISM=false
!export CUDA_VISIBLE_DEVICES=0
torch.utils.checkpoint.use_reentrant = True

In [3]:
# Dataset helper functions
system_message = """You are a helpful AI assistant. Users will ask you questions and you will answer the questions."""

def create_conversation(sample):
  messages = [{"role": "system", "content": system_message.format(schema=sample["source"])}]
  conversation = sample["conversations"]
  for i, message in enumerate(conversation):
    if i % 2 == 0:
      messages.append({"role": "user", "content": message})
    else:
      messages.append({"role": "assistant", "content": message})
  return { "messages": messages }

# Customized chat template to pass to the tokenizer 
# However, We use the llama 3 instruct chat template
def format_messages(sample):
    text = ""
    for message in sample["messages"]:
        if message["role"] == "system":
            text += f"<|system|>{message['content']}\n"
        elif message["role"] == "user":
            text += f"<|user|>{message['content']}\n"
        elif message["role"] == "assistant":
            text += f"<|assistant|>{message['content']}\n"
    return {"text": text}

In [4]:
# Load training datasets
train_dataset = load_dataset("GAIR/lima", cache_dir="~/storage/hf-datasets", split="train")
train_dataset = train_dataset.map(create_conversation, batched=False)
train_dataset = train_dataset.remove_columns(["conversations", "source"])
# Dataset Sample
print("Sample Multi-turn Dataset Item:")
print("----------------------")
messages = train_dataset[-1]['messages']
print('\n\n'.join([f"{message['role']}:\n------\n{message['content']}" for message in messages]))

Sample Multi-turn Dataset Item:
----------------------
system:
------
You are a helpful AI assistant. Users will ask you questions and you will answer the questions.

user:
------
Why aren't python nested functions called closures?

assistant:
------
A closure occurs when a function has access to a local variable from an enclosing scope that has finished its execution. For example, 

```
def make_printer(msg):
    def printer():
        print(msg)
    return printer
printer = make_printer('Foo!')
printer()
```

When `make_printer` is called, a new frame is put on the stack with the compiled code for the `printer` function as a constant and the value of `msg` as a local. It then creates and returns the function. Because the function `printer` references the `msg` variable, it is kept alive after the `make_printer` function has returned.
So, if your nested functions don't

* access variables that are local to enclosing scopes,
* do so when they are executed outside of that scope,

then t

In [5]:
# Tokenizer configs and initialization
model_name = "meta-llama/Llama-3.1-8B"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [6]:
messages = train_dataset['messages']

# chat_temmplate is of LlaMa
formatted_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    bos_token="<|begin_of_text|>",
    chat_template="{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n            {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
    clean_up_tokenization_spaces=True,
    eos_token="<|eot_id|>",
)

train_dataset_tokenized = tokenizer(
    formatted_text,
    padding=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)

# Alternatively, the paddings and attention masks can be created using the collator 
# data_collator = DataCollatorWithPadding(
#     tokenizer=tokenizer
# )

In [7]:
# Create proper datasets from the tokenized data
dataset = Dataset.from_dict({
	"input_ids": train_dataset_tokenized['input_ids'].tolist(),
	"attention_mask": train_dataset_tokenized['attention_mask'].tolist(),
})

split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print("Tokenized Dataset Sample:\n----------------------")
print(tokenizer.decode(train_dataset[-1]["input_ids"]))
print(f"\n----------------------\n\nTraining set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")
max_input_seq_len = len(train_dataset[-1]["input_ids"])
print(f"\n----------------------\nFinalized max sequence length: {max_input_seq_len}\n")


Tokenized Dataset Sample:
----------------------
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful AI assistant. Users will ask you questions and you will answer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>

write the saddest story you possibly write about a jar of Jam, five playing cards and a gun<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Strawberry jam.
It was the smell that graced my summer afternoons, sweet and lovely and fresh. She’d greet me at the door with the scent in her hair. They were blissful, those days. The cupboards were always lined with mason jars and the happiness we shared seemed eternal.
Ace, jack, nine, two, king.
Spades, diamonds, hearts, clubs and spades again, respectively. The cards had fallen off the dresser. Somehow, these five survived the blaze. These five cards, instead of, say, our favorite blanket. Instead of her

In [8]:
# Quantization Configs
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=torch.float16
)

In [9]:
# PEFT
output_directory="./output/"
peft_model_id=output_directory+"model"
# Lora Configs
lora_config = LoraConfig(
    r = 8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
# Model Configs
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=bnb_config
)
model = get_peft_model(model, lora_config)

# load on single GPU
model.to("cuda:0")

Loading checkpoint shards: 100%|██████████| 4/4 [00:19<00:00,  4.78s/it]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.L

In [11]:
# Training Configs
sft_configs = SFTConfig(
    report_to="none",
    output_dir=output_directory+"checkpoints",
    logging_dir=output_directory+"logs",
    optim="paged_adamw_8bit",
    per_device_train_batch_size=1,
    lr_scheduler_type="cosine",
    logging_strategy="steps",
    logging_steps=250,
    dataset_text_field='text',
    eval_strategy="steps",
    eval_steps=5000,
    save_steps=5000,
    num_train_epochs=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    push_to_hub=False,
    max_seq_length=max_input_seq_len,
    packing=False,
    eval_packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False
    }
)

In [12]:
# Start Training
trainer = SFTTrainer(
    model=model,
    args=sft_configs,
    train_dataset=train_dataset,
    peft_config=lora_config,
    processing_class=tokenizer,
    eval_dataset=test_dataset
)

trainer.train()

trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

Truncating train dataset: 100%|██████████| 824/824 [00:00<00:00, 22797.99 examples/s]
Truncating eval dataset: 100%|██████████| 206/206 [00:00<00:00, 25106.84 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


sobami1:660245:660245 [0] NCCL INFO cudaDriverVersion 12060
sobami1:660245:660245 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f1
sobami1:660245:660245 [0] NCCL INFO Bootstrap : Using enp1s0f1:10.13.44.223<0>
sobami1:660245:660245 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
sobami1:660245:660245 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
sobami1:660245:660245 [0] NCCL INFO NET/Plugin: Using internal network plugin.
NCCL version 2.21.5+cuda12.4
sobami1:660245:660808 [0] NCCL INFO Failed to open libibverbs.so[.1]
sobami1:660245:660808 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f1
sobami1:660245:660808 [0] NCCL INFO NET/Socket : Using [0]enp1s0f1:10.13.44.223<0>
sobami1:660245:660808 [0] NCCL INFO Using non-device net plugin version 0
sobami1:660245:660808 [0] NCCL INFO Using network Socket
sobami1:660245:660808 [0] NCCL INFO nccl

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


sobami1:660245:660808 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so
sobami1:660245:660808 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin.
sobami1:660245:660808 [0] NCCL INFO ncclCommInitRank comm 0x6f8ed8c0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 62000 commId 0x9ae4329fbd6b01fa - Init COMPLETE


Step,Training Loss,Validation Loss


('./output/model/tokenizer_config.json',
 './output/model/special_tokens_map.json',
 './output/model/tokenizer.json')

## [TODO] Ray Tune - Distributed Instruction Tuning (For larger models)

In [9]:
# dependencies
import os
import ray
from ray import tune
from ray.air import session
from ray.tune.search.optuna import OptunaSearch
from transformers import (
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
import torch
from peft import LoraConfig, TaskType, get_peft_model

2025-04-23 21:22:05,878	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-04-23 21:22:05,955	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [None]:
# Compute setup
ray.init(address="ray://localhost:3002")

In [None]:
# Training function for Ray Tune
def train_model(config):
    # Initialize model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    
    # LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=config["lora_dropout"],
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    )
    
    # Apply LoRA adapters to the model
    model = get_peft_model(model, peft_config)
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results-lima-llama-8b-{session.get_trial_id()}",
        learning_rate=config["learning_rate"],
        per_device_train_batch_size=config["batch_size"],
        gradient_accumulation_steps=4,
        num_train_epochs=config["num_epochs"],
        weight_decay=config["weight_decay"],
        warmup_ratio=0.1,
        logging_dir="./logs",
        logging_steps=50,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        save_total_limit=2,
        fp16=True,
        report_to="tensorboard",
    )
    
    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset["train"],
        eval_dataset=train_dataset["validation"] if "validation" in train_dataset else None,
        data_collator=default_data_collator,
    )
    
    # Start training
    trainer.train()
    
    # Report metrics to Ray Tune
    eval_results = trainer.evaluate()
    session.report({
        "loss": eval_results["eval_loss"],
        "perplexity": torch.exp(torch.tensor(eval_results["eval_loss"])).item()
    })

# Set up Ray Tune search space
search_space = {
    "learning_rate": tune.loguniform(1e-5, 5e-4),
    "batch_size": tune.choice([4, 8]),
    "num_epochs": tune.choice([1, 2, 3]),
    "weight_decay": tune.loguniform(0.01, 0.1),
    "lora_r": tune.choice([8, 16, 32]),
    "lora_alpha": tune.choice([16, 32, 64]),
    "lora_dropout": tune.uniform(0.05, 0.2),
}

# Configure the search algorithm
search_algo = OptunaSearch()

# Launch hyperparameter tuning
tuner = tune.Tuner(
    train_model,
    tune_config=tune.TuneConfig(
        metric="perplexity",
        mode="min",
        search_alg=search_algo,
        num_samples=10,
    ),
    param_space=search_space,
)

# Run the hyperparameter search
results = tuner.fit()

# Get best trial
best_trial = results.get_best_trial("perplexity", "min", "last")
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final perplexity: {best_trial.last_result['perplexity']}")

# Save the best model information
best_model_path = os.path.join(f"./results-lima-llama-8b-{best_trial.trial_id}", "checkpoint-best")
print(f"Best model path: {best_model_path}")