In [109]:
import json
import torch
import transformers
from pathlib import Path
from datetime import datetime
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

import bitsandbytes as bnb

from trl import SFTTrainer

## Config


In [2]:
dataset_path = Path("../DATA/dataset_2024-Jan-30_23-29-10/")

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running device: {device}")

Running device: cuda:0


## Model


In [5]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
weights_dir = "./weights"


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    cache_dir=weights_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_eos_token=True,
    cache_dir=weights_dir,
    padding_side="left",  # Warning message indicated this
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Run inference on base model


In [6]:
prompt_template = """

    [INST]
    Translate user queries about industrial robotic operations into JSON outputs for specific function calls.
    {query}
    [/INST]


    
    """

In [7]:
def get_completion(
    prompt: str, model, tokenizer, device: torch.device, max_new_tokens: int = 1000
) -> str:
    encoded_prompt = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(
        device
    )
    print(f"Tokens shape: {encoded_prompt['input_ids'].shape}")

    generated_ids = model.generate(
        **encoded_prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    print(f"Generated tokens shape: {generated_ids.shape}")

    decoded = tokenizer.batch_decode(generated_ids)

    return decoded[0]

In [38]:
query = "Retrieve robot joint values"
expected_response = (
    "{'functions': [{'function_name': 'get_joint_values', 'inputs': []}]}"
)
expected_response = expected_response.replace("'", '"')
print(f"Example query: {query}")
print(f"Expected response: {json.loads(expected_response)}")

Example query: Retrieve robot joint values
Expected response: {'functions': [{'function_name': 'get_joint_values', 'inputs': []}]}


In [39]:
prompt = prompt_template.format(query=query)

result = get_completion(prompt=prompt, model=model, tokenizer=tokenizer, device=device)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Tokens shape: torch.Size([1, 47])
Generated tokens shape: torch.Size([1, 207])


In [40]:
print(result.strip())

<s> 

    [INST]
    Translate user queries about industrial robotic operations into JSON outputs for specific function calls.
    Retrieve robot joint values
    [/INST]


    
    </s> ":id": "1",
    :"functionName": "getRobotJointValues",
    :"parameters": {
        "--robotId--": "robot1"
     },
    :"outputFormat": "application/json"
    }


This JSON represents a request for retrieving the current joint values of a specific industrial robot named 'robot1'. This request assumes the existence of a function named 'getRobotJointValues' that can receive the robot identifier as a parameter and return the current joint values as JSON data. The "--robotId--" placeholder in the parameters section should be replaced by the actual robot ID. The 'outputFormat' field is used to specify that the response should be returned in JSON format.</s>


## Load the dataset


In [121]:
dataset = load_from_disk(dataset_path)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['data'],
        num_rows: 1064
    })
    test: Dataset({
        features: ['data'],
        num_rows: 119
    })
})


In [122]:
dataset["train"][1]["data"]["function_calling"]

"{'functions': [{'function_name': 'get_joint_values', 'inputs': []}]}"

In [123]:
def generate_prompt(data_point) -> str:
    prefix_text = "Below is a user query about industrial robotic operations, translate it into JSON outputs for specific function calls.\n\n"
    text = f"""[INST]{prefix_text} {data_point['data']['user_query']} [/INST]{data_point['data']['function_calling']}"""

    return text

### Train data


In [124]:
text_column = [generate_prompt(data_point) for data_point in dataset["train"]]
dataset["train"] = dataset["train"].add_column("prompts", text_column)
print(f"Dataset structure: {dataset.column_names}")

Dataset structure: {'train': ['data', 'prompts'], 'test': ['data']}


In [125]:
train_dataset = dataset["train"].shuffle(seed=1234)
train_dataset = train_dataset.map(
    lambda samples: tokenizer(samples["prompts"]), batched=True
)

Map:   0%|          | 0/1064 [00:00<?, ? examples/s]

In [126]:
print(f"Train Dataset structure: \n{train_dataset}")

Train Dataset structure: 
Dataset({
    features: ['data', 'prompts', 'input_ids', 'attention_mask'],
    num_rows: 1064
})


In [127]:
tokenized_train_set = train_dataset.remove_columns(column_names=["data"])

### Validation data


In [128]:
text_column = [generate_prompt(data_point) for data_point in dataset["test"]]
dataset["test"] = dataset["test"].add_column("prompts", text_column)
print(f"Dataset structure: {dataset.column_names}")

Dataset structure: {'train': ['data', 'prompts'], 'test': ['data', 'prompts']}


In [129]:
test_dataset = dataset["test"].shuffle(seed=1234)
test_dataset = test_dataset.map(
    lambda samples: tokenizer(samples["prompts"]), batched=True
)

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

In [130]:
print(f"Test Dataset structure: \n{test_dataset}")

Test Dataset structure: 
Dataset({
    features: ['data', 'prompts', 'input_ids', 'attention_mask'],
    num_rows: 119
})


~~_**Note**: SFTTrainer class when passing datasets will only look for `input_ids` and `attention_mask`._~~


In [131]:
tokenized_test_set = test_dataset.remove_columns(column_names=["data"])

## Apply Lora


In [87]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
print(f"Model architecture: \n{model}")

Model architecture: 
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNor

In [89]:
type(model)

transformers.models.mistral.modeling_mistral.MistralForCausalLM

These are the layers that we are going to use for fine-tuning. <br>
According to QLoRA paper: <br>
_"We find that the most critical LoRA hyperparameter is how many LoRA adapters are used in total and that LoRA on all linear transformer block layers is required to match full finetuning performance."_


In [91]:
def find_all_linear_names(model) -> list:
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if "lm_head" in lora_module_names:  # Used for 16-bit
            lora_module_names.remove("lm_head")
    return list(lora_module_names)

In [92]:
modules = find_all_linear_names(model)
print(f"Number of found modules: {len(modules)}")
print(f"Module names: {modules}")

Number of found modules: 7
Module names: ['down_proj', 'gate_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'q_proj']


In [93]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,  # 16
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CASUAL_LM",
)

model = get_peft_model(model=model, peft_config=lora_config)

In [100]:
trainable, total = model.get_nb_trainable_parameters()
print(
    f"Trainable: {trainable:,} | Total: {total:,} | Percentage: {(trainable/total):.3%}"
)

Trainable: 20,971,520 | Total: 7,262,703,616 | Percentage: 0.289%


## Run the training

Fine-Tuning with QLoRA and Supervised Fine-Tuning


In [110]:
current_time = datetime.now().strftime("%Y-%b-%d_%H-%M-%S")
output_dir = f"./experiments/{current_time}"

In [142]:
tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_train_set,
    eval_dataset=tokenized_test_set,
    dataset_text_field="prompts",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        num_train_epochs=1,
        # max_steps=100 # For testing
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_ratio=0.03,
        learning_rate=3e-4,
        logging_steps=1,
        output_dir=output_dir,
        optim="paged_adamw_8bit",
        save_strategy="epoch",
        remove_unused_columns=False,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    ),
)



Map:   0%|          | 0/119 [00:00<?, ? examples/s]

### Start the training


In [143]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`prompts` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [148]:
tokenized_test_set["prompts"]

["[INST]Below is a user query about industrial robotic operations, translate it into JSON outputs for specific function calls.\n\n Retrieve robot joints status [/INST]{'functions': [{'function_name': 'get_joint_values', 'inputs': []}]}",
 '[INST]Below is a user query about industrial robotic operations, translate it into JSON outputs for specific function calls.\n\n Please capitalize where necessary: aldegrever doesn\'t decidedly copy holbein, but it is apparent that his dance of death is very much inspired by holbein. [/INST]{{"functions": [{{"function_name": "", "inputs": []}}]}}',
 "[INST]Below is a user query about industrial robotic operations, translate it into JSON outputs for specific function calls.\n\n Move the robot TCP to position (500, 300, 700) micrometers [/INST]{'functions': [{'function_name': 'move_tcp', 'inputs': [{'name': 'x', 'value': 500.0, 'unit': 'micrometers'}, {'name': 'y', 'value': 300.0, 'unit': 'micrometers'}, {'name': 'z', 'value': 700.0, 'unit': 'micromete