In [44]:
import torch
import transformers
from pathlib import Path
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

## Config


In [45]:
dataset_path = Path("../DATA/dataset_2024-Jan-30_23-29-10/")

In [23]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [24]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running device: {device}")

Running device: cuda:0


## Model


In [38]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
weights_dir = "./weights"


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    cache_dir=weights_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_eos_token=True,
    cache_dir=weights_dir,
    padding_side="left",  # Warning message indicated this
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Run inference on base model


In [39]:
prompt_template = """

    [INST]
    Translate user queries about industrial robotic operations into JSON outputs for specific function calls.
    {query}
    [/INST]


    
    """

In [40]:
def get_completion(
    prompt: str, model, tokenizer, device: torch.device, max_new_tokens: int = 1000
) -> str:
    encoded_prompt = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(
        device
    )
    print(f"Tokens shape: {encoded_prompt['input_ids'].shape}")

    generated_ids = model.generate(
        **encoded_prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    print(f"Generated tokens shape: {generated_ids.shape}")

    decoded = tokenizer.batch_decode(generated_ids)

    return decoded[0]

In [41]:
query = "Obtain the status of the robot's third joint"
prompt = prompt_template.format(query=query)

result = get_completion(prompt=prompt, model=model, tokenizer=tokenizer, device=device)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Tokens shape: torch.Size([1, 53])
Generated tokens shape: torch.Size([1, 90])


In [42]:
print(result.strip())

<s> 

    [INST]
    Translate user queries about industrial robotic operations into JSON outputs for specific function calls.
    Obtain the status of the robot's third joint
    [/INST]


    
    </s>{
      "function": "getRobotJointStatus",
      "robotID": "1",
      "jointIndex": "3"
    }</s>


## Load the dataset


In [61]:
dataset = load_from_disk(dataset_path)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['data'],
        num_rows: 1064
    })
    test: Dataset({
        features: ['data'],
        num_rows: 119
    })
})


In [49]:
dataset["train"][0]

{'data': {'function_calling': '{{"functions": [{{"function_name": "", "inputs": []}}]}}',
  'user_query': 'Here are some keywords about a restaurant:\n\nname = The Cambridge Blue, eatType = pub, food = French, priceRange = £20-25, near = Café Brazil. Write a sentence that describes the following attributes of a restaurant.'}}

In [53]:
def generate_prompt(data_point) -> str:
    prefix_text = "Below is a user query about industrial robotic operations, translate it into JSON outputs for specific function calls.\n\n"
    text = f"""[INST]{prefix_text} {data_point['data']['user_query']} [/INST]{data_point['data']['function_calling']}"""

    return text

In [None]:
train_dataset = dataset["train"]
text_column = 

In [59]:
dataset["train"]["data"]

[{'function_calling': '{{"functions": [{{"function_name": "", "inputs": []}}]}}',
  'user_query': 'Here are some keywords about a restaurant:\n\nname = The Cambridge Blue, eatType = pub, food = French, priceRange = £20-25, near = Café Brazil. Write a sentence that describes the following attributes of a restaurant.'},
 {'function_calling': "{'functions': [{'function_name': 'get_joint_values', 'inputs': []}]}",
  'user_query': 'Retrieve robot joint values'},
 {'function_calling': '{{"functions": [{{"function_name": "", "inputs": []}}]}}',
  'user_query': 'Test sentence: "You and I are as much alike as a horse and a cow."\nIs this test sentence a correct grammatical English sentence?'},
 {'function_calling': "{'functions': [{'function_name': 'move_tcp', 'inputs': [{'name': 'x', 'value': 1000.0, 'unit': 'mm'}]}]}",
  'user_query': 'Move robot TCP along the x-axis for 1000 millimeters'},
 {'function_calling': '{{"functions": [{{"function_name": "", "inputs": []}}]}}',
  'user_query': 'NATO

In [55]:
train_dataset = dataset["train"].shuffle(seed=1234)
train_dataset = train_dataset.map(lambda samples: print(samples))

Map:   0%|          | 0/1064 [00:00<?, ? examples/s]

{'data': {'function_calling': "{'functions': [{'function_name': 'move_tcp', 'inputs': [{'name': 'x', 'value': 1000.0, 'unit': 'mm'}]}]}", 'user_query': 'Move the TCP along the X-axis for 1000 millimeters'}}
{'data': {'function_calling': "{'functions': [{'function_name': 'move_joint', 'inputs': [{'name': 'joint', 'value': [0], 'unit': None}, {'name': 'angle', 'value': [30.0], 'unit': 'deg'}]}, {'function_name': 'move_joint', 'inputs': [{'name': 'joint', 'value': [6], 'unit': None}, {'name': 'angle', 'value': [-45.0], 'unit': 'deg'}]}]}", 'user_query': 'Rotate joint 1 by 30 degrees and joint 7 by -45 degrees'}}
{'data': {'function_calling': "{'functions': [{'function_name': 'get_tcp_position', 'inputs': []}]}", 'user_query': "What is the current position of the robot's TCP?"}}
{'data': {'function_calling': "{'functions': [{'function_name': 'move_joint', 'inputs': [{'name': 'joint', 'value': [0, 1, 2], 'unit': None}, {'name': 'angle', 'value': [0.785, -1.57, 2.09], 'unit': 'rad'}]}]}", 'u

In [None]:
train_dataset[0]

In [None]:
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)