# Data Preparation

For this workshop, we created a small dataset, which is available on [Hugging Face](https://huggingface.co/datasets/riv25-aim410/riv25_aim410_nl2sql_toolcall). It is built from two publicly available datasets: `gretelai/synthetic_text_to_sql` (natural-language-to-SQL samples) and `interstellarninja/hermes_reasoning_tool_use` (tool-calling conversations). This notebook shows how it was generated.

## Access data from Hugging Face

In [None]:
%pip install sagemaker==2.254.1 datasets transformers boto3 --quiet --upgrade

In [None]:
# Turn off Xet in the Hugging Face hub client: it has a known bug when run
# under ipykernel (https://github.com/huggingface/xet-core/issues/526).
# This cell runs before the Hugging Face libraries are imported below.
import os

os.environ.update({"HF_HUB_DISABLE_XET": "1"})

In [None]:
from datasets import load_dataset, concatenate_datasets


# Source 1: natural-language-to-SQL samples, shuffled with a fixed seed.
nl2sql_text = load_dataset("gretelai/synthetic_text_to_sql")
nl2sql_text = nl2sql_text.shuffle(seed=42)

# Source 2: tool-use conversations, shuffled with the same fixed seed.
ds = load_dataset("interstellarninja/hermes_reasoning_tool_use")
ds = ds.shuffle(seed=42)

## Prepare data using `apply_chat_template`

In [None]:
from transformers import AutoTokenizer


# Tokenizer whose chat template is applied to every sample further down.
model_id = "Qwen/Qwen3-0.6B"
tok = AutoTokenizer.from_pretrained(
    model_id,
    # NOTE(review): this opts in to running code shipped in the model repo;
    # confirm it is actually required for this model.
    trust_remote_code=True,
)

In [None]:
# Demo only: keep a small slice of each split so the notebook runs quickly.
# Skip this cell to prepare the full datasets (better fine-tuning results).
ds["train"] = ds["train"].select(range(600))
for split_name, row_count in (("train", 500), ("test", 100)):
    nl2sql_text[split_name] = nl2sql_text[split_name].select(range(row_count))

In [None]:
def convert_and_tokenize_hermes_reasoning_tool_use(sample, tokenizer):
    """Render one tool-use conversation into a chat-template string.

    Every message in ``sample["conversations"]`` is rewritten in place: the
    ``from`` key becomes ``role`` and the ``value`` key becomes ``content``.
    The ``human`` and ``gpt`` roles are renamed to ``user`` and
    ``assistant``; any other role is kept unchanged.

    Args:
        sample: Record holding a ``conversations`` list of message dicts.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        The same ``sample`` object, with the rendered chat stored under
        the ``text`` key.
    """
    role_aliases = {"human": "user", "gpt": "assistant"}
    turns = sample["conversations"]

    for turn in turns:
        speaker = turn.pop("from")
        turn["role"] = role_aliases.get(speaker, speaker)
        turn["content"] = turn.pop("value")

    # Despite the function name, tokenize=False is passed, so the template
    # output is stored as-is under "text".
    sample["text"] = tokenizer.apply_chat_template(
        turns, tokenize=False, enable_thinking=False
    )
    return sample


# Render every sampled conversation with the chat template. All original
# column names are passed as `remove_columns`.
ds_v2 = ds["train"].map(
    convert_and_tokenize_hermes_reasoning_tool_use,
    remove_columns=list(ds["train"].features),
    fn_kwargs={"tokenizer": tok},
)

# Hold out 20% for evaluation. The split is seeded, like the shuffles above,
# so re-running the notebook yields the same train/test partition.
ds_v2 = ds_v2.train_test_split(test_size=0.2, seed=42)
tool_call_train_dataset = ds_v2["train"]
tool_call_test_dataset = ds_v2["test"]

In [None]:
def convert_and_tokenize_synthetic_text_to_sql(sample, tokenizer):
    """Render one text-to-SQL record into a chat-template string.

    A three-message chat is built: a fixed system instruction, a user
    message embedding ``sample["sql_context"]`` (schema) and
    ``sample["sql_prompt"]`` (question), and an assistant message holding
    ``sample["sql"]``. The ``sql_explanation`` field is not used.

    Args:
        sample: Record with ``sql_context``, ``sql_prompt`` and ``sql`` keys.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        The same ``sample`` object, with the rendered chat stored under
        the ``text`` key.
    """
    # The prompt text, including its leading indentation and the trailing
    # space after the first sentence block, is part of the generated data.
    system_prompt = (
        "\n"
        "        You are an expert SQL developer. Given the provided database schema and the following user question, generate a syntactically correct SQL query. \n"
        "        Only reply with the SQL query, nothing else. Do NOT use the backticks to identify the code, just reply with the pure SQL query.\n"
        "    "
    )

    schema = sample["sql_context"]
    question = sample["sql_prompt"]
    user_prompt = (
        "\n"
        "        -- Schema --\n"
        f"        {schema}\n"
        "        -- Query --\n"
        f"        {question}\n"
        "        -- SQL --\n"
        "    "
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": sample["sql"]},
    ]
    sample["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, enable_thinking=False
    )
    return sample


# Render the NL2SQL train and test splits with the same conversion; for each
# split, all of its original column names are passed as `remove_columns`.
nl2sql_train_dataset, nl2sql_test_dataset = (
    nl2sql_text[split_name].map(
        convert_and_tokenize_synthetic_text_to_sql,
        remove_columns=list(nl2sql_text[split_name].features),
        fn_kwargs={"tokenizer": tok},
    )
    for split_name in ("train", "test")
)

In [None]:
# Final splits: tool-calling samples followed by NL2SQL samples.
train_dataset = concatenate_datasets([tool_call_train_dataset, nl2sql_train_dataset])
test_dataset = concatenate_datasets([tool_call_test_dataset, nl2sql_test_dataset])

## Upload data

In [None]:
import shutil  # not needed by this cell anymore; kept in case later cells use it
import tempfile
from pathlib import Path

import boto3
import sagemaker

sagemaker_session = sagemaker.Session()
s3_client = boto3.client("s3")

bucket_name = sagemaker_session.default_bucket()
default_prefix = sagemaker_session.default_bucket_prefix

# Key prefix inside the bucket; honor the session's default prefix when set.
if default_prefix:
    input_path = f"{default_prefix}/datasets/finetuning-modeltrainer-accelerate"
else:
    input_path = "datasets/finetuning-modeltrainer-accelerate"

# Build each object key once, so the upload target and the printed URI
# cannot drift apart.
train_key = f"{input_path}/train/dataset.json"
val_key = f"{input_path}/val/dataset.json"
train_dataset_s3_path = f"s3://{bucket_name}/{train_key}"
val_dataset_s3_path = f"s3://{bucket_name}/{val_key}"

# Export both splits to JSON and upload them. The files are staged in a
# temporary directory rather than ./data, so an existing ./data folder is
# never deleted and the staging files are removed even if an upload fails.
with tempfile.TemporaryDirectory() as staging_dir:
    for dataset, object_key, file_name in (
        (train_dataset, train_key, "train.json"),
        (test_dataset, val_key, "val.json"),
    ):
        local_file = str(Path(staging_dir) / file_name)
        dataset.to_json(local_file, orient="records")
        s3_client.upload_file(local_file, bucket_name, object_key)

print("Training data uploaded to:")
print(train_dataset_s3_path)
print(val_dataset_s3_path)