# Supervised Fine-Tuning (SFT) with Serverless customization on SageMaker AI

## Lab 1 - Data preparation

In this notebook, we prepare the dataset that will later be used to fine-tune Qwen 2.5 7B Instruct.

***

### Prerequisites

### Install requirements

In [None]:
%pip install -r requirements.txt

#### Setup and dependencies

In [None]:
import boto3
from sagemaker.core.helper.session_helper import Session, get_execution_role

# Bootstrap session, used only to discover the default bucket.
sess = Session()

# Leave as None to fall back to the session's default bucket,
# or set an explicit bucket name here.
sagemaker_session_bucket = None

if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role: try get_execution_role() first and, if it raises
# ValueError, look the role up by name through the IAM API instead.
try:
    role = get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

s3_client = boto3.client("s3")

# Recreate the session bound to the chosen bucket; the names below are
# reused by the upload cells further down.
sess = Session(default_bucket=sagemaker_session_bucket)
bucket_name = sess.default_bucket()
default_prefix = sess.default_bucket_prefix

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

***

### Prepare the dataset

In [None]:
import datasets
from datasets import load_dataset

# Stream the English configuration of the dataset and keep only the first
# 3000 rows, so the full dataset does not have to be downloaded.
# The shuffle is seeded so that a Restart & Run All reproduces the same row
# order (and therefore the same downstream train/val/test split).
streamed_sample = (
    load_dataset(
        "FreedomIntelligence/medical-o1-reasoning-SFT",
        "en",
        split="train",
        streaming=True,
    )
    .take(3000)
    .shuffle(seed=42, buffer_size=1000)
)

# Materialize the streamed sample into a regular in-memory Dataset.
# The lambda closes over `streamed_sample` (not `dataset`) so it never depends
# on the name that is being assigned by this very statement.
dataset = datasets.Dataset.from_generator(
    lambda: streamed_sample, features=streamed_sample.features
)

In [None]:
import pandas as pd

# Convert the Hugging Face Dataset to pandas through its native Arrow-backed
# conversion instead of iterating it row by row via the DataFrame constructor.
df = dataset.to_pandas()

# Preview the first rows to check the available columns.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of all rows for validation.
train, val = train_test_split(df, test_size=0.2, random_state=42)

# Carve the test set out of the remaining training rows. Splitting `df` again
# here would overwrite `train` with rows that are also in `val` and would let
# `val` and `test` overlap, leaking evaluation data into training.
train, test = train_test_split(train, test_size=0.1, random_state=42)

# Sanity checks: the three splits must partition `df` with no shared rows.
assert len(train) + len(val) + len(test) == len(df), "splits must cover every row once"
assert train.index.intersection(val.index).empty, "train/val overlap"
assert train.index.intersection(test.index).empty, "train/test overlap"
assert val.index.intersection(test.index).empty, "val/test overlap"

print("Number of train elements: ", len(train))
print("Number of validation elements: ", len(val))
print("Number of test elements: ", len(test))

In [None]:
from datasets import Dataset
from tqdm import tqdm


def prepare_dataset_train_val(sample):
    """Yield a single prompt/completion record built from one raw row.

    Args:
        sample: Mapping providing the "Question", "Complex_CoT" and
            "Response" entries of a dataset row.

    Yields:
        dict: ``{"prompt": ..., "completion": ...}`` where the completion is
        the chain of thought wrapped in ``<think>`` tags followed by the
        final response.
    """
    question = sample["Question"]
    reasoning = sample["Complex_CoT"]
    answer = sample["Response"]

    record = {
        "prompt": question,
        "completion": f"<think>\n{reasoning}\n</think>\n\n {answer}",
    }
    yield record


def prepare_dataset_test(sample):
    """Yield a single query/response record built from one raw row.

    Args:
        sample: Mapping providing the "Question", "Complex_CoT" and
            "Response" entries of a dataset row.

    Yields:
        dict: ``{"query": ..., "response": ...}`` where the response is the
        chain of thought wrapped in ``<think>`` tags followed by the final
        response.
    """
    question = sample["Question"]
    reasoning = sample["Complex_CoT"]
    answer = sample["Response"]

    record = {
        "query": question,
        "response": f"<think>\n{reasoning}\n</think>\n\n {answer}",
    }
    yield record

In [None]:
def convert_to_messages_train_val(dataset):
    """Build a new Dataset from the records produced for every input row.

    Each row of ``dataset`` is passed to ``prepare_dataset_train_val`` and
    every record it yields is collected, with a progress bar shown while
    iterating.

    Args:
        dataset: Sized iterable of rows (``len()`` is used for the progress
            bar total).

    Returns:
        Dataset: A Hugging Face Dataset created from the collected records.
    """
    row_count = len(dataset)
    print("Original lenght: ", row_count)

    progress = tqdm(dataset, total=row_count, desc="Converting to messages")

    # Flatten the per-row generators into a single list of records
    records = [
        example
        for row in progress
        for example in prepare_dataset_train_val(row)
    ]

    return Dataset.from_list(records)


def convert_to_messages_test(dataset):
    """Build a new Dataset from the records produced for every input row.

    Each row of ``dataset`` is passed to ``prepare_dataset_test`` and every
    record it yields is collected, with a progress bar shown while iterating.

    Args:
        dataset: Sized iterable of rows (``len()`` is used for the progress
            bar total).

    Returns:
        Dataset: A Hugging Face Dataset created from the collected records.
    """
    row_count = len(dataset)
    print("Original lenght: ", row_count)

    progress = tqdm(dataset, total=row_count, desc="Converting to messages")

    # Flatten the per-row generators into a single list of records
    records = [
        example
        for row in progress
        for example in prepare_dataset_test(row)
    ]

    return Dataset.from_list(records)

In [None]:
from datasets import Dataset, DatasetDict
import json
from random import randint

# Wrap the three pandas splits as Hugging Face Datasets, keyed by split name.
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(split_frame)
        for split_name, split_frame in (
            ("train", train),
            ("val", val),
            ("test", test),
        )
    }
)

# Train and validation use the prompt/completion layout.
train_dataset = convert_to_messages_train_val(dataset["train"])

# Print one random converted training record as a visual check.
train_sample_index = randint(0, len(train_dataset) - 1)
print(json.dumps(train_dataset[train_sample_index], indent=2))

val_dataset = convert_to_messages_train_val(dataset["val"])

# The test split uses the query/response layout instead.
test_dataset = convert_to_messages_test(dataset["test"])

# Print one random converted test record as a visual check.
test_sample_index = randint(0, len(test_dataset) - 1)
print(json.dumps(test_dataset[test_sample_index], indent=2))

#### Upload to Amazon S3

In [None]:
import shutil

In [None]:
# Object-key prefix (inside the bucket) under which the splits are stored.
# It is nested under the session's default bucket prefix when one is set.
dataset_key_root = "datasets/serverless-model-customization-sft"
input_path = (
    f"{default_prefix}/{dataset_key_root}" if default_prefix else dataset_key_root
)

# Full S3 URIs of the three JSON Lines files
s3_dataset_root = f"s3://{bucket_name}/{input_path}"
train_dataset_s3_path = f"{s3_dataset_root}/train/dataset.jsonl"
val_dataset_s3_path = f"{s3_dataset_root}/val/dataset.jsonl"
test_dataset_s3_path = f"{s3_dataset_root}/test/dataset.jsonl"

In [None]:
import tempfile
from pathlib import Path

# Stage the JSON Lines files in a throwaway temporary directory rather than
# in ./data: the previous unconditional rmtree("./data") would also delete any
# files the user already kept there, and it was skipped (leaving stale files
# behind) whenever an upload raised. The context manager cleans up in both
# the success and the failure case.
with tempfile.TemporaryDirectory() as staging_dir:
    for split_name, split_dataset in (
        ("train", train_dataset),
        ("val", val_dataset),
        ("test", test_dataset),
    ):
        local_file = Path(staging_dir) / split_name / "dataset.jsonl"
        local_file.parent.mkdir(parents=True, exist_ok=True)

        # Write the split locally, then upload it to
        # <input_path>/<split>/dataset.jsonl in the session bucket.
        split_dataset.to_json(str(local_file), orient="records")
        s3_client.upload_file(
            str(local_file), bucket_name, f"{input_path}/{split_name}/dataset.jsonl"
        )

print("Training data uploaded to:")
print(train_dataset_s3_path)
print(val_dataset_s3_path)
print(test_dataset_s3_path)

#### Create the training, validation, and test datasets

In [None]:
from sagemaker.ai_registry.dataset import DataSet
from sagemaker.ai_registry.dataset_utils import CustomizationTechnique

In [None]:
# Register the uploaded training file as a dataset for the SFT technique.
# NOTE(review): wait=True presumably blocks until creation finishes - confirm.
train_request = {
    "name": "medical-o1-reasoning-sft-train",
    "source": train_dataset_s3_path,
    "customization_technique": CustomizationTechnique.SFT,
    "wait": True,
}
dataset_train = DataSet.create(**train_request)

print(f"TRAINING_DATASET ARN: {dataset_train.arn}")

# Same registration for the validation file.
val_request = {
    "name": "medical-o1-reasoning-sft-val",
    "source": val_dataset_s3_path,
    "customization_technique": CustomizationTechnique.SFT,
    "wait": True,
}
dataset_val = DataSet.create(**val_request)

print(f"VALIDATION_DATASET ARN: {dataset_val.arn}")

# The test file is registered without a customization technique.
test_request = {
    "name": "medical-o1-reasoning-sft-test",
    "source": test_dataset_s3_path,
    "wait": True,
}
dataset_test = DataSet.create(**test_request)

print(f"TEST_DATASET ARN: {dataset_test.arn}")