# Fine-tune DeepSeek-R1 using SageMaker HyperPod recipes and ModelTrainer

In this notebook, we fine-tune [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) on Amazon SageMaker AI, using SageMaker HyperPod recipes and the [ModelTrainer](https://sagemaker.readthedocs.io/en/v2.239.0/api/training/model_trainer.html) class.

Recipe: [DeepSeek R1 671b - QLoRA](https://github.com/aws/sagemaker-hyperpod-recipes/blob/main/recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_qlora.yaml)


## Prerequisites

Our first step is to install the libraries we need on the client to prepare our dataset and start our training and evaluation jobs.

In [None]:
!pip install --upgrade "setuptools" "sagemaker==2.239.1" "graphene" "datasets==3.2.0" "transformers==4.44.2"

In [None]:
from sagemaker.modules import Session
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import Compute
from sagemaker.modules.configs import Networking
from sagemaker.modules.configs import FileSystemDataSource
from sagemaker.modules.configs import S3DataSource
from sagemaker.modules.configs import InputData
from sagemaker.modules.configs import StoppingCondition
from sagemaker.modules.configs import SourceCode
from typing import Any
from utility import *
import sagemaker
import boto3
import os
from datasets import load_dataset
from transformers import AutoTokenizer


In [None]:
# SageMaker session and the default S3 bucket used later for the tokenized datasets
sagemaker_session = sagemaker.Session()
bucket_name = sagemaker_session.default_bucket()

# HuggingFace Model ID
model_id = "deepseek-ai/DeepSeek-R1"

# VPC config: replace the placeholders with the subnet(s) and security group(s)
# the training jobs should run in.
network_config={
   "subnets": ["subnet-xxxx"], # e.g. ['subnet-xxxx','subnet-yyyyy']
   "security_group_ids": ["sg-xxxx"] # e.g. ["sg-xxxx"]
}

# FSx mount name (placeholder, replace with your file system's mount name)
fsx_mount_point='/xxxx'

# HuggingFace token: read from the HF_TOKEN environment variable so the secret
# does not have to be pasted into (and saved with) the notebook. Falls back to
# the "<>" placeholder when the variable is not set.
hf_token = os.environ.get("HF_TOKEN", "<>")

# Directory (relative to the FSx mount) that holds the downloaded base model
fsx_dir_basemodel="deepseek_r1_671b_tj"

## Prepare the dataset

In this example, we use the [FreedomIntelligence/medical-o1-reasoning-SFT](https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT) dataset from Hugging Face. This dataset is used to fine-tune HuatuoGPT-o1, a medical LLM designed for advanced medical reasoning. It was constructed using GPT-4o, which searches for solutions to verifiable medical problems and validates them through a medical verifier.

For details, see the paper and GitHub repository.

In [None]:
# HF dataset that we will be working with 
dataset_name="FreedomIntelligence/medical-o1-reasoning-SFT"

In [None]:
def generate_prompt(data_point):
    """
    Build the instruction-style training prompt for one dataset record.

    The prompt is assembled from flush-left lines so that the indentation of
    this source file does not leak into the text the model is trained on.

    Args:
        data_point (dict): Record providing the "Question" and "Complex_CoT" keys

    Returns:
        dict: Dictionary with a single "prompt" key holding the formatted prompt

    Raises:
        KeyError: If "Question" or "Complex_CoT" is missing from data_point
    """
    prompt_lines = [
        "Below is an instruction that describes a task, paired with an input that provides further context.",
        "Write a response that appropriately completes the request.",
        "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.",
        "",
        "### Instruction:",
        "You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.",
        "Please answer the following medical question.",
        "",
        "### Question:",
        f'{data_point["Question"]}',
        "",
        "### Response:",
        f'{data_point["Complex_CoT"]}',
    ]
    # strip() drops whitespace at the very start/end (e.g. trailing newlines in the CoT)
    return {"prompt": "\n".join(prompt_lines).strip()}

In [None]:
# Pull the English config from the HF hub: the first 5% of "train" is held out
# as the test set, the remaining 95% is used for training.
train_set = load_dataset(dataset_name, 'en', split="train[5%:]")
test_set = load_dataset(dataset_name, 'en', split="train[:5%]")

# All original columns are dropped once each record has been turned into a prompt
columns_to_remove = list(train_set.features)

# Same mapping options for both splits
prompt_map_kwargs = dict(remove_columns=columns_to_remove, batched=False)

train_dataset = train_set.map(generate_prompt, **prompt_map_kwargs)
test_dataset = test_set.map(generate_prompt, **prompt_map_kwargs)

In [None]:
# Review dataset
train_dataset, test_dataset

In [None]:
####################
# Model & Tokenizer
####################
# Every example is padded/truncated to this many tokens by tokenize() below
max_seq_length=1024

# Load the tokenizer that belongs to model_id, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        use_fast=True
    )

# Use the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token
    
def tokenize(text):
    """
    Tokenize the "prompt" field of one record to a fixed length.

    The prompt is truncated/padded to max_seq_length tokens, and "labels" is
    added as a plain copy of "input_ids" (padded positions included).

    Args:
        text (dict): Record containing a "prompt" key

    Returns:
        The tokenizer output with an additional "labels" entry
    """
    encoded = tokenizer(
        text['prompt'],
        truncation=True,
        padding="max_length",
        max_length=max_seq_length,
    )
    label_ids = encoded["input_ids"].copy()
    encoded["labels"] = label_ids
    return encoded

In [None]:
train_dataset = train_dataset.map(tokenize, remove_columns=["prompt"])
test_dataset = test_dataset.map(tokenize, remove_columns=["prompt"])

### Upload the tokenized data to Amazon S3

In [None]:
# Key prefix inside the default bucket under which both splits are stored
input_path = 'datasets/deepseek-r1-distilled-qwen-7b-recipe-lora'
train_dataset_s3_path = f"s3://{bucket_name}/{input_path}/train"
test_dataset_s3_path = f"s3://{bucket_name}/{input_path}/test"

# Write the tokenized splits straight to the s3:// URIs.
# NOTE(review): writing to S3 this way presumably relies on s3fs/fsspec being
# installed in the notebook environment — confirm.
train_dataset.save_to_disk(train_dataset_s3_path)
test_dataset.save_to_disk(test_dataset_s3_path)

# Common functions 

Let us define a utility function that creates a training job definition using the SageMaker ModelTrainer class.

For additional information about ModelTrainer, refer to the blog post "Accelerate your ML lifecycle using the new and improved Amazon SageMaker Python SDK – Part 1: ModelTrainer".

In [None]:
def _redact_secrets(config: dict) -> dict:
    """Return a shallow copy of config that is safe to print (token-like hyperparameters masked)."""
    printable = dict(config)
    hyperparameters = printable.get('hyperparameters')
    if isinstance(hyperparameters, dict):
        printable['hyperparameters'] = {
            key: '***' if 'token' in str(key).lower() else value
            for key, value in hyperparameters.items()
        }
    return printable


def create_model_trainer(
    use_recipes: bool,
    compute: Any,
    network: Any,
    data_channel: Any,
    action: str,
    hyperparameters: "dict | None" = None,
    source_code: Any = None,
    training_recipe: "str | None" = None,
    recipe_overrides: "dict | None" = None,
    image_uri: "str | None" = None,
) -> "ModelTrainer":
    """
    Create (but do not start) a SageMaker ModelTrainer.

    The caller starts the job afterwards with ``model_trainer.train(...)``.

    Args:
        use_recipes (bool): If True, build the trainer with
            ``ModelTrainer.from_recipe``; otherwise with ``ModelTrainer(...)``
        compute: Compute configuration; its ``instance_type`` attribute is read
            when the default PyTorch image has to be looked up
        network: Network configuration, passed as ``networking``
        data_channel: Input data channel. Only checked for presence here; it is
            handed to ``train()`` by the caller
        action (str): Suffix of the base job name ``model-trainer-deepseek-<action>``
        hyperparameters (dict, optional): Hyperparameters; omitted when empty
        source_code (optional): Source code configuration; omitted when None
        training_recipe (str, optional): Recipe name, required when use_recipes is True
        recipe_overrides (dict, optional): Overrides applied on top of the recipe
        image_uri (str, optional): Training image; when empty a default is chosen

    Returns:
        ModelTrainer: The configured trainer

    Raises:
        ValueError: If a required parameter is None
        RuntimeError: If the ModelTrainer cannot be constructed
    """
    # Parameter validation (runs before anything from the SDK is touched)
    required_params = {
        'use_recipes': use_recipes,
        'compute': compute,
        'network': network,
        'data_channel': data_channel,
        'action': action,
    }
    if use_recipes:
        # A recipe-based trainer cannot be built without a recipe
        required_params['training_recipe'] = training_recipe

    for param_name, param_value in required_params.items():
        if param_value is None:
            raise ValueError(f"Required parameter '{param_name}' is missing")

    # Image URI selection
    if not image_uri:
        if use_recipes:
            # NOTE: this default image lives in us-east-1; pass image_uri
            # explicitly when running in another region.
            image_uri = (
                "658645717510.dkr.ecr.us-east-1.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121"
            )
        else:
            image_uri = sagemaker.image_uris.retrieve(
                framework="pytorch",
                region=sagemaker_session.boto_session.region_name,
                version="2.4",
                instance_type=compute.instance_type,
                image_scope="training"
            )

    # Trainer configuration; the job is stopped after 12 hours (43200 s)
    estimator_config = {
        'training_image': image_uri,
        'networking': network,
        'compute': compute,
        'base_job_name': f'model-trainer-deepseek-{action}',
        'stopping_condition': StoppingCondition(max_runtime_in_seconds=43200),
    }

    if hyperparameters:
        estimator_config['hyperparameters'] = hyperparameters

    if source_code is not None:
        estimator_config['source_code'] = source_code

    if use_recipes:
        estimator_config.update({
            'training_recipe': training_recipe,
            'recipe_overrides': recipe_overrides,
            'requirements': "scripts/requirements.txt",
        })

    # Hyperparameters may carry the Hugging Face token, so mask it in the output
    print(f'estimator_config:{_redact_secrets(estimator_config)}')

    try:
        if use_recipes:
            return ModelTrainer.from_recipe(**estimator_config)
        return ModelTrainer(**estimator_config)
    except Exception as e:
        # Keep the original exception as the cause so the traceback stays useful
        raise RuntimeError(f"Failed to create model trainer: {str(e)}") from e


# STEP 1: Download model to Amazon FSx for Lustre directory 

In this step, we download the DeepSeek-R1 model to an Amazon FSx for Lustre directory.

We select the instance type, the FSx data channel, the network configuration and the source code, and then define a ModelTrainer that runs a training job on an ml.c5.18xlarge instance to download the DeepSeek-R1 model from the Hugging Face Hub.

In [None]:
# One ml.c5.18xlarge instance runs the download job
compute = ComputeCreator.create(instance_type="ml.c5.18xlarge", instance_count=1)

# FSx data channel rooted at the mount point
data_channel = FSxDataChannelCreator.create_channel(directory_path=fsx_mount_point)

# Network configuration built from the VPC settings
network = NetworkConfigCreator.create_network_config(network_config)

# Entry point of the job: scripts/download.py
source_code = SourceCode(source_dir="scripts", entry_script="download.py")

# Show what was created, together with each object's type
for label, created in (
    ("Compute Instance", compute),
    ("Data Channel", data_channel),
    ("Network", network),
    ("Source code", source_code),
):
    print(f'{label} created: {created}')
    print(f'Type: {type(created)}\n')

# Hyperparameters handed to the download script
hyperparameters = dict(
    model_id=model_id,                  # Hugging Face model id
    hf_token=hf_token,
    local_fsx_dir=fsx_dir_basemodel,
)

# Trainer for the download job (started in the next cell)
model_trainer = create_model_trainer(
    use_recipes=False,
    compute=compute,
    network=network,
    data_channel=data_channel,
    action="download",
    hyperparameters=hyperparameters,
    source_code=source_code,
)


In [None]:
model_trainer.train(input_data_config=[data_channel], wait=True)

# STEP 2: Convert DeepSeek R1 from FP8 to BF16

HyperPod recipes disable FP8 in the QLoRA and LoRA recipes. BF16 is the optimal precision type for generalizing PEFT training configurations to various datasets. However, the default weights provided by the DeepSeek team in their official R1 repository are of type FP8. To ensure stable fine-tuning of the DeepSeek-R1 model, we first convert it to BF16 using the fp8_cast_bf16.py command-line script provided by DeepSeek. Executing this script copies the converted BF16 weights, in safetensors format, to the specified output directory.

We use the ModelTrainer class to run the conversion as a training job.

In [None]:
# Sub-directory for the converted BF16 weights, and the FSx path of the base model
fsx_modeldir_bf16 = "deepseek_r1_bf16"
FSX_DIR_PATH = "/".join((fsx_mount_point, fsx_dir_basemodel))

# One ml.p5.48xlarge instance runs the conversion job
compute = ComputeCreator.create(instance_type="ml.p5.48xlarge", instance_count=1)

# FSx data channel rooted at the base model directory
data_channel = FSxDataChannelCreator.create_channel(directory_path=FSX_DIR_PATH)

# Network configuration built from the VPC settings
network = NetworkConfigCreator.create_network_config(network_config)

# Entry point of the job: scripts/convert.sh
source_code = SourceCode(source_dir="scripts", entry_script="convert.sh")

# Show what was created, together with each object's type
for label, created in (
    ("Compute Instance", compute),
    ("Data Channel", data_channel),
    ("Network", network),
    ("Source code", source_code),
):
    print(f'{label} created: {created}')
    print(f'Type: {type(created)}\n')

# Hyperparameters handed to the conversion script
hyperparameters = dict(
    model_id=model_id,                  # Hugging Face model id
    hf_token=hf_token,
    converted_fsx_dir=fsx_modeldir_bf16,
)

# Trainer for the conversion job (started in the next cell)
model_trainer = create_model_trainer(
    use_recipes=False,
    compute=compute,
    network=network,
    data_channel=data_channel,
    action="convert",
    hyperparameters=hyperparameters,
    source_code=source_code,
)


In [None]:
model_trainer.train(input_data_config=[data_channel], wait=True)

# STEP 3: Fine-tune the DeepSeek-R1 model

Our next phase is fine-tuning the DeepSeek-R1 model on two ml.p5.48xlarge instances, leveraging distributed training. We implement this through the SageMaker recipe "hf_deepseek_r1_671b_seq8k_gpu_qlora", which uses the Quantized Low-Rank Adaptation (QLoRA) methodology. QLoRA makes LLMs trainable on limited compute by quantizing the base model to 4-bit precision while using small, trainable low-rank adapters for fine-tuning, dramatically reducing memory requirements without sacrificing model quality.

We can override recipe parameters to tune the recipe to our needs.

In [None]:
# Values that replace the recipe's defaults. All paths are locations inside the
# training container.
recipe_overrides = {
    "run": {
        "results_dir": "/opt/ml/model",
    },
    "exp_manager": {
        # NOTE(review): "modelweights" is presumably the channel name assigned by
        # FSxDataChannelCreator — verify against the utility module.
        "exp_dir": "/opt/ml/input/data/modelweights/output/",
        "explicit_log_dir": "/opt/ml/output/tensorboard",
    },
    "model": {
        "hf_model_name_or_path": "/opt/ml/input/data/modelweights/",
        "data": {
            #"use_synthetic_data": True,
            # The last path component matches the "train" / "test" input channel names
            "train_dir": "/opt/ml/input/data/train",
            "val_dir": "/opt/ml/input/data/test",
        }
    },
}

In [None]:
# Create compute configuration with P5 instances
compute = ComputeCreator.create(
    instance_type="ml.p5.48xlarge",
    instance_count=2
)

# Construct FSx directory path for model
fsx_dir_path = f"{fsx_mount_point}/{fsx_dir_basemodel}/{fsx_modeldir_bf16}"

# Build the FSx data channel for the BF16 weights in this cell, so that the
# value printed and passed below points at fsx_dir_path rather than at whatever
# an earlier cell left in `data_channel`.
data_channel = FSxDataChannelCreator.create_channel(
    directory_path=fsx_dir_path
)

# Set up network configuration
network = NetworkConfigCreator.create_network_config(network_config)

# Print configuration details
print(f'Compute Instance created: {compute}')
print(f'Type: {type(compute)}\n')

print(f'Data Channel created: {data_channel}')
print(f'Type: {type(data_channel)}\n')

print(f'Network created: {network}')
print(f'Type: {type(network)}\n')

# Create model trainer for fine-tuning (recipe-based, so no source code is passed)
model_trainer = create_model_trainer(
    use_recipes=True,
    compute=compute,
    network=network,
    data_channel=data_channel,
    action="finetune",
    training_recipe='fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_qlora',
    recipe_overrides=recipe_overrides
)

In [None]:
from sagemaker.modules.configs import InputData

# Create FSx data channel pointing at the BF16 model weights directory
data_channel = FSxDataChannelCreator.create_channel(
    directory_path=fsx_dir_path
)

# Input channels for the tokenized datasets; the channel names match the
# train_dir / val_dir paths set in recipe_overrides.
train_input = InputData(
    channel_name="train",
    data_source=train_dataset_s3_path, # S3 path where the training split is stored
)

test_input = InputData(
    channel_name="test",
    data_source=test_dataset_s3_path, # S3 path where the test split is stored
)

# Display the three channels as the cell output
data_channel, train_input, test_input

In [None]:
model_trainer.train(input_data_config=[data_channel, train_input, test_input], wait=True)

# STEP 4: Merge the trained adapter with the base model 

In [None]:
# Training image used for this job
ECR_IMAGE_URI = "658645717510.dkr.ecr.us-east-1.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121"

# One ml.p5.48xlarge instance runs the job
compute = ComputeCreator.create(instance_type="ml.p5.48xlarge", instance_count=1)

# FSx location of the BF16 model weights, and the data channel that exposes it
fsx_dir_path = "/".join((fsx_mount_point, fsx_dir_basemodel, fsx_modeldir_bf16))
data_channel = FSxDataChannelCreator.create_channel(directory_path=fsx_dir_path)

# Network configuration built from the VPC settings
network = NetworkConfigCreator.create_network_config(network_config)

# Show what was created, together with each object's type
for label, created in (
    ("Compute Instance", compute),
    ("Data Channel", data_channel),
    ("Network", network),
):
    print(f'{label} created: {created}')
    print(f'Type: {type(created)}\n')

# Hyperparameters: an empty "mp_parameters" mapping
# (e.g. "tensor_parallel_degree": 1 could be set here)
hyperparameters = {"mp_parameters": {}}

# Entry point of the job: scripts/cli-inference.sh
source_code = SourceCode(source_dir="scripts", entry_script="cli-inference.sh")

# Trainer for the adapter-merge job (started in the next cell)
model_trainer = create_model_trainer(
    use_recipes=False,
    compute=compute,
    network=network,
    data_channel=data_channel,
    action="mergeadapter",
    hyperparameters=hyperparameters,
    source_code=source_code,
    image_uri=ECR_IMAGE_URI,
)


In [None]:
model_trainer.train(input_data_config=[data_channel], wait=True)