# Prep work

Before starting the training, we need to create model repositories on the HuggingFace model hub for both our reward model adapters and our final RLHF model adapters.

Also, in order to be able to use the Llama 3.1 8B Instruct model, we need to accept the model's license terms on the HuggingFace model hub.

To authenticate against the HuggingFace model hub we need to create an access token, which we will use later in the notebook.

In [None]:
# Define model to be fine-tuned
#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "meta-llama/Llama-3.2-1B-Instruct"

# Install dependencies

In [None]:
# Install PyTorch with CUDA support
%pip install -U torch==2.2.0+cu118 --index-url https://download.pytorch.org/whl/cu118

In [None]:
# Install all dependencies from requirements.txt file
%pip install -r requirements.txt

In [None]:
# Import required libraries
import boto3
import botocore
import bitsandbytes as bnb
import multiprocessing
import sys
import functools
import json
import torch
import transformers
import warnings
from dataclasses import dataclass, field
from typing import Optional
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
from trl import ModelConfig, RewardConfig, PPOConfig, PPOTrainer, RewardTrainer, AutoModelForCausalLMWithValueHead, get_kbit_device_map, get_peft_config, get_quantization_config
from trl.core import LengthSampler
from accelerate import Accelerator
from peft import AutoPeftModelForCausalLM, AutoPeftModelForSequenceClassification, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sagemaker.remote_function import remote
from tqdm import tqdm
import s3fs

# Setup
## Creation of S3 bucket for data and model artifacts

In [None]:
import boto3
import sagemaker
from sagemaker.session import Session
import uuid
import time

def create_s3_bucket_for_models(bucket_name=None, region=None):
    """
    Create an S3 bucket for storing SageMaker data and model artifacts.

    Args:
        bucket_name: Optional name for the S3 bucket. If not provided, a unique
            name is generated from the current timestamp and a random suffix.
        region: AWS region to create the bucket in. If not provided, uses the
            SageMaker session's region.

    Returns:
        tuple: (bucket_name, s3_output_path, s3_data_path) - the name of the
        created bucket, the S3 URI for model output and the S3 URI for data.
        All three elements are None if the bucket could not be created, so the
        result can always be unpacked into three names.
    """
    # Resolve the region first so the S3 client can be bound to it
    if not region:
        region = Session().boto_region_name

    # The client must talk to the same region that is sent as LocationConstraint
    s3_client = boto3.client('s3', region_name=region)

    # Generate a unique bucket name if not provided
    if not bucket_name:
        timestamp = int(time.time())
        random_suffix = str(uuid.uuid4())[:8]
        bucket_name = f"sagemaker-model-artifacts-{timestamp}-{random_suffix}"

    # Create the S3 bucket
    try:
        if region == 'us-east-1':
            # us-east-1 is the default location and takes no LocationConstraint
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': region}
            )
    except Exception as e:
        print(f"Error creating S3 bucket: {e}")
        # Same arity as the success path, so callers unpacking three values do not fail
        return None, None, None

    print(f"Created S3 bucket: {bucket_name}")

    # Prefixes for model output and data inside the new bucket
    s3_output_path = f"s3://{bucket_name}/models"
    s3_data_path = f"s3://{bucket_name}/data"

    return bucket_name, s3_output_path, s3_data_path

# Create a bucket for your SageMaker models
bucket_name, s3_output_path, s3_data_path = create_s3_bucket_for_models()
print(f"Model output path: {s3_output_path}, Data output path: {s3_data_path}")

# Data preparation

## Reward model training dataset

Dataset used: Anthropic HH-RLHF (helpful) - https://huggingface.co/datasets/Anthropic/hh-rlhf

Target format: 
```json
DatasetDict({
    train: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: _
    })
    test: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: _
    })
})
```

In [None]:
# Login to huggingface
# NOTE: "***HF_TOKEN***" is a placeholder - replace it with your own access token
# and avoid saving or committing the notebook with the real value in it.
hf_token = "***HF_TOKEN***"
login(hf_token)

In [None]:
# Load dataset
ds = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-base")
ds

In [None]:
ds['train'][67]

In [None]:
def extract_dialogue(input_text):
    """
    Parse a "Human:" / "Assistant:" transcript into a list of chat messages.

    The text is split on blank lines. A paragraph starting with "Human:" opens a
    "user" message, one starting with "Assistant:" opens an "assistant" message,
    and any other paragraph is appended (separated by a blank line) to the
    message before it.

    Args:
        input_text: Transcript string with turns separated by blank lines.

    Returns:
        list[dict]: Messages of the form {"role": ..., "content": ...} in
        transcript order. Empty input yields an empty list. Text that appears
        before the first speaker tag has no message to attach to and is dropped.
    """
    speaker_roles = (("Human:", "user"), ("Assistant:", "assistant"))
    dialogue_list = []

    for paragraph in input_text.strip().split("\n\n"):
        for tag, role in speaker_roles:
            if paragraph.startswith(tag):
                # Slice off only the leading tag: str.replace would also delete
                # the tag wherever it occurs inside the message, and would miss
                # a tag that is not followed by a space (e.g. a bare "Assistant:").
                dialogue_list.append({"role": role, "content": paragraph[len(tag):].strip()})
                break
        else:
            # Continuation of the previous message; skipped when there is none yet
            if dialogue_list:
                dialogue_list[-1]["content"] += "\n\n" + paragraph.strip()

    return dialogue_list

def process(row):
    """
    Convert a raw preference row in place: parse both transcripts into message
    lists and expose the first message of the chosen transcript as the prompt.
    """
    for column in ("chosen", "rejected"):
        row[column] = extract_dialogue(row[column])
    opening_message = row["chosen"][0]
    row["prompt"] = opening_message["content"]
    return row

In [None]:
ds_processed = ds.map(
        process,
        load_from_cache_file=False,
    )
ds_processed

In [None]:
ds_processed['train'][67]

In [None]:
# Adjusting to llama prompt template format: https://github.com/meta-llama/llama-recipes
system_prompt = "Please answer the user's question to the best of your knowledge. If you don't know the answer respond that you don't know."

def encode_dialogue_turn(message):
    """
    Render one chat message ({"role": ..., "content": ...}) as a single turn:
    role header, blank line, content, end-of-turn token.
    """
    # The Llama 3 prompt format separates the role header from the content with "\n\n"
    return f'<|start_header_id|>{message.get("role")}<|end_header_id|>\n\n{message.get("content")}<|eot_id|>'

def encode_dialogue(dialogue):
    """
    Render a list of chat messages as one prompt string.

    The result starts with <|begin_of_text|>. When the module-level
    `system_prompt` is non-empty it is emitted as a leading system turn, using
    the same turn layout as every other message.
    """
    turns = list(dialogue)
    if system_prompt:
        turns.insert(0, {"role": "system", "content": system_prompt})
    return "<|begin_of_text|>" + "".join(encode_dialogue_turn(turn) for turn in turns)


def encode_row(item):
    """
    Turn the parsed "chosen" and "rejected" message lists of a row into prompt
    strings; the "prompt" value is carried over unchanged.
    """
    encoded = {side: encode_dialogue(item[side]) for side in ("chosen", "rejected")}
    encoded["prompt"] = item["prompt"]
    return encoded
                                      
def encode_dataset(dataset):
    """Apply `encode_row` to every row of `dataset` and return the results as a list."""
    return [encode_row(row) for row in dataset]

In [None]:
encoded_dataset = ds_processed.map(encode_row)
encoded_dataset

In [None]:
encoded_dataset['train'][67]

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Tokenize and stack into target format
def preprocess_function(examples):
    """
    Tokenize a batch of chosen/rejected text pairs with the module-level
    `tokenizer` and return the four parallel columns of the target format.
    """
    chosen_ids, chosen_masks = [], []
    rejected_ids, rejected_masks = [], []

    for chosen_text, rejected_text in zip(examples["chosen"], examples["rejected"]):
        chosen_encoding = tokenizer(chosen_text)
        rejected_encoding = tokenizer(rejected_text)

        chosen_ids.append(chosen_encoding["input_ids"])
        chosen_masks.append(chosen_encoding["attention_mask"])
        rejected_ids.append(rejected_encoding["input_ids"])
        rejected_masks.append(rejected_encoding["attention_mask"])

    return {
        "input_ids_chosen": chosen_ids,
        "attention_mask_chosen": chosen_masks,
        "input_ids_rejected": rejected_ids,
        "attention_mask_rejected": rejected_masks,
    }

In [None]:
tokenized_dataset_hhrlhf = encoded_dataset.map(
        preprocess_function,
        batched=True,
    ).remove_columns(["chosen", "rejected", "prompt"])
tokenized_dataset_hhrlhf

In [None]:
import boto3
import s3fs
#from datasets.filesystems import S3FileSystem

# Define the S3 path
s3_bucket = bucket_name
dataset_path_hhrlhf = f's3://{s3_bucket}/experiments-hhrlhf/helpful-base-train-test-tokenized-llama318binstruct'

# Verify S3 bucket permissions first
s3_client = boto3.client('s3')
try:
    s3_client.head_bucket(Bucket=s3_bucket)
    print(f"Successfully connected to bucket: {s3_bucket}")
except Exception as e:
    print(f"Error connecting to bucket: {e}")
    raise


# Save the dataset directly to the s3:// URI.
# The previous first attempt passed fs=S3FileSystem(), but that name is never
# imported in this notebook, so it always failed with a NameError and only the
# path-based call below actually ran. Make sure the s3fs package is installed.
tokenized_dataset_hhrlhf.save_to_disk(dataset_path_hhrlhf)
print(f"Successfully uploaded dataset to: {dataset_path_hhrlhf}")

In [None]:
dataset_path_hhrlhf

## PPO training dataset

Dataset used: Stanford Question&Answering Dataset (SQuAD) - https://rajpurkar.github.io/SQuAD-explorer/

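Target format: 
```json
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'query'],
        num_rows: _
    })
    test: Dataset({
        features: ['input_ids', 'query'],
        num_rows: _
    })
})
```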

In [None]:
# Download SQuAD dataset
!wget --no-check-certificate https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
!wget --no-check-certificate https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

In [None]:
# Load files
# JSON files are UTF-8; pass the encoding explicitly so the result does not
# depend on the platform's default locale encoding.
with open('./train-v2.0.json', encoding='utf-8') as f:
    d_train = json.load(f)
with open('./dev-v2.0.json', encoding='utf-8') as f:
    d_test = json.load(f)

In [None]:
def extract_questions(dataset):
    """
    Collect every question of a SQuAD-style topic list as a two-message dialogue.

    Walks topic -> 'paragraphs' -> 'qas' and, for each entry, emits a fixed
    system instruction followed by the question as the user message.
    """
    return [
        [
            {
                "role": "system",
                "content": "Instruction: Please answer the user's question to the best of your knowledge. If you don't know the answer respond that you don't know.",
            },
            {"role": "user", "content": entry['question']},
        ]
        for topic in dataset
        for paragraph in topic['paragraphs']
        for entry in paragraph['qas']
    ]

# Adjusting to llama prompt template format: https://github.com/meta-llama/llama-recipes
def encode_dialogue_turn(message):
    """
    Render one chat message ({"role": ..., "content": ...}) as a single turn:
    role header, blank line, content, end-of-turn token.
    """
    # The Llama 3 prompt format separates the role header from the content with "\n\n"
    return f'<|start_header_id|>{message.get("role")}<|end_header_id|>\n\n{message.get("content")}<|eot_id|>'

def encode_dialogue(dialogue):
    """
    Render a list of chat messages as one prompt string, prefixed with
    <|begin_of_text|>, and wrap it as {'input': prompt}.
    """
    rendered_turns = "".join(encode_dialogue_turn(turn) for turn in dialogue)
    return {'input': f'<|begin_of_text|>{rendered_turns}'}

                                      
def encode_dataset(dataset):
    """Encode every dialogue in `dataset` with `encode_dialogue` and return the results as a list."""
    return [encode_dialogue(dialogue) for dialogue in dataset]

In [None]:
encoded_train = encode_dataset(extract_questions(d_train['data']))
encoded_test = encode_dataset(extract_questions(d_test['data']))

In [None]:
encoded_train[0]

In [None]:
# Create DatasetDict
dataset_dict = DatasetDict({
    "train": Dataset.from_list(encoded_train),
    "test": Dataset.from_list(encoded_test)
})
dataset_dict

In [None]:
# Restrict training context size (due to memory limitations, can be adjusted)
input_min_text_length = 1
input_max_text_length = 2048

def create_and_prepare_dataset(tokenizer, dataset):
    """
    Tokenize the "input" column of `dataset` and return it in torch format.

    Each example gets "input_ids" (the encoded input, cut to a length drawn from
    a LengthSampler over input_min_text_length..input_max_text_length) and
    "query" (those ids decoded back to text).
    """
    draw_length = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(example):
        # Draw the cut-off first, then encode and truncate
        max_tokens = draw_length()
        token_ids = tokenizer.encode(example["input"])
        example["input_ids"] = token_ids[:max_tokens]
        example["query"] = tokenizer.decode(example["input_ids"])
        return example

    prepared = dataset.map(tokenize, batched=False)
    prepared.set_format("torch")
    return prepared


tokenized_dataset_squad = create_and_prepare_dataset(tokenizer, dataset_dict).remove_columns(["input"])
tokenized_dataset_squad

In [None]:
tokenized_dataset_squad['train'][0]

In [None]:
# Save dataset to s3
#s3_bucket = "***S3_BUCKET_NAME***"

#dataset_path_squad = f's3://{s3_bucket}/experiments-squad/train-test-contextwindow-padding-2048'
#tokenized_dataset_squad.save_to_disk(dataset_path_squad)

#print(f"Uploaded dataset to: {dataset_path_squad}")

In [None]:
import boto3
import s3fs

# Define the S3 path
s3_bucket = bucket_name
dataset_path_squad = f's3://{s3_bucket}/experiments-squad/train-test-contextwindow-padding-2048'

# Verify S3 bucket permissions first
s3_client = boto3.client('s3')
try:
    s3_client.head_bucket(Bucket=s3_bucket)
    print(f"Successfully connected to bucket: {s3_bucket}")
except Exception as e:
    print(f"Error connecting to bucket: {e}")
    raise


# Save the dataset directly to the s3:// URI.
# The previous first attempt passed fs=S3FileSystem(), but that name is never
# imported in this notebook, so it always failed with a NameError and only the
# path-based call below actually ran. Make sure the s3fs package is installed.
tokenized_dataset_squad.save_to_disk(dataset_path_squad)
print(f"Successfully uploaded dataset to: {dataset_path_squad}")

# Training

In [None]:
import os
# Set path to config file for remote decorator
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

In [None]:
# Confirm a token is set without echoing the secret into the notebook output
print(f"Hugging Face token configured: {bool(hf_token)}")

## Reward model training 

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters require gradients, the total
    parameter count, and the trainable share in percent.
    """
    total_count = 0
    trainable_count = 0
    for _name, tensor in model.named_parameters():
        size = tensor.numel()
        total_count += size
        if tensor.requires_grad:
            trainable_count += size
    share = 100 * trainable_count / total_count
    print(f"trainable params: {trainable_count} || all params: {total_count} || trainable%: {share}")
    
def find_all_linear_names(hf_model):
    """
    Return the distinct leaf names (last dotted component) of all
    bnb.nn.Linear4bit sub-modules of `hf_model`, excluding "lm_head".
    """
    leaf_names = {
        qualified_name.split(".")[-1]
        for qualified_name, layer in hf_model.named_modules()
        if isinstance(layer, bnb.nn.Linear4bit)
    }
    # needed for 16-bit
    leaf_names.discard("lm_head")
    return list(leaf_names)

# Start training with remote decorator (https://docs.aws.amazon.com/sagemaker/latest/dg/train-remote-decorator.html). Additional job config is being pulled in from config.yaml. 
@remote(keep_alive_period_in_seconds=3600, volume_size=100, job_name_prefix=f"train-{model_id.split('/')[-1].replace('.', '-')}-reward", use_torchrun=True, nproc_per_node=1, include_local_workdir=True)
def train_fn(
        model_name,
        train_ds,
        lora_r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=1,
        learning_rate=2e-4,
        num_train_epochs=1,
        fsdp="",
        fsdp_config=None,
        chunk_size=10000,
        gradient_checkpointing=False,
        merge_weights=False,
        seed=42,
        token=None,
        model_hub_repo_id=None,
        range_train=None,
        range_eval=None
):
    """
    Train a LoRA reward-model adapter on a 4-bit quantized sequence-classification model.

    Args:
        model_name: Hub ID or path used to load both the tokenizer and the base model.
        train_ds: Location passed to `load_from_disk`; the loaded dataset must
            provide "train" and "test" splits.
        lora_r, lora_alpha, lora_dropout: Forwarded to `LoraConfig`.
        per_device_train_batch_size, per_device_eval_batch_size,
        gradient_accumulation_steps, learning_rate, num_train_epochs,
        fsdp, fsdp_config: Forwarded to `RewardConfig`.
        chunk_size: Accepted but not referenced in the body.
        gradient_checkpointing: Enables gradient checkpointing on the model and
            in the training config; also disables the model's `use_cache`.
        merge_weights: Accepted but not referenced in the body.
        seed: Passed to `set_seed`.
        token: Optional Hugging Face access token; `login` is called when set.
        model_hub_repo_id: Optional Hub repository the trained model is pushed to.
        range_train: If truthy, only the first `range_train` training rows are used.
        range_eval: If truthy, only the first `range_eval` test rows are used.

    The adapter and tokenizer are written to "/opt/ml/model".
    """

    set_seed(seed)

    # Initialize Accelerator object handling distributed training
    accelerator = Accelerator()

    # Login to HuggingFace
    if token is not None:
        login(token)

    # Load tokenizer. Padding side is "left" because focus needs to be on completion
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side = "left")

    # Set tokenizer's pad Token (reuse the EOS token for padding)
    tokenizer.pad_token = tokenizer.eos_token 
    tokenizer.pad_token_id = tokenizer.eos_token_id 

    # Load data from S3
    # NOTE(review): `s3` is not referenced again in this function
    s3 = s3fs.S3FileSystem()
    dataset = load_from_disk(train_ds)  
    
    
    # Allow for partial dataset training
    if range_train:
        train_dataset = dataset["train"].select(range(range_train))
    else: 
        train_dataset = dataset["train"]
  
    if range_eval:
        eval_dataset = dataset["test"].select(range(range_eval))
    else:
        eval_dataset = dataset["test"]

    # Specify quantization config (4-bit NF4 with double quantization, bfloat16 compute)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        quant_storage_dtype=torch.bfloat16
    )
    
    # Load model with classification head for reward
    # NOTE(review): num_labels is left at the library default because the
    # override below is commented out - confirm this is intended
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        #num_labels=1,
        trust_remote_code=True,
        quantization_config=bnb_config,
        #attn_implementation="flash_attention_2",
        use_cache=False if gradient_checkpointing else True,
        cache_dir="/tmp/.cache"
    )
    
    # Pre-LoRA trainable parameters
    print_trainable_parameters(model)     
    
    # Set model pad token id
    model.config.pad_token_id = tokenizer.pad_token_id
    
    # Prepare model for quantized training
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)

    if gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # Get lora target modules (names of the model's 4-bit linear layers)
    modules = find_all_linear_names(model)
    print(f"Found {len(modules)} modules to quantize: {modules}")
    
    # Specify LoRA config
    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="SEQ_CLS"
    )
    
    # Make sure to not train for CLM
    # NOTE(review): task_type is hard-coded to "SEQ_CLS" just above, so this check is a guard only
    if config.task_type != "SEQ_CLS":
        warnings.warn(
            "You are using a `task_type` that is different than `SEQ_CLS` for PEFT. This will lead to silent bugs"
            " Make sure to pass --lora_task_type SEQ_CLS when using this script."
        )
    
    # Create PeftModel
    model = get_peft_model(model, config)
    
    # Post-LoRA trainable parameters
    print_trainable_parameters(model)     
    
    # Specify training config (no intermediate checkpoints: save_strategy="no")
    reward_config = RewardConfig(
                        per_device_train_batch_size=per_device_train_batch_size,
                        per_device_eval_batch_size=per_device_eval_batch_size,
                        gradient_accumulation_steps=gradient_accumulation_steps,
                        gradient_checkpointing=gradient_checkpointing,
                        logging_strategy="steps",
                        logging_steps=100,
                        log_on_each_node=False,
                        num_train_epochs=num_train_epochs,
                        learning_rate=learning_rate,
                        bf16=True,
                        ddp_find_unused_parameters=False,
                        fsdp=fsdp,
                        fsdp_config=fsdp_config,
                        save_strategy="no",
                        output_dir="outputs",
                        max_length=512, 
                        remove_unused_columns=False,
                        gradient_checkpointing_kwargs = {"use_reentrant": False}
                        )
    
    # Initialize RewardTrainer object handling training
    trainer = RewardTrainer(
        model=model,
        tokenizer=tokenizer,
        args=reward_config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()

    
    # Save the trained model to the job's model directory
    trainer.model.save_pretrained("/opt/ml/model", safe_serialization=True)
    
    # Optionally publish the trained model to the Hugging Face Hub
    if model_hub_repo_id is not None:
        trainer.model.push_to_hub(repo_id=model_hub_repo_id)

    # Save the tokenizer next to the model
    with accelerator.main_process_first():
        tokenizer.save_pretrained("/opt/ml/model")

Define the Hugging Face repository ID for pushing the reward model adapter

In [None]:
rm_adapter_hub_repo_id = "aristsakpinisaws/llama-31-hhrlhf-reward-adapter"


In [None]:
# Start training job
# range_train / range_eval restrict the run to the first 100 training and
# first 10 test rows; remove them to train on the full dataset.
train_fn(
    model_id,
    train_ds=dataset_path_hhrlhf,  # Use S3 path instead of in-memory dataset
    #train_ds=tokenized_dataset_hhrlhf,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    num_train_epochs=1,
    token=hf_token,
    model_hub_repo_id=rm_adapter_hub_repo_id,
    range_train=100,
    range_eval=10
)

## Preference Alignment training with multi-adapter PPO 

In [None]:
import os
# Set path to config file for remote decorator
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

In [None]:
import boto3

def print_trainable_parameters(model):
    """
    Print the trainable parameter count, the total parameter count and the
    trainable percentage of the model.
    """
    # (element count, requires_grad) for every named parameter, collected in one pass
    sizes = [(weights.numel(), weights.requires_grad) for _, weights in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    percentage = 100 * trainable_params / all_param
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}")

# Start training with remote decorator (https://docs.aws.amazon.com/sagemaker/latest/dg/train-remote-decorator.html). Additional job config is being pulled in from config.yaml. 
@remote(keep_alive_period_in_seconds=3600, volume_size=100, job_name_prefix=f"train-{model_id.split('/')[-1].replace('.', '-')}-multi-adapter-ppo", use_torchrun=True, nproc_per_node=1)
def train_fn(
        model_name,
        train_ds,
        rm_adapter,
        s3_output_path,
        log_with=None,
        use_safetensors=None,
        use_score_scaling=False,
        use_score_norm=False,
        score_clip=None,
        seed=42,
        token=None,
        model_hub_repo_id=None,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        gradient_checkpointing=True,
        num_train_epochs=1,
        merge_weights=True,
        range_train=None,
        ):
    """
    Fine-tune a 4-bit quantized causal LM with PPO, scoring generations with a reward adapter.

    Args:
        model_name: Hub ID or path used to load the tokenizer and the base model.
        train_ds: Location passed to `load_from_disk`; its "train" split must
            provide "input_ids" and "query" columns.
        rm_adapter: Reward adapter handed to the model as `reward_adapter`.
        s3_output_path: S3 URI (s3://bucket/prefix) the saved model files are
            uploaded to; the upload is skipped when this is empty or
            SM_MODEL_DIR is not set.
        log_with, use_score_scaling, use_score_norm, score_clip: Forwarded to `PPOConfig`.
        use_safetensors: Forwarded to the model's `from_pretrained`.
        seed: Passed to `set_seed` and `PPOConfig`.
        token: Optional Hugging Face access token; `login` is called when set.
        model_hub_repo_id: Optional Hub repository the model and tokenizer are pushed to.
        per_device_train_batch_size: Used as the PPO `batch_size`.
        per_device_eval_batch_size: Accepted but not used.
        gradient_accumulation_steps: Forwarded to `PPOConfig`.
        gradient_checkpointing: Enables gradient checkpointing on the model.
        num_train_epochs: Accepted but not used; the dataloader is iterated once.
        merge_weights: When True the LoRA weights are merged into the base
            model before saving; otherwise the trainer's model is saved as is.
        range_train: If truthy, only the first `range_train` training rows are used.
    """

    set_seed(seed)

    # Initialize Accelerator object handling distributed training
    accelerator = Accelerator()

    # Login to HuggingFace
    if token is not None:
        login(token)

    # Load tokenizer. Padding side is "left" because focus needs to be on completion
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

    # Set tokenizer's pad token (reuse the EOS token for padding)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Load data from S3
    dataset = load_from_disk(train_ds)

    # Allow for partial dataset training
    if range_train:
        train_dataset = dataset["train"].select(range(range_train))
    else:
        train_dataset = dataset["train"]

    # Specify LoRA config
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Specify quantization config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Load model
    model = AutoModelForCausalLMWithValueHead.from_pretrained(
        model_name,
        #device_map='auto',
        peft_config=lora_config,
        quantization_config=bnb_config,
        reward_adapter=rm_adapter,
        use_safetensors=use_safetensors,
        #attn_implementation="flash_attention_2",
    )

    # Set model pad token id
    model.config.pad_token_id = tokenizer.pad_token_id

    if gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # Trainable parameters
    print_trainable_parameters(model)

    def collator(data):
        # Turn a list of row dicts into a dict of per-column lists
        return {key: [d[key] for d in data] for key in data[0]}

    # Specify PPO training config. The function arguments are forwarded here;
    # previously these values were hard-coded and the arguments had no effect.
    config = PPOConfig(
        model_name,
        log_with=log_with,
        learning_rate=1e-5,
        batch_size=per_device_train_batch_size,
        mini_batch_size=1,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optimize_cuda_cache=True,
        seed=seed,
        use_score_scaling=use_score_scaling,
        use_score_norm=use_score_norm,
        score_clip=score_clip,
    )

    # Initialize PPOTrainer object handling training
    ppo_trainer = PPOTrainer(
        config,
        model,
        ref_model=None,
        tokenizer=tokenizer,
        dataset=train_dataset,
        data_collator=collator,
    )

    # Specifying inference params
    generation_kwargs = {
        #"top_k": 0.0,
        "top_p": 0.9,
        "do_sample": True,
        "pad_token_id": tokenizer.pad_token_id,
        "max_new_tokens": 32,
    }

    for batch in tqdm(ppo_trainer.dataloader):

        question_tensors = batch["input_ids"]

        # Inference through model being fine-tuned
        response_tensors = ppo_trainer.generate(
            question_tensors,
            return_prompt=False,
            **generation_kwargs,
        )

        # Decode response
        batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

        # Concat query and response
        texts = [q + r for q, r in zip(batch["query"], batch["response"])]

        # Tokenize query - response pair
        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(ppo_trainer.accelerator.device)

        # Compute reward score
        raw_rewards = ppo_trainer.accelerator.unwrap_model(ppo_trainer.model).compute_reward_score(**inputs)
        rewards = [sample_scores[-1, 1] for sample_scores in raw_rewards]  # take last token

        # Run PPO step
        stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

    # Directory the final artifacts are written to
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

    if merge_weights:

        if accelerator.is_main_process:

            output_dir = "/tmp/model"

            ppo_trainer.save_pretrained(output_dir, safe_serialization=True)

            # clear memory
            del model
            del ppo_trainer

            torch.cuda.empty_cache()

            # load PEFT model
            model = AutoPeftModelForCausalLM.from_pretrained(
                output_dir,
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                use_cache=True,
                cache_dir="/tmp/.cache",
            )

            # Merge LoRA and base model and save
            model = model.merge_and_unload()
            model.save_pretrained(
                model_dir,
                safe_serialization=True,
                max_shard_size="2GB"
            )
            if model_hub_repo_id is not None:
                model.push_to_hub(repo_id=model_hub_repo_id)

            tokenizer.save_pretrained(model_dir)

            if model_hub_repo_id is not None:
                tokenizer.push_to_hub(repo_id=model_hub_repo_id)

    else:
        if accelerator.is_main_process:

            # unwrap_model works whether or not the model was wrapped for
            # distributed training; `.module` only exists on a wrapped model
            ppo_trainer.accelerator.unwrap_model(ppo_trainer.model).save_pretrained(
                model_dir,
                safe_serialization=True
            )

            if model_hub_repo_id is not None:
                ppo_trainer.push_to_hub(repo_id=model_hub_repo_id)

            tokenizer.save_pretrained(model_dir)

            if model_hub_repo_id is not None:
                tokenizer.push_to_hub(repo_id=model_hub_repo_id)

    accelerator.wait_for_everyone()

    if accelerator.is_main_process:
        # Upload the model files to S3 (only when SM_MODEL_DIR is set and a target URI was given)
        if os.environ.get("SM_MODEL_DIR") and s3_output_path:

            print(f"Uploading model from {model_dir} to {s3_output_path}")

            # Initialize S3 client
            s3_client = boto3.client('s3')

            # Extract bucket name and prefix from S3 URI
            s3_uri_parts = s3_output_path.replace("s3://", "").split("/")
            bucket_name = s3_uri_parts[0]
            prefix = "/".join(s3_uri_parts[1:]) if len(s3_uri_parts) > 1 else ""

            # Walk through all files in the model directory and upload them
            failed_uploads = 0
            for root, dirs, files in os.walk(model_dir):
                for file in files:
                    local_path = os.path.join(root, file)
                    # Create relative path to maintain directory structure
                    relative_path = os.path.relpath(local_path, model_dir)
                    s3_key = os.path.join(prefix, relative_path)

                    print(f"Uploading {local_path} to s3://{bucket_name}/{s3_key}")
                    try:
                        s3_client.upload_file(local_path, bucket_name, s3_key)
                    except Exception as e:
                        # Best effort: keep uploading the remaining files
                        failed_uploads += 1
                        print(f"Failed to upload {local_path} to S3: {e}")

            if failed_uploads:
                print(f"Model upload to S3 finished with {failed_uploads} failed file(s)")
            else:
                print("Model upload to S3 completed")

    # Wait for all processes to complete
    accelerator.wait_for_everyone()

Let's point to the adapter previously created and pushed to the Hugging Face Model Hub

Define the Hugging Face repository ID for pushing the model

In [None]:
#model_hub_repo_id = "aristsakpinisaws/llama-31-hhrlhf-squad-rlhf-adapter"
model_hub_repo_id = "aristsakpinisaws/llama-31-hhrlhf-squad-rlhf-policy-model"

In [None]:
# Confirm a token is set without echoing the secret into the notebook output
print(f"Hugging Face token configured: {bool(hf_token)}")

In [None]:
# Start the PPO training job; the reward adapter is referenced by its Hub repository ID
train_fn(
    model_id,
    train_ds=dataset_path_squad,  # Use S3 path instead of in-memory dataset
    #train_ds=tokenized_dataset_squad,
    s3_output_path=s3_output_path,
    rm_adapter=rm_adapter_hub_repo_id,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    num_train_epochs=1,
    token=hf_token,
    model_hub_repo_id=model_hub_repo_id,
    range_train=50,
    merge_weights=True
)

# Import fine-tuned model to Amazon Bedrock through Custom Model Import (CMI)

In [None]:
from botocore.exceptions import ClientError

# This cell creates (or refreshes) the IAM role that Amazon Bedrock assumes for
# Custom Model Import, together with a customer-managed permissions policy.
# On success `role_arn` holds the ARN of the role; on any failure the error is
# printed and `role_arn` is set to None.

# Initialize IAM client
iam = boto3.client('iam')

# Role name - consider making this unique if you create multiple roles
role_name = "BedrockCustomModelImportRole"

# Define trust policy to allow Bedrock to assume this role
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "bedrock.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# Define permissions policy for Bedrock model import
# NOTE(review): the S3 and KMS statements apply to every bucket/key in the
# account; consider scoping them to the bucket that holds the model artifacts.
bedrock_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:GetObject",
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::*/*",
                "arn:aws:s3:::*"
            ]
        },
        {
            "Effect": "Allow",
            "Action": [
                "bedrock:CreateModelCustomizationJob",
                "bedrock:CreateModelImportJob",
                "bedrock:GetModelCustomizationJob",
                "bedrock:GetModelImportJob",
                "bedrock:StopModelCustomizationJob",
                "bedrock:StopModelImportJob"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "kms:Decrypt",
                "kms:GenerateDataKey"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "cloudwatch:PutMetricData"
            ],
            "Resource": "*",
            "Condition": {
                "StringEquals": {
                    "cloudwatch:namespace": "AWS/Bedrock"
                }
            }
        },
        {
            "Effect": "Allow",
            "Action": [
                "logs:CreateLogGroup",
                "logs:CreateLogStream",
                "logs:PutLogEvents"
            ],
            "Resource": "arn:aws:logs:*:*:log-group:/aws/bedrock/*"
        }
    ]
}

# Create or update the role
try:
    # Check if role exists
    try:
        iam.get_role(RoleName=role_name)
        print(f"Role {role_name} already exists. Updating policies...")
        # Detach every managed policy currently attached so we start from a clean slate
        for policy in iam.list_attached_role_policies(RoleName=role_name)['AttachedPolicies']:
            iam.detach_role_policy(RoleName=role_name, PolicyArn=policy['PolicyArn'])
    except ClientError as e:
        if e.response['Error']['Code'] != 'NoSuchEntity':
            # Anything other than "role not found" is unexpected; keep the original traceback
            raise
        # Create role with trust policy if it doesn't exist
        print(f"Creating new role: {role_name}")
        iam.create_role(
            RoleName=role_name,
            AssumeRolePolicyDocument=json.dumps(trust_policy),
            Description="Role for Amazon Bedrock Model Import operations"
        )

    # Customer-managed policy for Bedrock operations; its ARN is derived from
    # the caller's account id, which is resolved once via STS.
    policy_name = f"{role_name}Policy"
    account_id = boto3.client('sts').get_caller_identity()['Account']
    expected_policy_arn = f"arn:aws:iam::{account_id}:policy/{policy_name}"

    # Update the policy in place if it exists, otherwise create it
    try:
        existing_policy = iam.get_policy(PolicyArn=expected_policy_arn)
        policy_arn = existing_policy['Policy']['Arn']

        # Detach policy if attached to our role
        try:
            iam.detach_role_policy(
                RoleName=role_name,
                PolicyArn=policy_arn
            )
        except ClientError:
            pass  # Policy may not be attached

        # Delete existing versions (except default) so that creating a new
        # version below does not accumulate old ones
        policy_versions = iam.list_policy_versions(PolicyArn=policy_arn)['Versions']
        for version in policy_versions:
            if not version['IsDefaultVersion']:
                iam.delete_policy_version(
                    PolicyArn=policy_arn,
                    VersionId=version['VersionId']
                )

        # Create new version and set as default
        iam.create_policy_version(
            PolicyArn=policy_arn,
            PolicyDocument=json.dumps(bedrock_policy_document),
            SetAsDefault=True
        )

    except ClientError as e:
        if e.response['Error']['Code'] != 'NoSuchEntity':
            raise
        # Create new policy
        response = iam.create_policy(
            PolicyName=policy_name,
            PolicyDocument=json.dumps(bedrock_policy_document),
            Description="Permissions for Amazon Bedrock Model Import operations"
        )
        policy_arn = response['Policy']['Arn']

    # Attach policy to role
    iam.attach_role_policy(
        RoleName=role_name,
        PolicyArn=policy_arn
    )

    # Wait for policy to propagate
    print("Waiting for IAM role and policies to propagate...")
    time.sleep(10)  # IAM changes can take a few seconds to propagate

    # Get the role ARN
    role = iam.get_role(RoleName=role_name)
    role_arn = role['Role']['Arn']

    print(f"Successfully configured role: {role_arn}")

except Exception as e:
    # Best-effort setup: report the problem and leave role_arn unset
    print(f"Error setting up IAM role: {str(e)}")
    role_arn = None  # Set to None if there was an error

In [None]:
from sagemaker.session import Session

# Initialize the Bedrock client in the region of the current SageMaker session
bedrock = boto3.client('bedrock', region_name=Session().boto_region_name)

# Use the role configured by the IAM setup cell instead of a hard-coded,
# account-specific ARN. That cell sets role_arn to None when setup fails, so
# stop here with a clear message rather than submitting an invalid request.
if role_arn is None:
    raise RuntimeError(
        "role_arn is not set: the IAM role setup did not complete successfully. "
        "Fix the IAM setup (or assign role_arn to an existing role) and re-run this cell."
    )

# Define name for imported model (timestamp suffix keeps job/model names unique per run)
imported_model_name = f'llama-31-hhrlhf-squad-rlhf-policy-model-{int(time.time())}'

# Create the model import job
response = bedrock.create_model_import_job(
    jobName=imported_model_name,
    importedModelName=imported_model_name,
    roleArn=role_arn,
    modelDataSource={
        's3DataSource': {
            's3Uri': s3_output_path
        }
    }
)

# Kept under this exact name because the status-polling cell reads `job_Arn`
job_Arn = response['jobArn']

# Output the job ARN
print(f"Model import job created with ARN: {job_Arn}")

In [None]:
# Poll the CMI job until it reaches a terminal state
while True:
    response = bedrock.get_model_import_job(jobIdentifier=job_Arn)
    status = response['status'].upper()
    print(f"Status: {status}")

    if status in ['COMPLETED', 'FAILED']:
        break

    time.sleep(60)  # Check every 60 seconds

# A failed job has no usable model: surface the reason instead of continuing
if status == 'FAILED':
    failure_reason = response.get('failureMessage', 'no failure message returned')
    raise RuntimeError(f"Model import job {job_Arn} failed: {failure_reason}")

# Get the model ID (ARN of the imported model, used for inference below)
model_arn = response['importedModelArn']


# Serverless model inference with imported model

In [None]:
from transformers import AutoTokenizer
import json
import boto3
from botocore.config import Config
from IPython.display import Markdown, display

# Initialize the tokenizer (used to render chat messages into a prompt string)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Initialize Bedrock Runtime client.
# The imported model must be invoked in the region it was imported into, so
# reuse the region of the `bedrock` client that created the import job rather
# than a hard-coded region.
session = boto3.Session()
client = session.client(
    service_name='bedrock-runtime',
    region_name=bedrock.meta.region_name,
    config=Config(
        connect_timeout=300,  # 5 minutes
        read_timeout=300,     # 5 minutes
        retries={'max_attempts': 3}
    )
)

In [None]:
def generate(messages, temperature=0.3, max_tokens=4096, top_p=0.9, continuation=False, max_retries=10, retry_delay=30):
    """
    Generate a response from the imported model, retrying on failure.

    The messages are rendered into a single prompt string with the tokenizer's
    chat template and sent to the model through the Bedrock Runtime client.

    Parameters:
        messages (list): List of message dictionaries with 'role' and 'content'
        temperature (float): Sampling temperature forwarded to the model
        max_tokens (int): Maximum number of tokens to generate (sent as 'max_gen_len')
        top_p (float): Nucleus sampling parameter forwarded to the model
        continuation (bool): When True, no generation prompt is appended to the
            rendered chat template
        max_retries (int): Maximum number of invocation attempts
        retry_delay (float): Seconds to wait between failed attempts

    Returns:
        dict: Parsed JSON body of the model response

    Raises:
        Exception: If no attempt succeeds; the last underlying error is
            attached as the cause.
    """
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=not continuation,
    )

    # The request payload does not change between attempts, so build it once
    request_body = json.dumps({
        'prompt': prompt,
        'temperature': temperature,
        'max_gen_len': max_tokens,
        'top_p': top_p
    })

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = client.invoke_model(
                modelId=model_arn,
                body=request_body,
                accept='application/json',
                contentType='application/json'
            )
            return json.loads(response['body'].read().decode('utf-8'))
        except Exception as e:
            # Deliberately broad: any failure of an attempt triggers a retry
            last_error = e
            print(f"Attempt {attempt} failed: {str(e)}")
            if attempt < max_retries:
                time.sleep(retry_delay)

    # Chain the last error so the root cause is visible in the traceback
    raise Exception("Failed to get response after maximum retries") from last_error

In [None]:
# Quick smoke test: send a single user turn to the imported model and print
# the text found under the "generation" key of the response.
test_prompt = "Hi, how are you?\n"

messages = [dict(role="user", content=test_prompt)]
response = generate(messages)
print("Model Response:", response["generation"], sep="\n")

# Cleanup