# Prep work

Before starting the training, we need to create model repositories on the HuggingFace model hub for both our reward model adapters and our final RLHF model adapters.

Also, in order to use the Llama 3.1 8B Instruct model, we need to accept the model's license terms on the HuggingFace model hub.

To authenticate against the HuggingFace model hub we need to create an access token, which we will use later in the notebook.

In [None]:
# Define model to be fine-tuned
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Install dependencies

In [None]:
%pip install -U torch==2.2.0+cu118 --index-url https://download.pytorch.org/whl/cu118

In [None]:
# Core training libraries, pinned to fixed releases.
# NOTE(review): the model defined at the top of the notebook is Llama 3.1; confirm that
# the pinned transformers release can load its configuration.
%pip install -q transformers==4.41.0
%pip install -Uq bitsandbytes==0.43.0
%pip install -Uq peft==0.8.1
%pip install -Uq datasets==2.18.0 
%pip install -Uq tensorboardX==2.6.2.2
%pip install -Uq py7zr==0.21.0
%pip install -Uq einops==0.7.0
%pip install -q accelerate==0.27.2
# NOTE(review): huggingface_hub and trl are installed without a version pin, unlike the
# packages above, so the installed release depends on when this cell is run. The training
# cells rely on trl's PPOConfig / PPOTrainer / AutoModelForCausalLMWithValueHead
# interface; consider pinning a release that is known to work with this notebook.
%pip install huggingface_hub
%pip install trl

In [None]:
import bitsandbytes as bnb
import multiprocessing
import sys
import functools
import json
import torch
import transformers
import warnings
from dataclasses import dataclass, field
from typing import Optional
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
from trl import ModelConfig, RewardConfig, PPOConfig, PPOTrainer, RewardTrainer, AutoModelForCausalLMWithValueHead, get_kbit_device_map, get_peft_config, get_quantization_config
from trl.core import LengthSampler
from accelerate import Accelerator
from peft import AutoPeftModelForCausalLM, AutoPeftModelForSequenceClassification, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sagemaker.remote_function import remote
from tqdm import tqdm

In [None]:
%pip install s3fs

In [None]:
import s3fs

# Data preparation

## Reward model training dataset

Dataset used: Anthropic HH-RLHF (helpful) - https://huggingface.co/datasets/Anthropic/hh-rlhf

Target format: 
```json
DatasetDict({
    train: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: _
    })
    test: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: _
    })
})
```

In [None]:
# Login to huggingface
hf_token = "***HF_TOKEN***"
login(hf_token)

In [None]:
# Load dataset
ds = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-base")
ds

In [None]:
ds['train'][67]

In [None]:
def extract_dialogue(input_text):
    """Parse an HH-RLHF style transcript into a list of chat messages.

    The transcript is split on blank lines. A chunk starting with "Human:"
    opens a "user" message, a chunk starting with "Assistant:" opens an
    "assistant" message, and any other chunk is a continuation of the
    previous message (re-attached with a blank line in between).

    Only the leading speaker tag is removed, so a literal "Human: " or
    "Assistant: " occurring inside the message text is preserved.

    Args:
        input_text: Raw transcript string.

    Returns:
        A list of {"role": ..., "content": ...} dicts in transcript order.
        An empty or whitespace-only transcript yields an empty list.

    Raises:
        ValueError: If non-empty text appears before the first speaker tag.
    """
    dialogue_list = []

    for chunk in input_text.strip().split("\n\n"):
        if chunk.startswith("Human:"):
            tag, role = "Human:", "user"
        elif chunk.startswith("Assistant:"):
            tag, role = "Assistant:", "assistant"
        else:
            # No speaker tag: this chunk belongs to the previous message.
            text = chunk.strip()
            if not dialogue_list:
                if not text:
                    # Empty transcript (or leading blank chunk): nothing to attach.
                    continue
                raise ValueError(
                    "Transcript must start with 'Human:' or 'Assistant:', got: "
                    f"{text[:40]!r}"
                )
            dialogue_list[-1]["content"] += "\n\n" + text
            continue

        # Slice off the leading tag only; str.replace would also delete any
        # later occurrence of the tag inside the message text.
        dialogue_list.append({"role": role, "content": chunk[len(tag):].strip()})

    return dialogue_list

def process(row):
    """Convert one raw HH-RLHF row into structured chat messages.

    The "chosen" and "rejected" transcripts are replaced in place by the
    message lists produced by extract_dialogue, and a "prompt" entry is
    added holding the content of the first message of the chosen dialogue.
    The (mutated) row is returned.
    """
    chosen_messages = extract_dialogue(row["chosen"])
    row["chosen"] = chosen_messages
    row["rejected"] = extract_dialogue(row["rejected"])
    # The prompt is the opening turn of the chosen conversation.
    row["prompt"] = chosen_messages[0]["content"]
    return row

In [None]:
ds_processed = ds.map(
        process,
        load_from_cache_file=False,
    )
ds_processed

In [None]:
ds_processed['train'][67]

In [None]:
# Adjusting to llama prompt template format: https://github.com/meta-llama/llama-recipes
system_prompt = "Please answer the user's question to the best of your knowledge. If you don't know the answer respond that you don't know."

def encode_dialogue_turn(message):
    """Render one chat message as a header / content / end-of-turn segment.

    `message` is a mapping with "role" and "content" entries; a missing
    entry is rendered as the text "None".

    NOTE(review): Meta's published Llama 3 prompt format places a blank line
    after <|end_header_id|>; none is emitted here. Confirm this is intended.
    """
    role = message.get("role")
    content = message.get("content")
    return f"<|start_header_id|>{role}<|end_header_id|>{content}<|eot_id|>"

def encode_dialogue(dialogue):
    """Serialise a list of chat messages into a single prompt string.

    The result starts with <|begin_of_text|>. When the module-level
    `system_prompt` is non-empty, a system segment carrying it comes next.
    The encoded turns (see encode_dialogue_turn) follow in order.
    """
    turns = "".join(encode_dialogue_turn(turn) for turn in dialogue)
    if not system_prompt:
        return f'<|begin_of_text|>{turns}'
    return f'<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|>{turns}'


def encode_row(item):
    """Encode the "chosen" and "rejected" dialogues of a row as prompt strings.

    Returns a new dict with the encoded "chosen" and "rejected" strings and
    the row's "prompt" value copied through unchanged.
    """
    encoded = {key: encode_dialogue(item[key]) for key in ("chosen", "rejected")}
    encoded["prompt"] = item["prompt"]
    return encoded
                                      
def encode_dataset(dataset):
    """Apply encode_row to every row of `dataset` and return the results as a list."""
    return [encode_row(row) for row in dataset]

In [None]:
encoded_dataset = ds_processed.map(encode_row)
encoded_dataset

In [None]:
encoded_dataset['train'][67]

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Tokenize and stack into target format
def preprocess_function(examples):
    """Tokenize a batch of chosen/rejected text pairs.

    `examples` is a batch mapping with parallel "chosen" and "rejected"
    sequences of strings. Each string is run through the module-level
    `tokenizer`; the resulting ids and attention masks are collected into
    four parallel lists.

    NOTE(review): the tokenizer is called with its default special-token
    handling. The texts presumably already begin with <|begin_of_text|>, so
    check whether this produces a duplicated beginning-of-text token.
    """
    encoded_pairs = [
        (tokenizer(chosen_text), tokenizer(rejected_text))
        for chosen_text, rejected_text in zip(examples["chosen"], examples["rejected"])
    ]

    return {
        "input_ids_chosen": [chosen["input_ids"] for chosen, _ in encoded_pairs],
        "attention_mask_chosen": [chosen["attention_mask"] for chosen, _ in encoded_pairs],
        "input_ids_rejected": [rejected["input_ids"] for _, rejected in encoded_pairs],
        "attention_mask_rejected": [rejected["attention_mask"] for _, rejected in encoded_pairs],
    }

In [None]:
tokenized_dataset_hhrlhf = encoded_dataset.map(
        preprocess_function,
        batched=True,
    ).remove_columns(["chosen", "rejected", "prompt"])
tokenized_dataset_hhrlhf

In [6]:
import datetime
now = datetime.datetime.now()
hhrlhf_bucket = f"hhrlhf-{now.strftime('%Y%m%d-%H%M%S')}"

In [None]:
# Save dataset to s3
dataset_path_hhrlhf = f's3://{hhrlhf_bucket}/experiments-hhrlhf/helpful-base-train-test-tokenized-llama318binstruct'
tokenized_dataset_hhrlhf.save_to_disk(dataset_path_hhrlhf)

print(f"Uploaded dataset to: {dataset_path_hhrlhf}")

## PPO training dataset

Dataset used: Stanford Question&Answering Dataset (SQuAD) - https://rajpurkar.github.io/SQuAD-explorer/

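Target format: 
```json
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'query'],
        num_rows: _
    })
    test: Dataset({
        features: ['input_ids', 'query'],
        num_rows: _
    })
})
```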

In [None]:
# Download SQuAD dataset (v2.0 train and dev files) into the working directory
# NOTE(review): --no-check-certificate turns off TLS certificate verification for these
# downloads; confirm it is really required in the target environment.
!wget --no-check-certificate https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
!wget --no-check-certificate https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

In [None]:
# Load files
with open('./train-v2.0.json') as f:
    d_train = json.load(f)
with open('./dev-v2.0.json') as f:
    d_test = json.load(f)

In [None]:
def extract_questions(dataset):
    """Turn SQuAD topics into two-message chat prompts, one per question.

    `dataset` is an iterable of topics, each with a "paragraphs" list whose
    items hold a "qas" list of entries carrying a "question". For every
    question a fresh [system message, user message] pair is produced, in
    topic / paragraph / question order.
    """
    instruction = (
        "Instruction: Please answer the user's question to the best of your knowledge. "
        "If you don't know the answer respond that you don't know."
    )

    def _iter_questions():
        # Flatten topics -> paragraphs -> question/answer entries.
        for topic in dataset:
            for paragraph in topic['paragraphs']:
                for qa in paragraph['qas']:
                    yield qa['question']

    return [
        [
            {"role": "system", "content": instruction},
            {"role": "user", "content": question},
        ]
        for question in _iter_questions()
    ]

# Adjusting to llama prompt template format: https://github.com/meta-llama/llama-recipes
def encode_dialogue_turn(message):
    """Encode a single {"role", "content"} message as one prompt segment.

    The segment is the role wrapped in header tokens, followed directly by
    the content and the end-of-turn token. Missing keys render as "None".
    """
    header = f'<|start_header_id|>{message.get("role")}<|end_header_id|>'
    body = f'{message.get("content")}<|eot_id|>'
    return header + body

def encode_dialogue(dialogue):
    """Encode a list of chat messages as {"input": <prompt string>}.

    The prompt string is <|begin_of_text|> followed by every message encoded
    with encode_dialogue_turn, in order.
    """
    encoded_turns = "".join(map(encode_dialogue_turn, dialogue))
    return {'input': '<|begin_of_text|>' + encoded_turns}

                                      
def encode_dataset(dataset):
    """Encode every dialogue in `dataset` with encode_dialogue and return a list."""
    return [encode_dialogue(dialogue) for dialogue in dataset]

In [None]:
encoded_train = encode_dataset(extract_questions(d_train['data']))
encoded_test = encode_dataset(extract_questions(d_test['data']))

In [None]:
encoded_train[0]

In [None]:
# Create DatasetDict
dataset_dict = DatasetDict({
    "train": Dataset.from_list(encoded_train),
    "test": Dataset.from_list(encoded_test)
})
dataset_dict

In [None]:
# Restrict training context size (due to memory limitations, can be adjusted)
input_min_text_length = 1
input_max_text_length = 2048

def create_and_prepare_dataset(tokenizer, dataset):
    """Tokenize the "input" column and attach truncated ids plus their text.

    For every example a length is drawn from a LengthSampler built from the
    module-level input_min_text_length / input_max_text_length. The encoded
    "input" is cut to that many tokens and stored as "input_ids"; "query"
    holds those ids decoded back to text. The mapped dataset is switched to
    the "torch" format and returned.
    """
    sample_length = LengthSampler(input_min_text_length, input_max_text_length)

    def _encode_example(example):
        max_tokens = sample_length()
        token_ids = tokenizer.encode(example["input"])[:max_tokens]
        example["input_ids"] = token_ids
        example["query"] = tokenizer.decode(example["input_ids"])
        return example

    prepared = dataset.map(_encode_example, batched=False)
    prepared.set_format("torch")
    return prepared


tokenized_dataset_squad = create_and_prepare_dataset(tokenizer, dataset_dict).remove_columns(["input"])
tokenized_dataset_squad

In [None]:
tokenized_dataset_squad['train'][0]

In [None]:
# Save dataset to s3
s3_bucket = "***S3_BUCKET_NAME***"

dataset_path_squad = f's3://{s3_bucket}/experiments-squad/train-test-contextwindow-padding-2048'
tokenized_dataset_squad.save_to_disk(dataset_path_squad)

print(f"Uploaded dataset to: {dataset_path_squad}")

# Training

In [None]:
import os
# Set path to config file for remote decorator
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

## Reward model training 

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks model.named_parameters(), sums element counts overall and for
    parameters with requires_grad set, and prints both totals together with
    the trainable percentage.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )
    
def find_all_linear_names(hf_model):
    """Collect the leaf names of all bitsandbytes 4-bit linear modules.

    Every module of type bnb.nn.Linear4bit contributes the last component of
    its dotted name. "lm_head" is excluded from the result. The names are
    returned as a list built from a set, so they are unique and unordered.
    """
    leaf_names = {
        qualified_name.rsplit(".", 1)[-1]
        for qualified_name, module in hf_model.named_modules()
        if isinstance(module, bnb.nn.Linear4bit)
    }

    # needed for 16-bit
    leaf_names.discard("lm_head")
    return list(leaf_names)

# Start training with remote decorator (https://docs.aws.amazon.com/sagemaker/latest/dg/train-remote-decorator.html). Additional job config is being pulled in from config.yaml. 
@remote(keep_alive_period_in_seconds=0, volume_size=100, job_name_prefix=f"train-{model_id.split('/')[-1].replace('.', '-')}-reward", use_torchrun=True, nproc_per_node=4)
def train_fn(
        model_name,
        train_ds,
        test_ds=None,
        lora_r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=1,
        learning_rate=2e-4,
        num_train_epochs=1,
        fsdp="",
        fsdp_config=None,
        chunk_size=10000,
        gradient_checkpointing=False,
        merge_weights=False,
        seed=42,
        token=None,
        model_hub_repo_id=None,
        range_train=None,
        range_eval=None
):
    """Train LoRA reward-model adapters on a pre-tokenized preference dataset.

    Submitted through the ``@remote`` decorator above (torchrun, 4 processes
    per node). The function seeds the run, optionally logs in to the Hugging
    Face Hub, loads tokenizer and dataset, loads the base model 4-bit
    quantized with a sequence-classification head, wraps it with LoRA
    adapters, trains it with trl's RewardTrainer, writes model and tokenizer
    to /opt/ml/model and optionally pushes the model to the Hub.

    Args:
        model_name: Model id/path given to the tokenizer and model
            ``from_pretrained`` calls.
        train_ds: Location given to ``load_from_disk``; the loaded object is
            indexed with "train" and "test".
        test_ds: Not used by this function.
        lora_r, lora_alpha, lora_dropout: Forwarded to ``LoraConfig``.
        per_device_train_batch_size, per_device_eval_batch_size,
        gradient_accumulation_steps, learning_rate, num_train_epochs, fsdp,
        fsdp_config: Forwarded to ``RewardConfig``.
        chunk_size: Not used by this function.
        gradient_checkpointing: Enables gradient checkpointing on the model
            and turns ``use_cache`` off when true.
        merge_weights: Not used by this function.
        seed: Passed to ``set_seed``.
        token: Hugging Face access token; login is skipped when ``None``.
        model_hub_repo_id: When not ``None``, the trained model is pushed to
            this Hub repository.
        range_train: When truthy, train on the first ``range_train`` rows only.
        range_eval: When truthy, evaluate on the first ``range_eval`` rows only.
    """

    set_seed(seed)

    # Initialize Accelerator object handling distributed training
    accelerator = Accelerator()

    # Login to HuggingFace
    if token is not None:
        login(token=token)

    # Load tokenizer. Padding side is "left" because focus needs to be on completion
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side = "left")

    # Set tokenizer's pad Token (the EOS token is reused for padding)
    tokenizer.pad_token = tokenizer.eos_token 
    tokenizer.pad_token_id = tokenizer.eos_token_id 

    # Load data from S3
    # NOTE(review): `s3` is never referenced below; load_from_disk receives the
    # location string directly. Confirm whether this instantiation is needed.
    s3 = s3fs.S3FileSystem()
    dataset = load_from_disk(train_ds)  
    
    
    # Allow for partial dataset training
    if range_train:
        train_dataset = dataset["train"].select(range(range_train))
    else: 
        train_dataset = dataset["train"]
  
    if range_eval:
        eval_dataset = dataset["test"].select(range(range_eval))
    else:
        eval_dataset = dataset["test"]

    # Specify quantization config
    # NOTE(review): transformers documents the storage-dtype option of
    # BitsAndBytesConfig as `bnb_4bit_quant_storage`; confirm that
    # `quant_storage_dtype` is actually honoured and not silently dropped.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        quant_storage_dtype=torch.bfloat16
    )
    
    # Load model with classification head for reward
    # `num_labels` is left at the library default (the override is commented out).
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        #num_labels=1,
        trust_remote_code=True,
        quantization_config=bnb_config,
        attn_implementation="flash_attention_2",
        use_cache=False if gradient_checkpointing else True,
        cache_dir="/tmp/.cache"
    )
    
    # Pre-LoRA trainable parameters
    print_trainable_parameters(model)     
    
    # Set model pad token id
    model.config.pad_token_id = tokenizer.pad_token_id
    
    # Prepare model for quantized training
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)

    if gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # Get lora target modules (leaf names of all 4-bit linear layers).
    # The log message below says "quantize", but the names are used as LoRA target modules.
    modules = find_all_linear_names(model)
    print(f"Found {len(modules)} modules to quantize: {modules}")
    
    # Specify LoRA config
    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="SEQ_CLS"
    )
    
    # Make sure to not train for CLM
    # (task_type is hard-coded to "SEQ_CLS" just above, so this warning cannot trigger here)
    if config.task_type != "SEQ_CLS":
        warnings.warn(
            "You are using a `task_type` that is different than `SEQ_CLS` for PEFT. This will lead to silent bugs"
            " Make sure to pass --lora_task_type SEQ_CLS when using this script."
        )
    
    # Create PeftModel
    model = get_peft_model(model, config)
    
    # Post-LoRA trainable parameters
    print_trainable_parameters(model)     
    
    # Specify training config
    # save_strategy="no": no intermediate checkpoints; the model is saved once after training.
    reward_config = RewardConfig(
                        per_device_train_batch_size=per_device_train_batch_size,
                        per_device_eval_batch_size=per_device_eval_batch_size,
                        gradient_accumulation_steps=gradient_accumulation_steps,
                        gradient_checkpointing=gradient_checkpointing,
                        logging_strategy="steps",
                        logging_steps=100,
                        log_on_each_node=False,
                        num_train_epochs=num_train_epochs,
                        learning_rate=learning_rate,
                        bf16=True,
                        ddp_find_unused_parameters=False,
                        fsdp=fsdp,
                        fsdp_config=fsdp_config,
                        save_strategy="no",
                        output_dir="outputs",
                        max_length=512, 
                        remove_unused_columns=False,
                        gradient_checkpointing_kwargs = {"use_reentrant": False}
                        )
    
    # Initialize RewardTrainer object handling training
    trainer = RewardTrainer(
        model=model,
        tokenizer=tokenizer,
        args=reward_config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()

    
    # NOTE(review): save and push are not guarded by a main-process check, so every
    # process started by torchrun executes them. Confirm this is intended.
    trainer.model.save_pretrained("/opt/ml/model", safe_serialization=True)
    
    if model_hub_repo_id is not None:
        trainer.model.push_to_hub(repo_id=model_hub_repo_id)

    # NOTE(review): main_process_first() presumably only orders execution (main process
    # first, the others afterwards) rather than restricting it to one process; verify.
    with accelerator.main_process_first():
        tokenizer.save_pretrained("/opt/ml/model")

Define the Hugging Face repository ID for pushing the model

In [None]:
model_hub_repo_id = "***HF_REPO_ID***"

In [None]:
# Start training job
train_fn(
    model_id,
    train_ds=dataset_path_hhrlhf,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    num_train_epochs=1,
    token=hf_token,
    model_hub_repo_id=model_hub_repo_id,
    range_train=100,
    range_eval=10
)

## Preference Alignment training with multi-adapter PPO 

In [None]:
def print_trainable_parameters(model):
    """
    Report trainable versus total parameter counts of `model`.

    Element counts from model.named_parameters() are bucketed by their
    requires_grad flag; the trainable count, the overall count and the
    trainable percentage are printed on one line.
    """
    counts = {True: 0, False: 0}
    for _, tensor in model.named_parameters():
        counts[bool(tensor.requires_grad)] += tensor.numel()

    trainable_params = counts[True]
    all_param = counts[True] + counts[False]
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# Start training with remote decorator (https://docs.aws.amazon.com/sagemaker/latest/dg/train-remote-decorator.html). Additional job config is being pulled in from config.yaml. 
@remote(keep_alive_period_in_seconds=0, volume_size=100, job_name_prefix=f"train-{model_id.split('/')[-1].replace('.', '-')}-multi-adapter-ppo", use_torchrun=True, nproc_per_node=4)
def train_fn(
        model_name,
        train_ds,
        rm_adapter,
        log_with=None,
        use_safetensors=None,
        use_score_scaling=False,
        use_score_norm=False,
        score_clip=None,
        seed=42,
        token=None,
        model_hub_repo_id=None,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        gradient_checkpointing=True,
        num_train_epochs=1,
        merge_weights=True,
        range_train=None,
        ):
    """Run multi-adapter PPO: one quantized base model with policy and reward adapters.

    Submitted through the ``@remote`` decorator above (torchrun, 4 processes
    per node). The base model is loaded 4-bit quantized with a value head, a
    fresh LoRA policy adapter and the reward adapter ``rm_adapter``. For each
    batch the policy generates responses, the reward adapter scores the
    query + response texts, and a PPO step is taken. Afterwards the main
    process saves (and optionally pushes) the trained model; the tokenizer
    is written to /opt/ml/model.

    Args:
        model_name: Model id/path for the tokenizer and model
            ``from_pretrained`` calls.
        train_ds: Location given to ``load_from_disk``; its "train" split
            must provide "input_ids" and "query".
        rm_adapter: Reward-model adapter passed as ``reward_adapter``.
        log_with, use_score_scaling, use_score_norm, score_clip: Forwarded
            to ``PPOConfig``.
        use_safetensors: Forwarded to the model ``from_pretrained`` call.
        seed: Used for ``set_seed`` and for ``PPOConfig``.
        token: Hugging Face access token; login is skipped when ``None``.
        model_hub_repo_id: When not ``None``, trainer and tokenizer are
            pushed to this Hub repository from the main process.
        per_device_train_batch_size: Used as the PPO ``batch_size``.
        per_device_eval_batch_size: Accepted for signature compatibility;
            not used.
        gradient_accumulation_steps: Forwarded to ``PPOConfig``.
        gradient_checkpointing: Enables gradient checkpointing when true.
        num_train_epochs: Number of passes over the PPO dataloader.
        merge_weights: Accepted for signature compatibility; not used.
        range_train: When truthy, train on the first ``range_train`` rows only.
    """

    set_seed(seed)

    # Initialize Accelerator object handling distributed training
    accelerator = Accelerator()

    # Login to HuggingFace
    if token is not None:
        login(token=token)

    # Load tokenizer. Padding side is "left" because focus needs to be on completion
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

    # Set tokenizer's pad Token (the EOS token is reused for padding)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Load data from S3
    # NOTE(review): `s3` is not referenced below; kept from the original flow.
    # Confirm whether this instantiation is needed.
    s3 = s3fs.S3FileSystem()
    dataset = load_from_disk(train_ds)

    # Allow for partial dataset training
    if range_train:
        train_dataset = dataset["train"].select(range(range_train))
    else:
        train_dataset = dataset["train"]

    # Specify LoRA config
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Specify quantization config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Load model
    model = AutoModelForCausalLMWithValueHead.from_pretrained(
        model_name,
        #device_map='auto',
        peft_config=lora_config,
        quantization_config=bnb_config,
        reward_adapter=rm_adapter,
        use_safetensors=use_safetensors,
        #attn_implementation="flash_attention_2",
    )

    # Set model pad token id
    model.config.pad_token_id = tokenizer.pad_token_id

    if gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # Trainable parameters
    print_trainable_parameters(model)

    def collator(data):
        # Turn a list of row dicts into a dict of per-key lists (no padding/stacking).
        return {key: [d[key] for d in data] for key in data[0]}

    # Specify PPO training config.
    # The caller-supplied log_with / seed / score options are forwarded; they were
    # previously hard-coded to the same values as the signature defaults.
    # model_name is passed by keyword so it cannot bind to a different leading field.
    config = PPOConfig(
        model_name=model_name,
        log_with=log_with,
        learning_rate=1e-5,
        batch_size=per_device_train_batch_size,
        mini_batch_size=1,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optimize_cuda_cache=True,
        seed=seed,
        use_score_scaling=use_score_scaling,
        use_score_norm=use_score_norm,
        score_clip=score_clip,
    )

    # Initialize PPOTrainer object handling training (no separate reference model is passed)
    ppo_trainer = PPOTrainer(
        config,
        model,
        ref_model=None,
        tokenizer=tokenizer,
        dataset=train_dataset,
        data_collator=collator,
    )

    # Specifying inference params
    generation_kwargs = {
        "top_k": 0.0,
        "top_p": 0.9,
        "do_sample": True,
        "pad_token_id": tokenizer.pad_token_id,
        "max_new_tokens": 32,
    }

    # One pass over the dataloader per requested epoch (default: a single pass).
    for _ in range(num_train_epochs):
        for batch in tqdm(ppo_trainer.dataloader):

            question_tensors = batch["input_ids"]

            # Inference through model being fine-tuned
            response_tensors = ppo_trainer.generate(
                question_tensors,
                return_prompt=False,
                **generation_kwargs,
            )

            # Decode response
            batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

            # Concat query and response
            texts = [q + r for q, r in zip(batch["query"], batch["response"])]

            # Tokenize query - response pair
            inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(ppo_trainer.accelerator.device)

            # Compute reward score with the reward adapter
            raw_rewards = ppo_trainer.accelerator.unwrap_model(ppo_trainer.model).compute_reward_score(**inputs)
            # Take index 1 of the score at the last token position of each sequence.
            rewards = [raw_rewards[i, -1, 1] for i in range(len(raw_rewards))]

            # Run PPO step
            stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
            ppo_trainer.log_stats(stats, batch, rewards)

    # Only the main process writes the trained model and pushes to the Hub.
    if accelerator.is_main_process:

        ppo_trainer.save_pretrained("/opt/ml/model", safe_serialization=True)

        if model_hub_repo_id is not None:
            ppo_trainer.push_to_hub(repo_id=model_hub_repo_id)
            tokenizer.push_to_hub(repo_id=model_hub_repo_id)

    with accelerator.main_process_first():
        tokenizer.save_pretrained("/opt/ml/model")

Let's point to the reward model adapter that was previously trained and pushed to the Hugging Face Model Hub.

In [None]:
rm_adapter = "**HF_REPO_ID**"

Define the Hugging Face repository ID for pushing the model

In [None]:
model_hub_repo_id = "***HF_REPO_ID***"

In [None]:
train_fn(
    model_id,
    train_ds=dataset_path_squad,
    rm_adapter=rm_adapter,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    num_train_epochs=1,
    token=hf_token,
    model_hub_repo_id=model_hub_repo_id,
    range_train=100
)

# Deploy fine-tuned model

In [None]:
import sagemaker
import boto3
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.huggingface import  HuggingFaceModel
# NOTE(review): the data-preparation section binds the same name with `import datetime`
# (the module). Whichever cell ran last decides whether `datetime` is the module or the
# class; the deploy cell calls `datetime.now()` and therefore needs the class.
from datetime import datetime

sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
# NOTE(review): `sagemaker_session_bucket` is not referenced again in the cells visible here.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; if get_execution_role() raises ValueError, look up the
# role named 'sagemaker_execution_role' through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

In [None]:
# sagemaker config
instance_type = "ml.g5.4xlarge"
# NOTE(review): `number_of_gpu` is not referenced below; SM_NUM_GPUS is hard-coded to 1.
number_of_gpu = 1
# Passed to deploy() as the container startup health-check timeout (presumably seconds).
health_check_timeout = 300

# TGI config (environment variables handed to the serving container)
config = {
#'HF_MODEL_ID': "aristsakpinisaws/llama-31-hhrlhf-rlhf", # path to where sagemaker stores the model
'HF_MODEL_ID': "meta-llama/Meta-Llama-3.1-8B-Instruct",
# NOTE(review): placeholder for the trained adapter repository; confirm that the container
# release selected by version="2.0" below supports the LORA_ADAPTERS setting.
'LORA_ADAPTERS': "**HF_REPO_ID**",
'SM_NUM_GPUS': json.dumps(1), # Number of GPU used per replica
'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
'MAX_TOTAL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text),
'QUANTIZE': "bitsandbytes", # quantization is enabled; comment this line out to disable it
'HUGGING_FACE_HUB_TOKEN': hf_token
}

# Look up the Hugging Face LLM serving image for the requested version
image_uri = get_huggingface_llm_image_uri(
    "huggingface",
    version="2.0"
)

# create HuggingFaceModel
llm_model = HuggingFaceModel(
    role=role,
    image_uri=image_uri,
    env=config
)

In [None]:
# Deploy model to an endpoint
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
    endpoint_name=f'llama-31-8b-instruct-rlhf-{datetime.now().strftime("%Y%m%d%H%M%S")}', # timestamp suffix gives each deployment its own endpoint name
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout, # startup health-check timeout, i.e. the time allowed for loading the model
)

In [None]:
parameters = {
        "top_p": 0.8,
        "temperature": 0.1,
        "return_full_text": True,
        "stop": [],
    }

encoded_message = encode_dialogue([{'content': 'Who won the FIFA World cup 2014 in Brazil?', 'role': 'user'}])
                   
response = llm.predict({"inputs": encoded_message['input'], **parameters})

response

# Cleanup

In [None]:
# Delete model and endpoint
llm.delete_model()
llm.delete_endpoint()