Fine-tuning on Amazon Bedrock

In [None]:
#installing necessary packages
!pip install datasets==2.15.0

In [None]:
#import the packages
import boto3
import json
import datetime
import os

In [None]:
# Set up the S3 bucket, IAM role and IAM policy used by the fine-tuning job.
# Each create step tolerates "already exists" so this cell can be re-run.
iam = boto3.client("iam")
s3 = boto3.client('s3')

caller = boto3.client('sts').get_caller_identity()
account_id = caller['Account']
# Second ARN field is the partition ("aws", "aws-cn", ...).
partition = caller['Arn'].split(':')[1]

# Create the new bucket
bucket_name = f"bedrock-finetuning-{account_id}"
region = s3.meta.region_name
try:
    if region in (None, "us-east-1", "aws-global"):
        # us-east-1 is the default location and must not be sent as a constraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region requires an explicit LocationConstraint.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={"LocationConstraint": region},
        )
except s3.exceptions.BucketAlreadyOwnedByYou:
    # The bucket was created by an earlier run of this cell.
    pass

# Create IAM Role and Policy
role_name = f"Bedrock-Finetuning-Role-{account_id}"
try:
    role = iam.create_role(
        RoleName=role_name,
        # Trust policy: allow the Bedrock service to assume this role.
        AssumeRolePolicyDocument=json.dumps({
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Principal": {
                        "Service": "bedrock.amazonaws.com"
                    },
                    "Action": "sts:AssumeRole"
                }
            ]
        })
    )['Role']['RoleName']
except iam.exceptions.EntityAlreadyExistsException:
    role = role_name

policy_name = "Bedrock-Finetuning-Role-Policy"
try:
    policy_arn = iam.create_policy(
        PolicyName=policy_name,
        # Permissions: read/write/list on the fine-tuning bucket only.
        PolicyDocument=json.dumps({
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Action": [
                        "s3:GetObject",
                        "s3:PutObject",
                        "s3:ListBucket"
                    ],
                    "Resource": [
                        f"arn:aws:s3:::{bucket_name}",
                        f"arn:aws:s3:::{bucket_name}/*"
                    ]
                }
            ]
        })
    )['Policy']['Arn']
except iam.exceptions.EntityAlreadyExistsException:
    # Customer-managed policy ARNs are derived from the account and policy name.
    policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}"

iam.attach_role_policy(
    RoleName=role,
    PolicyArn=policy_arn
)

Let's import the dataset, reformat it, and upload it to the S3 bucket.

In [None]:
# Load the training split of the DialogSum dataset from Hugging Face.
# Citation - https://huggingface.co/datasets/knkarthick/dialogsum
from datasets import load_dataset

ds = load_dataset(path="knkarthick/dialogsum", split="train")

In [None]:
# Drop the columns that are not used for fine-tuning and keep at most the
# first 10,000 rows (fewer if the dataset is shorter).
dataset = ds.remove_columns(["id", "summary"])
dataset = dataset.select(range(min(10000, len(dataset))))

# Hold out 10% of the rows; the "test" split produced here is written out as
# the validation file. A fixed seed keeps the split identical across re-runs.
train_and_validation_dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Local directory the JSONL files are written to.
dataset_dir = "dataset"
def format_save_dataset(filename, dataset):
    """Write `dataset` as a JSON Lines file named `filename` inside `dataset_dir`.

    Each record must provide "dialogue" and "topic". One JSON object is
    written per line, with the dialogue embedded in an instruction under
    "prompt" and the topic under "completion". The directory named by the
    module-level `dataset_dir` is created if it does not exist.

    Returns None.
    """
    os.makedirs(dataset_dir, exist_ok=True)
    with open(f"{dataset_dir}/{filename}", "w") as out_file:
        for record in dataset:
            dialogue, topic = record["dialogue"], record["topic"]
            prompt = f"Identify the key topic representing the dialoge. \n\nDialogue: {dialogue}"
            line = json.dumps({"prompt": prompt, "completion": f"{topic}"})
            out_file.write(line + "\n")

# Write each split to its own JSONL file; the held-out "test" split becomes
# the validation file.
for split_name, out_name in (("train", "train.jsonl"), ("test", "validation.jsonl")):
    format_save_dataset(out_name, train_and_validation_dataset[split_name])

In [None]:
# Upload every file under `dataset_dir` to the fine-tuning bucket, using the
# path relative to `dataset_dir` as the object key.
s3 = boto3.client('s3')
account_id = boto3.client('sts').get_caller_identity()['Account']
bucket_name = f"bedrock-finetuning-{account_id}"

for root, dirs, files in os.walk(dataset_dir):
    for file in files:
        full_path = os.path.join(root, file)
        # S3 keys use "/" as the separator; normalise so nested files do not
        # end up with backslashes in their keys on Windows.
        relative_path = os.path.relpath(full_path, dataset_dir).replace(os.sep, "/")
        s3.upload_file(full_path, bucket_name, relative_path)

Now that the datasets are uploaded to S3, we are ready to create a fine-tuning job to start model customization

In [None]:
# Bedrock client used below to create and monitor the customization job,
# plus the caller's account id for building ARNs and bucket names.
bedrock = boto3.client("bedrock")
sts = boto3.client("sts")
account_id = sts.get_caller_identity()["Account"]

In [None]:
# Set parameters
datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
customizationType = "FINE_TUNING"
customModelName = "custom-titan-lite-model"
# Build the base-model ARN from the client's own region instead of
# hard-coding us-east-1, so the job targets the region the client talks to.
baseModelIdentifier = (
    f"arn:aws:bedrock:{bedrock.meta.region_name}::foundation-model/"
    "amazon.titan-text-lite-v1:0:4k"
)
roleArn=f"arn:aws:iam::{account_id}:role/Bedrock-Finetuning-Role-{account_id}"
# Timestamp suffix keeps the job name unique across runs.
jobName=f"Titan-Lite-Finetune-Job-{datetime_string}"
# Hyperparameter values are passed as strings.
hyperParameters = {
        "epochCount": "1",
        "batchSize": "1",
        "learningRate": ".0001",
        "learningRateWarmupSteps": "0"
}
# Common prefix for the training/validation inputs and the job output.
s3_prefix = f"s3://bedrock-finetuning-{account_id}"

# Create job
response_ft = bedrock.create_model_customization_job(
    jobName=jobName,
    customModelName=customModelName,
    customizationType=customizationType,
    roleArn=roleArn,
    baseModelIdentifier=baseModelIdentifier,
    hyperParameters=hyperParameters,
    trainingDataConfig={"s3Uri": f"{s3_prefix}/train.jsonl"},
    validationDataConfig={'validators': [ {"s3Uri": f"{s3_prefix}/validation.jsonl"} ]},
    outputDataConfig={"s3Uri": f"{s3_prefix}/finetuning-output"},
)

In [None]:
# Keep the ARN returned for the submitted customization job and display it.
jobArn = response_ft.get("jobArn")
print(jobArn)

Training the customized model can take a few hours, so check the status of the job periodically.

In [None]:
# Look up the job by name and show its current status.
# Re-run this cell until the status reads "Completed".
job_details = bedrock.get_model_customization_job(jobIdentifier=jobName)
status = job_details["status"]
print(status)

Once the model is trained, we need to purchase provisioned throughput before we can start using the model

In [None]:
# Purchase one model unit of provisioned throughput for the custom model and
# keep the ARN of the provisioned model for inference.
response_pt = bedrock.create_provisioned_model_throughput(
    provisionedModelName="ProvisionedCustomTitanLite",
    modelId=customModelName,
    modelUnits=1,
)
provisionedModelArn = response_pt.get("provisionedModelArn")

Let's test our customized model

In [None]:
# Send a prompt to the provisioned custom model.
bedrock_runtime = boto3.client(service_name='bedrock-runtime')
# Replace with a real prompt; for best results use the same wording as the
# prompts written to the training file.
prompt = "ENTER_PROMPT"

# Titan Text models expect `inputText` plus a `textGenerationConfig` object;
# the flat `prompt` / `p` / `max_tokens` keys belong to other model families.
body = {
    "inputText": prompt,
    "textGenerationConfig": {
        "temperature": 0.5,
        "topP": 0.9,
        "maxTokenCount": 512,
    },
}

# The generated text is returned in the JSON payload of response["body"]
# (for Titan Text: results[0]["outputText"]).
response = bedrock_runtime.invoke_model(
    modelId=provisionedModelArn,
    body=json.dumps(body)
)