In [31]:
!pip install --upgrade pip
%pip install --no-build-isolation --force-reinstall \
    "boto3>=1.28.57" \
    "awscli>=1.29.57" \
    "botocore>=1.31.57"
!pip install -qU ipywidgets>=7,<8
!pip install jsonlines
!pip install datasets==2.15.0
!pip install pandas==2.1.3


Collecting boto3>=1.28.57
  Using cached boto3-1.34.104-py3-none-any.whl.metadata (6.6 kB)
Collecting awscli>=1.29.57
  Using cached awscli-1.32.104-py3-none-any.whl.metadata (11 kB)
Collecting botocore>=1.31.57
  Using cached botocore-1.34.104-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.28.57)
  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3>=1.28.57)
  Using cached s3transfer-0.10.1-py3-none-any.whl.metadata (1.7 kB)
Collecting docutils<0.17,>=0.10 (from awscli>=1.29.57)
  Using cached docutils-0.16-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting PyYAML<6.1,>=3.10 (from awscli>=1.29.57)
  Using cached PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl.metadata (2.1 kB)
Collecting colorama<0.4.7,>=0.2.5 (from awscli>=1.29.57)
  Using cached colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting rsa<4.8,>=3.1.2 (from awscli>=1.29.57)
  Using cached rsa-4.7.2-py3-none-any.whl.

In [None]:
# restart kernel for packages to take effect
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

### Setup
import all the needed libraries and dependencies to setup this notebook.

In [None]:
import warnings
warnings.filterwarnings('ignore')
import json
import os
import sys
import boto3 
import time
import pprint
from datasets import load_dataset
import random
import jsonlines

### Environment initialization
initiate boto3 clients for S3, IAM and Bedrock.
These are required to create required S3 bucket for data store, required IAM roles and invoking Bedrock endpoints.

In [None]:
session = boto3.session.Session()
#Change the region according to your preference.
region = 'us-west-2'
sts_client = boto3.client('sts', region_name = region)
account_id = sts_client.get_caller_identity()["Account"]

s3_suffix = f"{region}-{account_id}"
bucket_name = f"bedrock-customization-finetune-{s3_suffix}"
s3_client = boto3.client('s3', region_name = region)

bedrock = boto3.client(service_name="bedrock", region_name = region)
bedrock_runtime = boto3.client(service_name="bedrock-runtime", region_name = region )

iam = boto3.client('iam', region_name=region)


In [None]:
role_name = "AmazonBedrockCustomizationRole_FineTuning"
s3_bedrock_finetuning_access_policy="AmazonBedrockCustomizationPolicy_FineTuning"
customization_role = f"arn:aws:iam::{account_id}:role/{role_name}"

In [None]:
#List foundational models available for fine tuning
for model in bedrock.list_foundation_models(
    byCustomizationType="FINE_TUNING")["modelSummaries"]:
    for key, value in model.items():
        print(key, ":", value)
    print("-----\n")

### Create s3 bucket
In this step we will create a s3 bucket, which will be used to store data for continued pre-training notebooks. 

In [None]:
# Create S3 bucket for storing datasets for fine tuning
s3bucket = s3_client.create_bucket(
    Bucket=bucket_name,
    CreateBucketConfiguration={
        'LocationConstraint':region,
    },
)

## Creating role and policies required to run customization jobs with Amazon Bedrock

This JSON object defines the trust relationship that allows the bedrock service to assume a role that will give it the ability to talk to other required AWS services. The conditions set restrict the assumption of the role to a specfic account ID and a specific component of the bedrock service (model_customization_jobs)

In [None]:
ROLE_DOC = f"""{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Effect": "Allow",
            "Principal": {{
                "Service": "bedrock.amazonaws.com"
            }},
            "Action": "sts:AssumeRole",
            "Condition": {{
                "StringEquals": {{
                    "aws:SourceAccount": "{account_id}"
                }},
                "ArnEquals": {{
                    "aws:SourceArn": "arn:aws:bedrock:{region}:{account_id}:model-customization-job/*"
                }}
            }}
        }}
    ]
}}
"""

This JSON object defines the permissions of the role we want bedrock to assume to allow access to the S3 bucket that we created that will hold our fine-tuning datasets and allow certain bucket and object manipulations.

In [None]:
ACCESS_POLICY_DOC = f"""{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Effect": "Allow",
            "Action": [
                "s3:AbortMultipartUpload",
                "s3:DeleteObject",
                "s3:PutObject",
                "s3:GetObject",
                "s3:GetBucketAcl",
                "s3:GetBucketNotification",
                "s3:ListBucket",
                "s3:PutBucketNotification"
            ],
            "Resource": [
                "arn:aws:s3:::{bucket_name}",
                "arn:aws:s3:::{bucket_name}/*"
            ]
        }}
    ]
}}"""


In [None]:
response = iam.create_role(
    RoleName=role_name,
    AssumeRolePolicyDocument=ROLE_DOC,
    Description="Role for Bedrock to access S3 for training",
)
pprint.pp(response)

In [None]:
role_arn = response["Role"]["Arn"]
pprint.pp(role_arn)

In [None]:
response = iam.create_policy(
    PolicyName=s3_bedrock_finetuning_access_policy,
    PolicyDocument=ACCESS_POLICY_DOC,
)
pprint.pp(response)

In [None]:
policy_arn = response["Policy"]["Arn"]
pprint.pp(policy_arn)

In [None]:
iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn=policy_arn,
)

### Preparing the data to be used for fine tuning.
In this notebook we are using data in csv format. This is a complaints dataset from the consumer finance protection bureau.
You can potentially use any csv dataset which could be used for pre-training the model.

In [32]:
#load complaints dataset from cfpb ( consumer finance protection bureau)

complaints = load_dataset("csv", data_files="complaints-2024-04-08_12_34.csv")

### Bedrock fine tuning limits for Titan Text Express
[BaseModel = amazon.titan-text-express-v1:0:8k, Epochs <= 10, BatchSize <= 4, TotalRecords <= 10000, InputChars <= 12288, OutputChars <= 12288, TotalChars <= 12288]

Based on the above limits the total records in the training set cannot exceed 10,000

In [33]:
#split the dataset between train, test and validation

#The dataset is split into 2 subsets in 80/20 percentage. 80 percent for training and rest 20 for test and validation.
train_test_split = complaints['train'].train_test_split(test_size=0.2)
#From the remaining 20 percent, we split it into 90 percent for validation and 10 percent for test.
train_val_split = train_test_split['test'].train_test_split(test_size=0.10)  

dataset_train = train_test_split['train']
dataset_valid = train_val_split['train']
dataset_test = train_val_split['test']
# View the structure of the dataset and the number of rows in each set.
# The dataset is a dictionary with the keys 'train', 'validation', and 'test'. Each key contains a list of dictionaries, where each dictionary represents a single datapoint.

print(dataset_train)
print(dataset_valid)
print(dataset_test)


Dataset({
    features: ['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID'],
    num_rows: 50627
})
Dataset({
    features: ['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID'],
    num_rows: 11391
})
Dataset({
    features: ['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to co

Prepare the Datasets train, test and valid in the format required for fine tuning
In this example, we are using a .jsonl dataset following example format:

{"prompt": "<prompt1>", "completion": "<expected generated text>"}

See more guidance on how to do fine tuning [https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-prepare.html]

For guidance on customization quotas [https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html#quotas-model-customization]

In [34]:
#prepare the test set
dataset_test_format=[]
for dp in dataset_test:
    temp_dict={}
    temp_dict['prompt']= 'consumer complaints and resolutions for fiancial products'
    dp= str(dp)
    temp_dict['completion']= dp
    dataset_test_format.append(temp_dict)

In [35]:
#prepare the validation set
#Sum of training and validation records allowed for fine-tuning is 10,000.
#So we are considering 1000 records for validation set.
dataset_valid_format=[]
for dp in dataset_valid:
    temp_dict={}
    temp_dict['prompt']= 'consumer complaints and resolutions for fiancial products'
    #convert the dictionary dp to a string
    dp= str(dp)
    temp_dict['completion']= dp
    dataset_valid_format.append(temp_dict)
    if len(dataset_valid_format) == 1000:
        break


In [36]:
#prepare the train set
#Sum of training and validation records allowed for fine-tuning is 10,000.
#So we are considering 9000 records for training set.
dataset_train_format=[]
for dp in dataset_train:
    temp_dict={}
    temp_dict['prompt']= 'consumer complaints and resolutions for fiancial products'
    dp= str(dp)
    temp_dict['completion']= dp
    dataset_train_format.append(temp_dict)
    if len(dataset_train_format) == 9000:
        break


### Preparing the datasets
LLM's have different token limits. Here we restricted to 4096 characters.

In [37]:
def jsonl_converter(dataset,file_name):
     with jsonlines.open(file_name, 'w') as writer:
        for line in dataset:
            # Convert the line to a JSON string to check its length
            json_string = json.dumps(line)
            if len(json_string) < 4096:
                # If so, write the line to the file
                writer.write(line)

### Create local directory for datasets
Please not that your training dataset for fine-tuning cannot be greater than 10K records, and validation dataset has a maximum limit of 1K records.

In [38]:
dataset_folder="fine-tuning-datasets"
train_file_name="train-cfpb-complaints.jsonl"
validation_file_name="validation-cfpb-complaints.jsonl"
test_file_name="test-cfpb-complaints.jsonl"
!mkdir fine-tuning-datasets
abs_path=os.path.abspath(dataset_folder)

mkdir: fine-tuning-datasets: File exists


### Create JSONL format datasets for Bedrock fine tuning

In [None]:
#Convert the datasets to jsonl files and save in local directory for each dataset
jsonl_converter(dataset_test_format,f'{abs_path}/{test_file_name}')
jsonl_converter(dataset_train_format,f'{abs_path}/{train_file_name}')
jsonl_converter(dataset_valid_format,f'{abs_path}/{validation_file_name}')

### Upload datasets to s3 bucket

In [None]:
s3_client.upload_file(f'{abs_path}/{train_file_name}', bucket_name, f'fine-tuning-datasets/train/{train_file_name}')
s3_client.upload_file(f'{abs_path}/{validation_file_name}', bucket_name, f'fine-tuning-datasets/validation/{validation_file_name}')
s3_client.upload_file(f'{abs_path}/{test_file_name}', bucket_name, f'fine-tuning-datasets/test/{test_file_name}')

## For fine-tuning , execute the notebook fine-tune_titan_express_bedrock.ipynb