## > Setup for all Labs

In [1]:
# Install the Python dependencies listed in requirements.txt for all labs.
# As the cell output notes, the kernel may need a restart before the
# updated packages can be used.
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## > Initial Setup Lab01

In [2]:
from utils import (
    upload_file_to_s3,
)
import sagemaker
import os
import time
from botocore.exceptions import ClientError
from tqdm import tqdm

# Shared SageMaker context used throughout the setup: session, execution role,
# region and default bucket.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()  # execution role for the endpoint
# Read the region through the public `boto_region_name` attribute instead of
# the private `_region_name`, which is an implementation detail of the SDK.
region = sagemaker_session.boto_region_name

bucket = sagemaker_session.default_bucket()
prefix = "swagger_codegen"  # S3 key prefix for the Lab01 uploads

# Local Lab01 data folders (paths are relative to the notebook's directory).
data_dir = "../data/lab01"
yml_dir = f"{data_dir}/yml_files"
uml_dir = f"{data_dir}/uml_diagrams"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


### > Upload data to S3 to set up the lab

In [3]:
# Local Lab01 folders whose files are copied to S3 under `prefix`.
data_dirs = [
    yml_dir,
    uml_dir,
    f"{data_dir}/yml_questions",
    f"{data_dir}/uml_questions",
]

for ddir in data_dirs:
    for filename in os.listdir(ddir):
        filepath = os.path.join(ddir, filename)

        # os.listdir() also returns sub-directories (for example Jupyter's
        # .ipynb_checkpoints); only regular files can be uploaded.
        if not os.path.isfile(filepath):
            continue

        # The S3 key mirrors the path relative to data_dir,
        # e.g. "<prefix>/yml_files/bookstore.yml".
        key = f"{prefix}/{filepath.replace(data_dir+'/', '')}"
        upload_file_to_s3(filepath, bucket, key)

    print(f"Sync data to S3 from {ddir} =========")

File '../data/lab01/yml_files/bookstore.yml' uploaded successfully to s3://sagemaker-us-east-1-376678947624/swagger_codegen/yml_files/bookstore.yml
File '../data/lab01/yml_files/flowerstore.yml' uploaded successfully to s3://sagemaker-us-east-1-376678947624/swagger_codegen/yml_files/flowerstore.yml
File '../data/lab01/yml_files/link.yml' uploaded successfully to s3://sagemaker-us-east-1-376678947624/swagger_codegen/yml_files/link.yml
File '../data/lab01/yml_files/petstore.yml' uploaded successfully to s3://sagemaker-us-east-1-376678947624/swagger_codegen/yml_files/petstore.yml
File '../data/lab01/yml_files/uspto.yml' uploaded successfully to s3://sagemaker-us-east-1-376678947624/swagger_codegen/yml_files/uspto.yml
File '../data/lab01/uml_diagrams/bookstore.jpg' uploaded successfully to s3://sagemaker-us-east-1-376678947624/swagger_codegen/uml_diagrams/bookstore.jpg
File '../data/lab01/uml_diagrams/flowerstore.jpg' uploaded successfully to s3://sagemaker-us-east-1-376678947624/swagger_c

### > Store the parameters for future labs

In [4]:
# Persist the Lab01 settings with IPython's %store magic so that another
# kernel can restore them with `%store -r`.
%store bucket
%store prefix
%store yml_dir
%store uml_dir
%store data_dir
# Show the bucket name in the cell output.
print(bucket)

Stored 'bucket' (str)
Stored 'prefix' (str)
Stored 'yml_dir' (str)
Stored 'uml_dir' (str)
Stored 'data_dir' (str)
sagemaker-us-east-1-376678947624


## > Initial Setup Lab02

In [5]:
import boto3
import pprint as pp
import random
from os_utils import create_bedrock_execution_role, create_policies_in_oss, create_oss_policy_attach_bedrock_execution_role

# AWS session and OpenSearch Serverless client used for the Lab02 vector store.
boto3_session = boto3.Session()
region_name = boto3_session.region_name
aoss_client = boto3_session.client('opensearchserverless')

# The collection name carries a random numeric suffix in [200, 900).
suffix = random.randrange(200, 900)
vector_store_name = f'swagger-api-{suffix}'

# No look-up for an existing collection is performed: every run of this cell
# creates a new role, new policies and a new collection.
print(f"creating vector collection {vector_store_name}")

# Execution role (helper from os_utils); its ARN is read from ['Role']['Arn'].
bedrock_kb_execution_role = create_bedrock_execution_role(bucket_name=bucket)
bedrock_kb_execution_role_arn = bedrock_kb_execution_role['Role']['Arn']

# Security, network and data access policies inside OSS.
encryption_policy, network_policy, access_policy = create_policies_in_oss(
    vector_store_name=vector_store_name,
    aoss_client=aoss_client,
    bedrock_kb_execution_role_arn=bedrock_kb_execution_role_arn,
)

# Create the VECTORSEARCH collection and keep its ARN and id for later cells.
vector_collection = aoss_client.create_collection(
    name=vector_store_name,
    type='VECTORSEARCH',
)
collection_detail = vector_collection['createCollectionDetail']
vector_collection_arn = collection_detail['arn']
vector_collection_id = collection_detail['id']
pp.pprint(vector_collection)
time.sleep(10)  # fixed 10-second pause after the create call

# Host name built from the collection id and the session's region.
vector_host = vector_collection_id + '.' + region_name + '.aoss.amazonaws.com'
print(vector_host)

# Create the OSS policy and attach it to the Bedrock execution role.
create_oss_policy_attach_bedrock_execution_role(
    collection_id=vector_collection_id,
    bedrock_kb_execution_role=bedrock_kb_execution_role,
)

creating vector collection swagger-api-817
{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
                                      'content-length': '307',
                                      'content-type': 'application/x-amz-json-1.0',
                                      'date': 'Sun, 21 Jul 2024 02:06:47 GMT',
                                      'x-amzn-requestid': 'e67d5f71-76f4-4f07-ab13-a9fdd75ca241'},
                      'HTTPStatusCode': 200,
                      'RequestId': 'e67d5f71-76f4-4f07-ab13-a9fdd75ca241',
                      'RetryAttempts': 0},
 'createCollectionDetail': {'arn': 'arn:aws:aoss:us-east-1:376678947624:collection/60mr1y2hqd26e2u4okkj',
                            'createdDate': 1721527607831,
                            'id': '60mr1y2hqd26e2u4okkj',
                            'kmsKeyArn': 'auto',
                            'lastModifiedDate': 1721527607831,
                            'name': 'swagger-api-817',
              

In [6]:
# Persist the Lab02 vector-store identifiers and the execution-role ARN with
# %store so that another kernel can restore them with `%store -r`.
%store vector_store_name
%store vector_collection_arn
%store vector_collection_id
%store vector_host
%store bedrock_kb_execution_role_arn

Stored 'vector_store_name' (str)
Stored 'vector_collection_arn' (str)
Stored 'vector_collection_id' (str)
Stored 'vector_host' (str)
Stored 'bedrock_kb_execution_role_arn' (str)


## > Initial Setup Lab03

In [7]:
# Create an S3 client
s3 = boto3.client('s3')
image_prep_prefix = 'aws-genai-rag-workshop/images'

root_dir = "../data/lab03/"

jsonl_files = ["simple_image_query.json", "complex_image_query.json"]

In [8]:
import json

# boto3's managed upload_file() reports a failed transfer as
# S3UploadFailedError, which is not a ClientError subclass.
from boto3.exceptions import S3UploadFailedError

# For every query file: upload each corpus image that exists locally to S3,
# record its S3 URI under "image-ref", then rewrite the JSON file in place.
for jsonl in jsonl_files:

    print(f"Prepare image data file: {jsonl}")

    jsonl_path = os.path.join(root_dir, jsonl)

    # Read-only access is sufficient here; the file is rewritten further down.
    with open(jsonl_path, 'r') as f:
        dataset = json.load(f)

    for node_id, image_obj in tqdm(dataset['corpus'].items()):
        # "image-path" is resolved relative to root_dir.
        image_local_path = os.path.join(root_dir, image_obj["image-path"])
        if os.path.isfile(image_local_path):
            file_name = os.path.basename(image_local_path)
            # NOTE(review): images are uploaded under `prefix`, not under
            # `image_prep_prefix` - confirm that this is intended.
            s3_key = f"{prefix}/{file_name}"
            try:
                s3.upload_file(image_local_path, bucket, s3_key)
                image_obj["image-ref"] = f"s3://{bucket}/{s3_key}"
            except (ClientError, S3UploadFailedError) as e:
                # Best effort: report the failure and continue with the next image.
                print(f'Error uploading {file_name}: {e}')
        else:
            # The previous `assert("<message>")` tested a non-empty string, which
            # is always true, so missing files went unnoticed. Report them.
            print(f"{image_local_path} file not available")

    # Write the dataset, now carrying the "image-ref" entries, back to the file.
    with open(jsonl_path, 'w') as f:
        json.dump(dataset, f)

Prepare image data file: simple_image_query.json


100%|██████████| 50/50 [00:06<00:00,  7.16it/s]


Prepare image data file: complex_image_query.json


100%|██████████| 28/28 [00:03<00:00,  8.22it/s]


In [9]:
# Persist the Lab03 data folder and the list of query files with %store.
%store root_dir
%store jsonl_files

Stored 'root_dir' (str)
Stored 'jsonl_files' (list)


## > Initial Setup Lab04

Install ffmpeg

In [10]:
# Install ffmpeg system-wide through apt; -y answers the confirmation prompt.
# NOTE(review): `apt` targets interactive use; `apt-get install -y ffmpeg` is
# the script-stable form - consider switching.
!sudo apt install ffmpeg -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.


In [11]:
# Prefix string for Lab04; it is persisted with %store in the next cell.
# Presumably the S3 key prefix for video assets - confirm against Lab04.
video_prep_prefix = "aws-genai-rag-workshop/videos"

In [19]:
# Persist the execution role and region (both defined in the Lab01 setup cell)
# together with the Lab04 video prefix.
%store role
%store region
%store video_prep_prefix

Stored 'role' (str)
Stored 'region' (str)
Stored 'video_prep_prefix' (str)


## > Initial Setup Lab05

In [13]:
# Lab05 constants: the model identifier and the prefix used to build the
# S3 paths of the training and validation data.
model_id = "sentence-transformers/msmarco-bert-base-dot-v5"
embedding_prefix = "finetune-embedding"

In [14]:
# Copy the training dataset to S3 under `embedding_prefix` in the default bucket.
# NOTE(review): the file is read from ../data/lab04 although this section is
# titled Lab05 - confirm that the path is intended.
train_data = "train_dataset.json"
train_local_path = f"../data/lab04/{train_data}"

train_s3_path = f"s3://{bucket}/{embedding_prefix}/{train_data}"

# IPython substitutes the {…} expressions before the command is run in the shell.
!aws s3 cp {train_local_path} {train_s3_path}

upload: ../data/lab04/train_dataset.json to s3://sagemaker-us-east-1-376678947624/finetune-embedding/train_dataset.json


In [15]:
# Copy the validation dataset to S3 under `embedding_prefix` in the default bucket.
# NOTE(review): the file is read from ../data/lab04 although this section is
# titled Lab05 - confirm that the path is intended.
valid_data = "val_dataset.json"
valid_local_path = f"../data/lab04/{valid_data}"

valid_s3_path = f"s3://{bucket}/{embedding_prefix}/{valid_data}"

# IPython substitutes the {…} expressions before the command is run in the shell.
!aws s3 cp {valid_local_path} {valid_s3_path}

upload: ../data/lab04/val_dataset.json to s3://sagemaker-us-east-1-376678947624/finetune-embedding/val_dataset.json


In [16]:
# Persist the Lab05 prefix, the dataset locations (S3 and local) and the model
# identifier with %store.
%store embedding_prefix
%store train_s3_path
%store valid_s3_path
%store train_local_path
%store valid_local_path
%store model_id

Stored 'embedding_prefix' (str)
Stored 'train_s3_path' (str)
Stored 'valid_s3_path' (str)
Stored 'train_local_path' (str)
Stored 'valid_local_path' (str)
Stored 'model_id' (str)


## > Initial Setup LabX

In [17]:
from os_utils import create_lambda_role
from os_utils import create_lambda

# Create the IAM role for the Lambda function (helper from os_utils).
# `suffix` is the random number generated in the Lab02 setup cell, so that
# cell must have been executed first.
agent_name = f'swagger-api-agent-{suffix}'
lambda_iam_role = create_lambda_role(agent_name)

# Create the Lambda function from "lambda_function.py" using the role above.

lambda_function_name = f'{agent_name}-lambda'
lambda_function = create_lambda("lambda_function.py", lambda_function_name, lambda_iam_role)

# The returned mapping exposes the function's ARN under 'FunctionArn'.
lambda_arn = lambda_function['FunctionArn']
# NOTE(review): the message says "Lab 03" while this section is titled LabX -
# confirm which label is correct.
print(f"Lab 03 Lambda ARN: {lambda_arn}")

Lab 03 Lambda ARN: arn:aws:lambda:us-east-1:376678947624:function:swagger-api-agent-817-lambda


In [18]:
# Persist the Lambda ARN and function name with %store.
%store lambda_arn
%store lambda_function_name

Stored 'lambda_arn' (str)
Stored 'lambda_function_name' (str)
