## > Setup for all Labs

In [None]:
# Install the dependencies listed in requirements.txt; `%pip` (rather than `!pip`)
# targets the environment of the running kernel.
%pip install -r requirements.txt

## > Initial Setup Lab01

In [None]:
from utils import (
    upload_file_to_s3,
)
import sagemaker
import os
import time
from botocore.exceptions import ClientError
from tqdm import tqdm

# Shared SageMaker context reused by the later setup cells.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()  # execution role for the endpoint
# Read the region through the public `boto_region_name` attribute instead of the
# private `_region_name` field, which is an implementation detail of Session.
region = sagemaker_session.boto_region_name

# Default SageMaker bucket and the key prefix used for the Lab01 uploads.
bucket = sagemaker_session.default_bucket()
prefix = "swagger_codegen"

# Local Lab01 data layout (paths are relative to this notebook).
data_dir = "../data/lab01"
yml_dir = f"{data_dir}/yml_files"
uml_dir = f"{data_dir}/uml_diagrams"

### > Upload data to S3 to set up the lab

In [None]:
# Local directories whose files are copied to s3://{bucket}/{prefix}/<path relative to data_dir>.
data_dirs = [
    yml_dir,
    uml_dir,
    f"{data_dir}/yml_questions",
    f"{data_dir}/uml_questions",
]

for ddir in data_dirs:
    for filename in os.listdir(ddir):
        filepath = os.path.join(ddir, filename)

        # os.listdir also returns sub-directories (e.g. a checkpoint folder created
        # by the notebook server); only regular files can be uploaded as objects.
        if not os.path.isfile(filepath):
            continue

        # Build the object key from the path relative to data_dir, always using
        # "/" as the separator because S3 keys are not OS paths.
        relative_path = os.path.relpath(filepath, data_dir).replace(os.sep, "/")
        key = f"{prefix}/{relative_path}"
        s3_path = upload_file_to_s3(filepath, bucket, key)

    print(f"Sync data to S3 from {ddir} =========")

### > Store the parameters for future labs

In [None]:
# Persist the Lab01 settings with %store so that other notebooks can restore
# them with `%store -r`.
%store bucket
%store prefix
%store yml_dir
%store uml_dir
%store data_dir
# Show the bucket name so it can be checked in the cell output.
print(bucket)

## > Initial Setup Lab02

In [None]:
import boto3
from IPython.display import JSON
# Fetch the description of the 'workshop' CloudFormation stack; the result is
# kept under both `response` and `stack`.
cf = boto3.client("cloudformation")
response = cf.describe_stacks(StackName='workshop')
stack = response

In [None]:
# Render the describe_stacks response with IPython's JSON display for inspection.
JSON(stack)

In [None]:
vector_host = next(item["OutputValue"] for item in stack['Stacks'][0]['Outputs'] if item["OutputKey"] == "AOSSCollectionEndpoint")
vector_host = vector_host.replace('https://', '')
vector_collection_arn = next(item["OutputValue"] for item in stack['Stacks'][0]['Outputs'] if item["OutputKey"] == "AOSSCollectionArn")
vector_collection_id = next(item["OutputValue"] for item in stack['Stacks'][0]['Outputs'] if item["OutputKey"] == "AOSSCollectionId")
from sagemaker import get_execution_role
bedrock_kb_execution_role_arn = get_execution_role()
%store bedrock_kb_execution_role_arn
%store vector_host
%store vector_collection_arn
%store vector_collection_id

### > Initial Setup Lab03-01

In [None]:
# Create an S3 client
s3 = boto3.client('s3')
image_prep_prefix = 'aws-genai-rag-workshop/images'

root_dir = "../data/lab03/"

jsonl_files = ["simple_image_query.json", "complex_image_query.json"]

In [None]:
import json

# s3.upload_file reports failed transfers as S3UploadFailedError rather than
# ClientError, so both are handled below.
from boto3.exceptions import S3UploadFailedError

# For every dataset file: upload each locally available corpus image to S3,
# record its S3 URI under "image-ref", then write the updated dataset back.
for jsonl in jsonl_files:

    print(f"Prepare image data file: {jsonl}")

    jsonl_path = os.path.join(root_dir, jsonl)

    # Read-only access is sufficient here; the file is rewritten further down.
    with open(jsonl_path, 'r') as f:
        dataset = json.load(f)

    for image_obj in tqdm(dataset['corpus'].values()):
        # "image-path" is interpreted relative to root_dir.
        image_local_path = os.path.join(root_dir, image_obj["image-path"])

        if not os.path.isfile(image_local_path):
            # The previous `assert("<message>")` was always true and therefore
            # silent; report the missing file and leave the entry untouched.
            print(f"{image_local_path} file not available")
            continue

        file_name = os.path.basename(image_local_path)
        # NOTE(review): keys are built from `prefix` although `image_prep_prefix`
        # is defined for this lab — confirm which prefix is intended.
        s3_key = f"{prefix}/{file_name}"
        try:
            s3.upload_file(image_local_path, bucket, s3_key)
        except (ClientError, S3UploadFailedError) as e:
            # Best effort: report the failure and continue with the next image.
            print(f'Error uploading {file_name}: {e}')
        else:
            # Only record the S3 location once the upload has succeeded.
            image_obj["image-ref"] = f"s3://{bucket}/{s3_key}"

    # Overwrite the dataset file with the entries that now carry "image-ref".
    with open(jsonl_path, 'w') as f:
        json.dump(dataset, f)

In [None]:
# Persist the Lab03 data location and dataset file list so other notebooks can
# restore them with `%store -r`.
%store root_dir
%store jsonl_files

### > Initial Setup Lab03-02

Install ffmpeg

In [None]:
# Refresh the apt package index, then install ffmpeg without prompting (-y).
# NOTE(review): requires sudo rights and an apt-based image — confirm for the target environment.
!sudo apt update 
!sudo apt install ffmpeg -y --fix-missing

In [None]:
# Prefix string for the video lab; presumably an S3 key prefix — confirm against the consuming notebook.
video_prep_prefix = "aws-genai-rag-workshop/videos"

In [None]:
# Persist the execution role and region (both set in the Lab01 setup cell) together
# with the video prefix so other notebooks can restore them with `%store -r`.
%store role
%store region
%store video_prep_prefix

## > Initial Setup Lab03-03

In [None]:
# S3 key prefix under which the train/validation datasets are uploaded below.
embedding_prefix = "finetune-embedding"

# Model identifier stored for later use; presumably a Hugging Face Hub id — confirm.
model_id = "sentence-transformers/msmarco-bert-base-dot-v5"

In [None]:
# Local training dataset and its destination under the embedding prefix.
# NOTE(review): the file is read from ../data/lab04 although this section is
# titled Lab03-03 — confirm the directory is correct.
train_data = "train_dataset.json"
train_local_path = f"../data/lab04/{train_data}"

train_s3_path = f"s3://{bucket}/{embedding_prefix}/{train_data}"

# Copy the file to S3 with the AWS CLI ({...} interpolates the Python values).
!aws s3 cp {train_local_path} {train_s3_path}

In [None]:
# Local validation dataset and its destination under the embedding prefix.
# NOTE(review): the file is read from ../data/lab04 although this section is
# titled Lab03-03 — confirm the directory is correct.
valid_data = "val_dataset.json"
valid_local_path = f"../data/lab04/{valid_data}"

valid_s3_path = f"s3://{bucket}/{embedding_prefix}/{valid_data}"

# Copy the file to S3 with the AWS CLI ({...} interpolates the Python values).
!aws s3 cp {valid_local_path} {valid_s3_path}

In [None]:
# Persist the embedding fine-tuning settings so other notebooks can restore
# them with `%store -r`.
%store embedding_prefix
%store train_s3_path
%store valid_s3_path
%store train_local_path
%store valid_local_path
%store model_id

## > Initial Setup Lab03-04 and Lab03-06

In [None]:
# S3 prefix/URI and local directory for the Amazon PDF documents downloaded below.
amzn10k_prefix = "amazon_10k"
amzn10k_s3_path = f"s3://{bucket}/{amzn10k_prefix}/"
amzn10k_dir = "../data/lab03/amazon/"

In [None]:
# Create the local target directory (no error if it already exists).
!mkdir -p {amzn10k_dir}
# Download the 2023 and 2022 PDFs into the local directory.
# NOTE(review): --no-check-certificate disables TLS certificate verification for
# these downloads — confirm it is really needed. The -O paths also repeat the
# value of amzn10k_dir as a literal and must be kept in sync with it.
!wget 'https://s2.q4cdn.com/299287126/files/doc_financials/2023/q4/c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96.pdf' --no-check-certificate -O '../data/lab03/amazon/amazon_2023.pdf'
!wget 'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q4/d2fde7ee-05f7-419d-9ce8-186de4c96e25.pdf' --no-check-certificate -O '../data/lab03/amazon/amazon_2022.pdf'

# Mirror the local directory to the S3 location defined in the previous cell.
!aws s3 sync {amzn10k_dir} {amzn10k_s3_path}

In [None]:
# Persist the S3 prefix and URI of the uploaded PDFs so other notebooks can
# restore them with `%store -r`.
%store amzn10k_prefix
%store amzn10k_s3_path

## > Initial Setup Lab04

In [None]:
import boto3
import sagemaker

# Resolve the execution role; the IAM calls below take the bare role name,
# i.e. the last '/'-separated segment of the role ARN.
role = sagemaker.get_execution_role()
role_name = role.rsplit('/', 1)[-1]

# IAM client used for both attaching and listing policies.
iam = boto3.client('iam')

# Managed policy ARNs to attach to the execution role.
policies_to_add = [
    'arn:aws:iam::aws:policy/AmazonOpenSearchIngestionFullAccess',
    'arn:aws:iam::aws:policy/AmazonOpenSearchServiceFullAccess',
]

# Attach each policy independently: a failure is reported and the loop moves on.
for policy_arn in policies_to_add:
    try:
        iam.attach_role_policy(RoleName=role_name, PolicyArn=policy_arn)
    except Exception as e:
        print(f"Error attaching {policy_arn}: {str(e)}")
    else:
        print(f"Successfully attached {policy_arn} to role {role_name}")

# List what is attached to the role now, so the result can be checked in the output.
try:
    response = iam.list_attached_role_policies(RoleName=role_name)
    print("\nCurrently attached policies:")
    for policy in response['AttachedPolicies']:
        print(f"- {policy['PolicyName']}")
except Exception as e:
    print(f"Error listing policies: {str(e)}")

In [None]:
from os_utils import (
    create_lambda_role,
    create_lambda,
    suffix
)

# Resource names: the agent name carries the `suffix` imported from os_utils,
# and the Lambda function name is derived from the agent name.
agent_name = f'swagger-api-agent-{suffix}'
lambda_function_name = f'{agent_name}-lambda'

# Create the IAM role for the Lambda, then the function itself from lambda_function.py.
lambda_iam_role = create_lambda_role(agent_name)
lambda_function = create_lambda("lambda_function.py", lambda_function_name, lambda_iam_role)

# Keep the function ARN and print it for reference.
lambda_arn = lambda_function['FunctionArn']
print(f"Lab 03 Lambda ARN: {lambda_arn}")

In [None]:
# Persist the Lambda ARN and function name so other notebooks can restore them
# with `%store -r`.
%store lambda_arn
%store lambda_function_name

In [None]:
# Final banner printed when the notebook has run to the end. The escape
# sequences are ANSI SGR codes: 42 = green background, 1;32 = bold green text,
# 1;35 = bold magenta text, 0 = reset.
ansi_green_bg = '\x1b[42m'
ansi_bold_green = '\x1b[1;32m'
ansi_bold_magenta = '\x1b[1;35m'
ansi_reset = '\x1b[0m'

print(ansi_green_bg + '\t\t\t\n' + ansi_reset)
print(ansi_bold_green + '  All cells executed\n' + ansi_reset)
print(ansi_green_bg + '\t\t\t\n' + ansi_reset)
# Message typo fixed: "there where" -> "there were".
print(ansi_bold_magenta + '\n Check cell outputs to verify there were NO execution errors.' + ansi_reset)