# This notebook prepares the data that the agent will access.
### Pre-requisites
This notebook requires permissions to:

- create and delete AWS IAM roles
- create, update and delete Amazon S3 buckets
- access Amazon Bedrock
- access Amazon OpenSearch Serverless

If running on SageMaker Studio, you should add the following managed policies to your role:

IAMFullAccess, AWSLambda_FullAccess, AmazonS3FullAccess, AmazonBedrockFullAccess

Custom policy for Amazon OpenSearch Serverless, such as:

```json
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "aoss:*",
            "Resource": "*"
        }
    ]
}
```

## Import CSV files into SQLite

In [None]:
import sqlite3
import pandas as pd

# Function to create SQLite DB table from CSV
def create_db_table_from_csv(csv_file_path, db_name, table_name):
    """Load a CSV file into a table of an SQLite database.

    The CSV is read into a pandas DataFrame and written to ``table_name``.
    An existing table with that name is replaced, and the DataFrame index
    is not stored as a column.

    Args:
        csv_file_path: Path of the CSV file to read.
        db_name: Path of the SQLite database file (created if it does not exist).
        table_name: Name of the table to (re)create.

    Raises:
        Whatever ``pandas.read_csv``, ``sqlite3.connect`` or
        ``DataFrame.to_sql`` raise; the connection is closed in every case.
    """
    # Read the CSV first so a bad path fails before any database is opened
    df = pd.read_csv(csv_file_path)

    # Connect to the SQLite database (or create it if it doesn't exist)
    conn = sqlite3.connect(db_name)
    try:
        # Write the DataFrame to an SQLite table
        df.to_sql(table_name, conn, if_exists='replace', index=False)
    finally:
        # Always release the connection, even when the write fails
        conn.close()


In [None]:
def create_db_tables_from_csv_files(csv_file_paths, db_name, table_names):
    """Load several CSV files into one SQLite database, one table per file.

    Files and table names are paired by position.

    Args:
        csv_file_paths: CSV file paths to import.
        db_name: SQLite database passed to ``create_db_table_from_csv``.
        table_names: Target table names, in the same order as the paths.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    # Refuse to pair files with tables when the counts disagree
    if len(table_names) != len(csv_file_paths):
        raise ValueError("The number of CSV files must match the number of table names.")

    # Import each file into its positional counterpart
    for source_csv, target_table in zip(csv_file_paths, table_names):
        create_db_table_from_csv(source_csv, db_name, target_table)


# Source CSV files and, in the same order, the tables they are loaded into
csv_file_paths = [
    'data/porterville_student_schedule.csv',
    'data/porterville_student_data.csv',
    'data/porterville_course_schedule.csv',
]
table_names = [
    'student_schedule',
    'student_data',
    'course_schedule',
]
db_name = 'porterville_academic.db'

create_db_tables_from_csv_files(
    csv_file_paths=csv_file_paths,
    db_name=db_name,
    table_names=table_names,
)

## Ingest the course catalogue into a Bedrock Knowledge Base

#### You can add multiple, different data sources (S3, Confluence, SharePoint, Salesforce, Web Crawler) to a Knowledge Base. For this demo, create a Knowledge Base backed by an S3 bucket where the course catalogue PDF is stored. Follow this notebook and upload the Porterville 2024-2025 course catalogue PDF: https://github.com/aws-samples/amazon-bedrock-samples/blob/main/knowledge-bases/features-examples/01-rag-concepts/01_create_ingest_documents_test_kb_multi_ds.ipynb

In [16]:
import importlib
import utility
importlib.reload(utility)
import boto3
import time
import json
# NOTE(review): `suffix`, `data_sources` and `create_bedrock_execution_role_multi_ds`
# are not defined in the cells visible here — confirm where they come from.
vector_store_name = f'bedrock-sample-rag-{suffix}'
index_name = f"bedrock-sample-rag-index-{suffix}"

# S3 sources contribute bucket names; the other listed source types contribute secret ARNs
s3_bucket_names = [d["bucket_name"] for d in data_sources if d['type'] == 'S3']
credential_secret_arns = [
    d["credentialsSecretArn"]
    for d in data_sources
    if d['type'] in ('CONFLUENCE', 'SHAREPOINT', 'SALESFORCE')
]

bedrock_kb_execution_role = create_bedrock_execution_role_multi_ds(
    bucket_names=s3_bucket_names,
    secrets_arns=credential_secret_arns,
)
bedrock_kb_execution_role_arn = bedrock_kb_execution_role['Role']['Arn']

In [17]:
# Show the ARN read from bedrock_kb_execution_role['Role']['Arn']
bedrock_kb_execution_role_arn

'arn:aws:iam::827930657850:role/AmazonBedrockExecutionRoleForKnowledgeBase_825'

In [22]:
# Create the encryption, network and data access policies in OpenSearch Serverless (OSS),
# then request a vector search collection named after the vector store.
# NOTE(review): `boto3_session` and `create_policies_in_oss` are not defined in the
# cells visible here — confirm where they come from.
aoss_client = boto3_session.client('opensearchserverless')
oss_policies = create_policies_in_oss(
    vector_store_name=vector_store_name,
    aoss_client=aoss_client,
    bedrock_kb_execution_role_arn=bedrock_kb_execution_role_arn,
)
encryption_policy, network_policy, access_policy = oss_policies
collection = aoss_client.create_collection(name=vector_store_name, type='VECTORSEARCH')

In [23]:
# Pretty-print the create_collection response.
# NOTE(review): `pp` is not defined in the cells visible here — presumably a pprint.PrettyPrinter; confirm.
pp.pprint(collection)

{ 'ResponseMetadata': { 'HTTPHeaders': { 'connection': 'keep-alive',
                                         'content-length': '314',
                                         'content-type': 'application/x-amz-json-1.0',
                                         'date': 'Wed, 21 Aug 2024 15:32:42 '
                                                 'GMT',
                                         'x-amzn-requestid': '71ff13d0-7bd0-4678-a199-10c4aa350086'},
                        'HTTPStatusCode': 200,
                        'RequestId': '71ff13d0-7bd0-4678-a199-10c4aa350086',
                        'RetryAttempts': 0},
  'createCollectionDetail': { 'arn': 'arn:aws:aoss:us-east-1:827930657850:collection/myxkiwm5k0iupxlsyi24',
                              'createdDate': 1724254362484,
                              'id': 'myxkiwm5k0iupxlsyi24',
                              'kmsKeyArn': 'auto',
                              'lastModifiedDate': 1724254362484,
                             

In [24]:
# Build the OpenSearch Serverless host name: <collection id>.<region>.aoss.amazonaws.com
# NOTE(review): `region_name` is not defined in the cells visible here — confirm its source.
collection_id = collection['createCollectionDetail']['id']
host = ''.join((collection_id, '.', region_name, '.aoss.amazonaws.com'))
print(host)

myxkiwm5k0iupxlsyi24.us-east-1.aoss.amazonaws.com


In [25]:
import time
# Wait for collection creation.
# This can take a couple of minutes to finish.
# NOTE(review): `interactive_sleep` and `pp` are not defined in the cells visible here — confirm their source.
response = aoss_client.batch_get_collection(names=[vector_store_name])
collection_details = response['collectionDetails']

# Periodically check the collection status while it is still being created
while collection_details and collection_details[0]['status'] == 'CREATING':
    print('Creating collection...')
    interactive_sleep(30)
    response = aoss_client.batch_get_collection(names=[vector_store_name])
    collection_details = response['collectionDetails']

# Leaving the CREATING state is not the same as succeeding: the lookup may
# return nothing, or the collection may have ended in a non-ACTIVE status.
if not collection_details:
    raise RuntimeError(f"Collection '{vector_store_name}' was not found while waiting for creation.")
collection_status = collection_details[0]['status']
if collection_status != 'ACTIVE':
    raise RuntimeError(
        f"Collection '{vector_store_name}' finished in status {collection_status!r} instead of 'ACTIVE'."
    )

print('\nCollection successfully created:')
pp.pprint(response["collectionDetails"])

Creating collection...
..............................
Collection successfully created:
[ { 'arn': 'arn:aws:aoss:us-east-1:827930657850:collection/myxkiwm5k0iupxlsyi24',
    'collectionEndpoint': 'https://myxkiwm5k0iupxlsyi24.us-east-1.aoss.amazonaws.com',
    'createdDate': 1724254362484,
    'dashboardEndpoint': 'https://myxkiwm5k0iupxlsyi24.us-east-1.aoss.amazonaws.com/_dashboards',
    'id': 'myxkiwm5k0iupxlsyi24',
    'kmsKeyArn': 'auto',
    'lastModifiedDate': 1724254390191,
    'name': 'bedrock-sample-rag-798',
    'standbyReplicas': 'ENABLED',
    'status': 'ACTIVE',
    'type': 'VECTORSEARCH'}]


In [26]:
# Create the OpenSearch Serverless access policy and attach it to the Bedrock execution role.
# This stays best-effort (errors are reported, not raised), but the message now
# reflects the actual failure instead of always claiming the policy exists.
try:
    create_oss_policy_attach_bedrock_execution_role(collection_id=collection_id,
                                                    bedrock_kb_execution_role=bedrock_kb_execution_role)
    # It can take up to a minute for data access rules to be enforced
    interactive_sleep(60)
except Exception as e:
    # botocore-style errors expose the service error code under response['Error']['Code'].
    # NOTE(review): presumably the helper creates an IAM policy, whose duplicate error
    # code is 'EntityAlreadyExists' — confirm against the helper's implementation.
    error_response = getattr(e, 'response', None)
    error_info = error_response.get('Error', {}) if isinstance(error_response, dict) else {}
    error_code = error_info.get('Code') if isinstance(error_info, dict) else None
    if error_code == 'EntityAlreadyExists':
        print("Policy already exists")
    else:
        print("Failed to create or attach the OpenSearch Serverless policy:")
    pp.pprint(e)

Opensearch serverless arn:  arn:aws:iam::827930657850:policy/AmazonBedrockOSSPolicyForKnowledgeBase_825
............................................................