# This notebook prepares the data that the agent will access.

## Structured data preparation

### Import CSV files into SQLite (this is mock data for demo purposes; none of the CSV files contain PII)

In [None]:
import sqlite3
import pandas as pd

# Function to create SQLite DB table from CSV
def create_db_table_from_csv(csv_file_path, db_name, table_name):
    """Load one CSV file into a table of an SQLite database.

    The CSV is read into a pandas DataFrame and written with
    ``DataFrame.to_sql`` using ``if_exists='replace'``, so an existing table
    with the same name is replaced. The DataFrame index is not written.

    Args:
        csv_file_path: Path of the CSV file to read.
        db_name: Path of the SQLite database file; it is created if it does
            not exist yet.
        table_name: Name of the table to (re)create from the CSV contents.

    Raises:
        Any exception raised by ``pd.read_csv``, ``sqlite3.connect`` or
        ``DataFrame.to_sql`` is propagated unchanged.
    """
    # Read the CSV first so a missing/invalid file never opens a connection.
    df = pd.read_csv(csv_file_path)

    # Connect to the SQLite database (or create it if it doesn't exist)
    conn = sqlite3.connect(db_name)
    try:
        # Write the DataFrame to an SQLite table
        df.to_sql(table_name, conn, if_exists='replace', index=False)
    finally:
        # Always release the connection, even when the write fails, so the
        # database file is not left open by a failed notebook cell.
        conn.close()


In [None]:
def create_db_tables_from_csv_files(csv_file_paths, db_name, table_names):
    """Load several CSV files into tables of one SQLite database.

    The i-th CSV path is loaded into the i-th table name by delegating to
    ``create_db_table_from_csv``.

    Args:
        csv_file_paths: Sequence of CSV file paths.
        db_name: SQLite database file shared by all tables.
        table_names: Sequence of table names, one per CSV file.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    # Refuse to pair files and tables when the counts disagree.
    counts_match = len(csv_file_paths) == len(table_names)
    if not counts_match:
        raise ValueError("The number of CSV files must match the number of table names.")

    # Walk both sequences in lockstep, importing one CSV per table.
    for source_csv, target_table in zip(csv_file_paths, table_names):
        create_db_table_from_csv(source_csv, db_name, target_table)


# Source CSV files, listed in the same order as the target tables below.
csv_file_paths = [
    'data/porterville_student_schedule.csv',
    'data/porterville_student_data.csv',
    'data/porterville_course_schedule.csv',
]

# SQLite database file that receives the tables.
db_name = 'porterville_academic.db'

# Target table names; entry i is the table for csv_file_paths[i].
table_names = [
    'student_schedule',
    'student_data',
    'course_schedule',
]

# Import every CSV into its table.
create_db_tables_from_csv_files(csv_file_paths, db_name, table_names)

## Knowledge base (KB) for unstructured data

In [9]:
import boto3
from utils.knowledge_base import BedrockKnowledgeBase
import time

# Capture the current time once; the names below are derived from it.
current_time = time.time()

# Render local time as YYYYmmddHHMMSS, then keep only the last 7 characters
# (the final digit of the day followed by HHMMSS).
local_struct = time.localtime(current_time)
timestamp_str = time.strftime("%Y%m%d%H%M%S", local_struct)[-7:]

# The suffix is the truncated timestamp string itself.
suffix = str(timestamp_str)

In [11]:
import pprint
# S3 bucket for the knowledge base documents; its name embeds `suffix`.
data_bucket_name = f'bedrock-kb-{suffix}-1' # replace it with your first bucket name.

# A single S3 data source pointing at that bucket.
s3_source = {"type": "S3", "bucket_name": data_bucket_name}
data_sources = [s3_source]

# Pretty-printer with a 2-space indent.
pp = pprint.PrettyPrinter(indent=2)

In [12]:
# Knowledge base name; it ends with `suffix`, the timestamp-derived string
# defined earlier in this notebook.
knowledge_base_name = f"course-catalogue-sample-kb-{suffix}"
# Free-text description passed to the knowledge base on creation.
knowledge_base_description = "course catalogue"

In [13]:
# Instantiate the project's BedrockKnowledgeBase helper (imported from
# utils.knowledge_base). Judging by the recorded cell output, construction
# provisions the S3 bucket, execution role, OSS policies and collection, so
# this cell can take a few minutes.
knowledge_base = BedrockKnowledgeBase(
    kb_name=knowledge_base_name,
    kb_description=knowledge_base_description,
    data_sources=data_sources,
    chunking_strategy="FIXED_SIZE",
    # The helper gets its own suffix: the run suffix with "-f" appended.
    suffix=f'{suffix}-f',
)

Step 1 - Creating or retrieving S3 bucket(s) for Knowledge Base documents
['bedrock-kb-4222921-1']
buckets_to_check:  ['bedrock-kb-4222921-1']
Creating bucket bedrock-kb-4222921-1
Step 2 - Creating Knowledge Base Execution Role (AmazonBedrockExecutionRoleForKnowledgeBase_4222921-f) and Policies
Step 3 - Creating OSS encryption, network and data access policies
Step 4 - Creating OSS Collection (this step takes a couple of minutes to complete)
{ 'ResponseMetadata': { 'HTTPHeaders': { 'connection': 'keep-alive',
                                         'content-length': '320',
                                         'content-type': 'application/x-amz-json-1.0',
                                         'date': 'Tue, 04 Feb 2025 22:31:01 '
                                                 'GMT',
                                         'x-amzn-requestid': '9260fa9f-f4d3-4973-90fc-88aee4ba2808'},
                        'HTTPStatusCode': 200,
                        'RequestId': '9260fa9f-f4

### Download the data and ingest it into the KB

In [5]:
import requests
# Download the PDF
url = "https://portervillecollege.edu/_resources/assets/pdfs/Academics/2024-2025_Catalog.pdf"
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=60)
# Stop here on an HTTP error (4xx/5xx); otherwise the error page body would be
# saved as the "PDF" and later uploaded and ingested into the knowledge base.
response.raise_for_status()

# Save PDF locally first
local_file = "2024-2025_Catalog.pdf"
with open(local_file, 'wb') as f:
    f.write(response.content)

In [14]:
# Upload the locally saved file to the data bucket, using the local file name
# as the object key as well.
s3_client = boto3.client('s3')
s3_client.upload_file(
    Filename=local_file,
    Bucket=data_bucket_name,
    Key=local_file,
)

print(f"File uploaded to S3 at: {data_bucket_name}")

File uploaded to S3 at: bedrock-kb-4222921-1


In [15]:
# Give the knowledge base time to become available before syncing.
KB_READY_WAIT_SECONDS = 30
time.sleep(KB_READY_WAIT_SECONDS)

# Sync the knowledge base by starting an ingestion job.
knowledge_base.start_ingestion_job()

job 1 started successfully

{ 'dataSourceId': 'UFTEMSROVW',
  'ingestionJobId': 'UM2QF5J5R4',
  'knowledgeBaseId': 'FI6FUNO3UI',
  'startedAt': datetime.datetime(2025, 2, 4, 22, 35, 38, 696452, tzinfo=tzlocal()),
  'statistics': { 'numberOfDocumentsDeleted': 0,
                  'numberOfDocumentsFailed': 0,
                  'numberOfDocumentsScanned': 1,
                  'numberOfMetadataDocumentsModified': 0,
                  'numberOfMetadataDocumentsScanned': 0,
                  'numberOfModifiedDocumentsIndexed': 0,
                  'numberOfNewDocumentsIndexed': 1},
  'status': 'COMPLETE',
  'updatedAt': datetime.datetime(2025, 2, 4, 22, 36, 56, 926867, tzinfo=tzlocal())}
........................................

In [16]:
# keep the kb_id for Agent
kb_id = knowledge_base.get_knowledge_base_id()
# Persist kb_id with IPython's %store magic so that another notebook can
# restore it with `%store -r kb_id`.
%store kb_id

'FI6FUNO3UI'
Stored 'kb_id' (str)
