# Introduction to Bedrock - Code Generation - Natural Language to SQL Using RAG

1.  Set up the Bedrock embedding model and the LLM used to embed the table metadata.
2.  Fetch the table metadata from Athena, ask the LLM which questions can be answered from each table's schema, then embed the questions and metadata in a vector store.

**The suggested SageMaker JupyterLab notebook environment setup is as follows:**

SageMaker Image: sagemaker-distribution-cpu

Kernel: Python 3

Instance Type: ml.m5.large

# Dependency installation

Here we will install all the required dependencies to run this notebook. 

In [None]:
!python -m ensurepip --upgrade
!pip install langchain --quiet
!pip install jq --quiet
!pip install faiss-cpu --quiet

**Restart your Kernel before proceeding**

## 1. Configure the Bedrock embedding model and LLM

In [None]:
import os
import json
from langchain.document_loaders.json_loader import JSONLoader
from langchain.docstore.document import Document
import json
import boto3
from botocore.config import Config
import re
from langchain.vectorstores import FAISS

![Alt text](content/rag.png)

#### Now let's set up our connection to the Amazon Bedrock SDK using Boto3

In [None]:
# Notebook-wide settings; replace the placeholders before running the cells below.
# ATHENA_RESULTS_S3_LOCATION is later passed as the `s3_staging_dir` parameter of the
# Athena connection string.
# NOTE(review): that parameter presumably expects a full s3:// URI rather than a bare
# bucket name - confirm against the workshop's CloudFormation output.
ATHENA_RESULTS_S3_LOCATION = "<workshop bucket name>" # available in cloudformation outputs
# Passed as the `catalog_name` parameter of the Athena connection string.
ATHENA_CATALOG_NAME = "<athena catalog name>" # available in cloudformation outputs
# Athena database whose tables are reflected and described to the LLM.
DB_NAME = "tpcds1"
# Local directory where add_new_table saves (and, when incremental, reloads) the FAISS index.
DB_FAISS_PATH = './vectorstore/db_faiss'

In [None]:
# Bedrock and Athena are both addressed in the region of the default boto3 session.
# NOTE(review): region_name is presumably None when no default region is configured - confirm.
bedrock_region = athena_region = boto3.session.Session().region_name

In [None]:
# Raise botocore's retry 'max_attempts' setting to 100 for calls made through this client.
retry_config = Config(retries = {'max_attempts': 100})
session = boto3.Session(region_name=bedrock_region)
# 'bedrock-runtime' client; ask_llm uses it for invoke_model and it is also handed to
# the embeddings wrapper.
bedrock = session.client('bedrock-runtime', region_name=bedrock_region, config=retry_config)

### 1.2. Configure LLM Model

In [None]:
def ask_llm(question,modelId):
    """
    Send a prompt to a Bedrock model and return the decoded JSON response.

    Model ids containing 'titan' receive an ``inputText`` /
    ``textGenerationConfig`` payload; every other model id receives a
    ``prompt`` payload with sampling parameters.

    :question : prompt text forwarded to the model
    :modelId  : Bedrock model identifier; also selects the payload format
    :return   : the response body parsed from JSON (it is also printed)
    """
    if 'titan' in modelId:
        payload = {
            "inputText": question,
            "textGenerationConfig": {"maxTokenCount": 200, "temperature": 0.001},
        }
    else:
        payload = {
            "prompt": question,
            "max_tokens_to_sample": 4096,
            "temperature": 0.5,
            "top_k": 250,
            "top_p": 0.5,
        }

    response = bedrock.invoke_model(
        body=json.dumps(payload),
        modelId=modelId,
        accept='application/json',
        contentType='application/json',
    )

    # The response body is a stream; read it fully before decoding.
    response_body = json.loads(response.get('body').read())
    print(response_body)
    return response_body

### 1.3 Helper functions 

Once the LLM returns the list of questions that can be answered using the given table and its columns, store them locally in a JSON file.

In [None]:
def write_questions_to_file(question_list_filename, table_name, table_schema, answer):
    """
    Parse the LLM answer into individual questions and store them as a JSON list.

    Every line of ``answer`` that contains a '?' is treated as a question. A
    leading list marker ("1.", "2)", "-", "*" or a bullet character, optionally
    indented) and the surrounding whitespace are removed before the question
    is stored.

    :question_list_filename : path of the JSON file to (over)write
    :table_name             : stored as "tableName" with every question
    :table_schema           : stored as "tableSchema" (leading spaces removed)
    :answer                 : raw multi-line text returned by the LLM
    """
    # Anchored at the start of the line so digits or dashes that appear
    # inside the question text are never touched.
    list_marker = re.compile(r"^\s*(?:\d{1,5}[.)]|[-*\u2022])\s*")
    schema = table_schema.lstrip(" ")

    questions_list = answer.splitlines()
    print(questions_list)

    data_list = []
    for line in questions_list:
        # Skip if it doesn't really have a question
        if "?" not in line:
            continue

        question = list_marker.sub("", line, count=1).strip()
        if not question:
            continue

        data_list.append(
            {
                "tableName": table_name,
                "question": question,
                "tableSchema": schema,
            }
        )

    with open(question_list_filename, mode="w", encoding="utf-8") as file:
        json.dump(data_list, file)

 ### 1.4  Configure Bedrock Embedding Model

In [None]:
from langchain.embeddings import BedrockEmbeddings
# Embedding wrapper that reuses the bedrock-runtime client created above.
# NOTE(review): no model id is passed, so the library's default embedding model is
# used - confirm which model that is for the installed langchain version.
bedrock_embeddings = BedrockEmbeddings(client=bedrock)

### 1.5 Create the Document object with correct metadata for embedding

In [None]:
# Create new docs with the right metadata we need for indexing
def create_docs_with_correct_metadata(documents):
    """
    Re-wrap loaded question records as Documents suitable for indexing.

    Each input document's ``page_content`` is decoded as JSON and must contain
    the keys "tableName", "question" and "tableSchema". The returned Document
    uses the question as its ``page_content`` (the text that gets embedded)
    and carries all three fields as metadata.

    :documents : iterable of documents whose page_content is a JSON object
    :return    : list of new Document objects, one per input document
    """
    wanted_keys = ("tableName", "question", "tableSchema")
    new_docs = []

    for doc in documents:
        contents = json.loads(doc.page_content)

        # Keep only the fields needed at retrieval time; a missing key raises KeyError.
        new_metadata = {key: contents[key] for key in wanted_keys}

        new_docs.append(
            Document(page_content=new_metadata["question"], metadata=new_metadata)
        )

    return new_docs

def load_json_file(filename):
    """
    Load a JSON array file into Langchain documents, one per array element.

    :filename : path of a JSON file whose top level is an array
    :return   : whatever JSONLoader.load() produces for the ".[]" jq schema
    """
    # text_content=False is passed because the array elements are objects, not plain strings.
    return JSONLoader(
        file_path=filename,
        jq_schema=".[]",
        text_content=False,
    ).load()

### 1.6 Prompt the LLM for natural-language questions, then embed the questions and table metadata using the helper functions

In [None]:
# This function asks the LLM to inspect a table schema, generate some questions which could be answered
# by that schema, and then it stores those questions to file, loads them all into a single vectorDB
def add_new_table(schema, table_name,model_id,is_incremental):
    """
    Ask the LLM for questions a table can answer and index them in FAISS.

    The generated questions are written to ``../questionList<table_name>.json``,
    reloaded as documents, embedded, and saved under DB_FAISS_PATH.

    :schema         : column listing of the table; interpolated into the prompt and stored with every question
    :table_name     : table name; used in the prompt, the stored metadata and the question file name
    :model_id       : Bedrock model id forwarded to ask_llm
    :is_incremental : when truthy and an index already exists under DB_FAISS_PATH, the new
                      questions are merged into it; otherwise the index is created from this
                      table's questions only (replacing whatever is saved there)
    """
    print(f"Adding table {table_name} with schema {schema}")

    # Claude text-completion prompts must begin with "\n\nHuman:" and end with
    # "\n\nAssistant:"; the instruction text itself is unchanged.
    prompt = f"""\n\nHuman:
    only return the a bulleted numbered list of unique and detailed questions that could be answered by this table called {table_name} with schema:
    {schema}.
    Instructions:
        Use natural language descriptions only.
        Do not use SQL.
        Produced a varied list of questions, but the questions should be unique and detailed.
        The questions should be in a format that is easy to understand and answer.
        Ask about as much of the information in the table as possible.
        You can ask about more than one aspect of the data at a time.
        Qustions should begin with, 'What', 'Which', 'How', 'When' or 'Can'. Use variable names. 
        The questions should use relevant buisness vocabularly and terminology only. 
        Do not use column names in your output - use relevant natural language descriptions only. 
        Do not output any numeric values.
        Output questions starting with bulleted numbered list. 
         
        \n Questions: 1.
        \n\nAssistant:"""

    response = ask_llm(prompt, model_id)

    # ask_llm also supports Titan models, whose responses carry the generated
    # text under results[0]['outputText'] instead of 'completion'.
    if 'completion' in response:
        answer = response['completion']
    else:
        answer = response['results'][0]['outputText']

    question_list_filename = f"../questionList{table_name}.json"

    print(
        f"Writing questions to {question_list_filename}, with schema {schema}, with table name {table_name} and answer {answer}.\n\n"
    )

    write_questions_to_file(question_list_filename, table_name, schema, answer)
    docs = create_docs_with_correct_metadata(load_json_file(question_list_filename))

    # Building a FAISS index needs at least one document; skip tables for
    # which the model produced no usable question.
    if not docs:
        print(f"No questions were generated for table {table_name}; vector store left unchanged.")
        return

    new_questions = FAISS.from_documents(docs, bedrock_embeddings)
    db_exists = os.path.exists(f"{DB_FAISS_PATH}/index.faiss")

    # Add new tables
    if is_incremental and db_exists:
        question_db = FAISS.load_local(DB_FAISS_PATH, bedrock_embeddings)
        question_db.merge_from(new_questions)
        question_db.save_local(DB_FAISS_PATH)

    # Load for the first time
    else:
        print(f"is_incremental set to {str(is_incremental)} and/or no vector db found. Creating...")
        new_questions.save_local(DB_FAISS_PATH)

 ### 2.  Fetch tpc-ds dataset Tables and Columns information 

In [None]:
from sqlalchemy import MetaData
from sqlalchemy import create_engine
import boto3
from botocore.config import Config



def get_sqlalchemy_athena(database,catalog,s3stagingathena,region):
    """
    Create a SQLAlchemy engine for Athena using the ``awsathena+rest`` dialect.

    No access key or secret is embedded in the URL (the credentials part is
    left empty).

    :database        : Athena database (schema) name placed in the URL path
    :catalog         : value of the ``catalog_name`` query parameter
    :s3stagingathena : value of the ``s3_staging_dir`` query parameter
    :region          : AWS region used to build the Athena endpoint host name
    :return          : the engine returned by create_engine
    """
    from urllib.parse import quote_plus

    # Query values are URL-encoded so characters such as ':', '/', '&' or
    # spaces in the staging location cannot corrupt the connection string.
    athena_connection_str = (
        f'awsathena+rest://:@athena.{region}.amazonaws.com:443/{database}'
        f'?s3_staging_dir={quote_plus(s3stagingathena)}'
        f'&catalog_name={quote_plus(catalog)}'
    )
    # Create Athena engine
    return create_engine(athena_connection_str)


def get_tpc_ds_dataset(database,catalog,s3stagingathena,region):
    """
    Reflect the Athena database and describe every table as a column string.

    :database        : Athena database to reflect
    :catalog         : Athena catalog name
    :s3stagingathena : S3 staging location for Athena query results
    :region          : AWS region of the Athena endpoint
    :return          : list of ``(columns_str, table_name)`` tuples, where
                       ``columns_str`` is every column name followed by '|'
                       (so the string ends with a trailing '|')
    """
    # Reflect db schema
    metadata = MetaData()
    engine = get_sqlalchemy_athena(database,catalog,s3stagingathena,region)
    metadata.reflect(bind=engine)

    # Get list of table names
    print(metadata.tables.keys())

    column_table = []
    for table_name, table in metadata.tables.items():
        column_names = table.columns.keys()
        print(f"Table: {table_name}")
        print(f"Schema: {table.schema}")
        print(f"Columns: {column_names}")

        # Each column name is terminated (not separated) by '|'.
        columns_str = "".join(f"{column}|" for column in column_names)
        column_table.append((columns_str, table_name))

    return column_table



In [None]:
# List of (columns_str, table_name) tuples, one per table reflected from the Athena database.
tpc_ds = get_tpc_ds_dataset(DB_NAME,ATHENA_CATALOG_NAME,ATHENA_RESULTS_S3_LOCATION,athena_region)

### Embed all the questions and metadata to a vector store


In [None]:
# Model used to generate the questions for every table.
model_id = 'anthropic.claude-v2'
# Each entry is (columns_str, table_name): x[0] is passed as the schema and x[1] as the
# table name. is_incremental=True merges each table's questions into the index already
# saved under DB_FAISS_PATH, if there is one.
for x in tpc_ds:
    print(x)
    add_new_table(x[0],x[1],model_id,True)

### We can now execute the inference notebook to Ask a question