In [21]:
#cell 1
import json
from urllib.parse import quote_plus

import boto3
from botocore.exceptions import ClientError
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import BedrockEmbeddings
from langchain.llms import Bedrock
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocumentDBVectorSearch
from pymongo import MongoClient

In [24]:
#cell 2
# Fetch the DocumentDB credentials from AWS Secrets Manager rather than hardcoding them in the notebook.
# For more info on retrieving secrets see
# https://docs.aws.amazon.com/secretsmanager/latest/userguide/retrieving-secrets.html#retrieving-secrets-code

secret_name = "<secret name>"
region_name = "<region>"

# Create a Secrets Manager client
session = boto3.session.Session()
secrets_client = session.client(
    service_name='secretsmanager',
    region_name=region_name
)

try:
    get_secret_value_response = secrets_client.get_secret_value(
        SecretId=secret_name
    )
except ClientError:
    # For a list of exceptions thrown, see
    # https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    # Add handling for specific error codes here; a bare `raise` propagates the error unchanged.
    raise

# A secret stored as binary is returned under 'SecretBinary' and has no 'SecretString';
# fail with an explicit message instead of an unexplained KeyError.
if 'SecretString' not in get_secret_value_response:
    raise KeyError(
        f"Secret {secret_name!r} has no 'SecretString'; this notebook expects a JSON string secret"
    )

# The secret is expected to be a JSON object; the next cell reads its
# 'username', 'password', 'host' and 'port' fields.
secret = json.loads(get_secret_value_response['SecretString'])


In [25]:
#cell 3
# Unpack the connection parameters from the secret loaded in the previous cell.
# A missing field surfaces as a KeyError naming that field.
username, password, host, port = (
    secret[field] for field in ('username', 'password', 'host', 'port')
)

# Path to the CA certificate file, passed as tlsCAFile in the connection string. See
# https://docs.aws.amazon.com/documentdb/latest/developerguide/connect_programmatically.html#connect_programmatically-tls_enabled
tls_file_location = '/path/to/tls/file.pem'

In [26]:
#cell 4
# Build the DocumentDB connection URI and get handles to the target database and collection.
# See the following for information on connecting to DocumentDB:
# https://docs.aws.amazon.com/documentdb/latest/developerguide/connect_programmatically.html
#
# The username and password are percent-escaped with quote_plus: PyMongo requires credentials in a
# URI to be escaped per RFC 3986, so a password containing '@', ':', '/' or '%' would otherwise
# be rejected or misparsed.
connection_string = (
    f'mongodb://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/'
    f'?tls=true&tlsCAFile={tls_file_location}&replicaSet=rs0'
    '&readPreference=secondaryPreferred&retryWrites=false'
)
client = MongoClient(connection_string)
database = client.database  # pymongo supports dot notation: for a database named "database", use client.database
collection = database.collection  # likewise, access a collection within a database as database.collection

In [28]:
#cell 5
# Create a vector index named "hnsw" on the field "vectorContent".
# By default, Langchain will insert chunks with the following fields: vectorContent, source, page, textContent
# See the following for the vector options available when creating an index:
# https://docs.aws.amazon.com/documentdb/latest/developerguide/vector-search.html#w5aac21c11c11

hnsw_vector_options = {
    "type": "hnsw",
    "similarity": "<similarity>",
    "dimensions": 1536,
    "m": 16,
    "efConstruction": 64,
}
collection.create_index(
    [("vectorContent", "vector")],
    name="hnsw",
    vectorOptions=hnsw_vector_options,
)

'hnsw'

In [29]:
#cell 6
# Initialize a boto3 Bedrock runtime client used to call LLMs with Bedrock, see
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime.html
bedrock_client = boto3.client(
    service_name='bedrock-runtime',
    region_name=region_name,
)

In [30]:
# Load every PDF in a directory and chunk it with the Langchain recursive text splitter.
# The resulting `docs` chunks are what gets embedded and ingested into DocumentDB in the next cell.

files = '/path/to/files/'  # just use the directory, do not use path to specific files

loader = PyPDFDirectoryLoader(files)
documents = loader.load()

# chunk_size / chunk_overlap are passed straight to RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)



In [31]:
#cell 7
# Embed the chunked documents with the Amazon Titan embeddings model and ingest them into the
# DocumentDB collection; the resulting vector store can later be used as a retriever for Retrieval QA.
INDEX_NAME = "hnsw"
embeddings = BedrockEmbeddings(
    client=bedrock_client,
    model_id="amazon.titan-embed-text-v1",
)
vector_store = DocumentDBVectorSearch.from_documents(
    collection=collection,
    documents=docs,
    embedding=embeddings,
    index_name=INDEX_NAME,
)

In [44]:
#cell 8
# Initialize an existing DocumentDB collection as a callable vector store.
# Use this when you do not need to ingest documents like the code above.
name_space = "database.collection"  # a namespace is "<database>.<collection>" in dot notation
vector_store = DocumentDBVectorSearch.from_connection_string(
    embedding=embeddings,
    index_name="hnsw",
    namespace=name_space,
    connection_string=connection_string,
)

In [45]:
#cell 9
# Initialize Anthropic Claude on Amazon Bedrock as the reasoning agent
llm = Bedrock(
    client=bedrock_client,
    model_id="anthropic.claude-v2:1",
)

In [46]:
#cell 10
# Create the prompt template used by the RetrievalQA chain, see
# https://api.python.langchain.com/en/latest/prompts/langchain_core.prompts.prompt.PromptTemplate.html
# {context} and {question} are the two placeholders declared as input variables below.
prompt_template = (
    "Use the following pieces of context to answer the question at the end. "
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "context: {context}\n"
    "Question: {question}\n"
)
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [47]:
#cell 11
# Expose the DocumentDB vector store as a retriever: similarity search with k=5
retriever_search_kwargs = {"k": 5}
retriever = vector_store.as_retriever(
    search_kwargs=retriever_search_kwargs,
    search_type="similarity",
)

In [None]:
#cell 12
# Perform RAG with a RetrievalQA chain, see
# https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval_qa.base.RetrievalQA.html
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

output = qa.invoke({"query": "<query>"})

# Echo the query followed by the generated answer
for label, key in (("query: ", "query"), ("result: ", "result")):
    print(label, output[key])

In [None]:
#cell 13
# A different way to perform RAG: an LLMChain whose prompt takes the query, chat history and context as inputs
chat_history = []

query = "<query>"
# The search hits get their own name so the `docs` chunk list produced by the chunking cell is not
# overwritten; otherwise re-running the ingestion cell after this one would insert search hits
# instead of the PDF chunks.
retrieved_docs = vector_store.similarity_search(query)
result = [doc.page_content for doc in retrieved_docs]

# Create a PromptTemplate for the user's question
question_prompt_template = PromptTemplate(
    input_variables=["context", "query", "chat_history"],
    template="Given this text extracts:\n-----\n{context}\n-----\n and also consider the history of this chat {chat_history}\nPlease answer the following question: {query}",
)

# Create an LLMChain
llm_chain = LLMChain(prompt=question_prompt_template, llm=llm)

# Get the user's question and context documents
question = query
context_documents = result

# Prepare the input for the LLMChain; the retrieved chunks are joined into a single context string
input_data = {
    "context": "\n".join(context_documents),
    "query": question,
    "chat_history": chat_history,
}

# Run the LLMChain and print the inputs alongside the generated text
output = llm_chain.invoke(input_data)
print("context: ", output['context'])
print("query: ", output['query'])
print("output: ", output['text'])

In [None]:
#cell 14
# Perform vector search directly with an aggregation pipeline, see
# https://docs.aws.amazon.com/documentdb/latest/developerguide/vector-search.html#w5aac21c11c15
embedded_query = embeddings.embed_query("<query>")
search = collection.aggregate([
    {
        '$search': {
            "vectorSearch": {
                "vector": embedded_query,
                "path": "vectorContent",
                "similarity": "<similarity>",
                "k": 5,
            }
        }
    }
])
results = list(search)
text = [result['textContent'] for result in results]

# Print however many chunks came back: indexing text[0]..text[4] directly raises IndexError
# whenever the search returns fewer than five documents.
for chunk in text:
    print(chunk, "\n")