In [None]:
# Install required libraries
!pip3.8 install pymongo boto3 langchain langchain_openai gradio

In [None]:
# Load the required libraries
import json
import boto3
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.document_loaders.text import TextLoader
import gradio as gr
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymongo import MongoClient 
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.llms import Bedrock

In [None]:
# Set up a connection to your Amazon DocumentDB (MongoDB compatibility) cluster
# and select the database and collection used by the rest of the notebook.
# MongoClient is imported directly (`from pymongo import MongoClient`), so it is
# called by that name; the bare module name `pymongo` is not bound in this notebook.
client = MongoClient(
    "<Amazon DocumentDB database cluster connection string>",
    port=27017,
    username="<username>",
    password="<password>",
    # NOTE(review): retryable writes are disabled here, presumably because the
    # cluster does not support them - confirm against the DocumentDB documentation.
    retryWrites=False,
    tls='true',
    tlsCAFile="/home/ec2-user/global-bundle.pem",  # Check the path as per your destination
)
db = client.ragdemo
collection = db.rag

In [None]:
# Create the Amazon Bedrock runtime client - omit if using OpenAI.
# Replace the region_name placeholder with the AWS Region you want to use.
bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name='select region',
)

In [None]:
# If using OpenAI - store the API key, then select and configure the embedding
# model and the LLM, load and split the .txt file, and insert the chunks into the
# collection. Omit this entire block if using Amazon Bedrock.
# The code below is wrapped in a triple-quoted string, so it does not execute as
# written; remove the surrounding triple quotes to enable it.
"""
my_key= "<your Open AI key>"
embeddings = OpenAIEmbeddings(openai_api_key=my_key)
llm = OpenAI(openai_api_key=my_key, temperature=0)

# Load your .txt file by putting in the relevant path

loader = TextLoader('transcript.txt') #you can use .txt file of your choice
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(data)

#Using MongoDB Langchain integration as DocumentDB is compatible with MongoDB insert API

vectorStore = MongoDBAtlasVectorSearch.from_documents(documents=docs, embedding=embeddings, collection=collection) 

"""

In [None]:
#Omit entire block if using OpenAI 

# Embedding model and the content types used for direct invoke_model calls
model_id_embed = 'amazon.titan-embed-text-v1'
accept = 'application/json'
content_type = 'application/json'

# Initialize BedrockEmbeddings client
embeddings_client = BedrockEmbeddings(model_id=model_id_embed, client=bedrock_runtime)

# Load your .txt file
loader = TextLoader('transcript.txt')
data = loader.load()

# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
chunks = text_splitter.split_documents(data)

# Initialize a list to store embeddings from all chunks.
# Note: this list is not passed to the vector store below; from_documents
# receives the chunks together with embeddings_client.
all_embeddings = []

for chunk in chunks:
    # Extract 'page_content' for the current chunk
    chunk_content = chunk.page_content

    # Skip chunks whose content exceeds the maximum allowed length.
    # You might need to split such a chunk further or apply some other logic.
    if len(chunk_content) > 50000:
        print(f"A chunk exceeded the maximum allowed length: {len(chunk_content)} characters")
        continue

    # Prepare the request body with the chunk's content
    body = json.dumps({
        "inputText": chunk_content,
    })

    # Invoke the embedding model for the current chunk. The model id variable
    # defined above is model_id_embed; no variable named model_id exists.
    response = bedrock_runtime.invoke_model(
        body=body,
        modelId=model_id_embed,
        accept=accept,
        contentType=content_type
    )

    # Extract the embedding from the response and store it
    response_body = json.loads(response['body'].read())
    all_embeddings.append(response_body['embedding'])

#Using MongoDB Langchain integration as DocumentDB is compatible with MongoDB insert API

vectorStore = MongoDBAtlasVectorSearch.from_documents(
    documents=chunks,
    embedding=embeddings_client,
    collection=collection
)

In [None]:
# Create a HNSW vector search index named "my_index" on the "embedding" field.
# You can also create an ivfflat index.
# NOTE(review): "dimensions" must match the length of the vectors produced by
# the embedding model in use - confirm 1536 for your model.
collection.create_index(
    [("embedding", "vector")],
    vectorOptions=dict(
        type="hnsw",
        similarity="euclidean",
        dimensions=1536,
        m=16,
        efConstruction=64,
    ),
    name="my_index",
)

In [None]:
# Select and configure foundational model - Amazon Bedrock - Omit entire block if using OpenAI
#
# The LLM reuses the bedrock_runtime client created earlier in the notebook, the
# same way BedrockEmbeddings does, so the LLM and the embedding model use the
# same region and credentials instead of a separately resolved "default" profile.
claude = Bedrock(
    client=bedrock_runtime,
    model_id="anthropic.claude-instant-v1",
    model_kwargs={'temperature': 0.0, 'max_tokens_to_sample': 1400, 'top_k': 10},
)

llm = claude


# Optional: validate your connection by sending a test prompt to the LLM.
# The snippet is wrapped in a triple-quoted string, so it does not run as written;
# remove the surrounding triple quotes to enable it.
"""
response = llm.invoke("Provide a list of the top agricultural products of Florida")
print(response) 
"""

# Optional: print the model ids returned by list_foundation_models to see which
# models are available to you from Amazon Bedrock.
# Also wrapped in a triple-quoted string; remove the triple quotes to enable it.
"""
bedrock_client = boto3.client(
    service_name = "bedrock"
)

FM_list = bedrock_client.list_foundation_models()

for model in FM_list['modelSummaries']:
    print(model['modelId'])
"""

In [None]:
# Create a chat function - Amazon Bedrock - Omit this block if using OpenAI 

chat_history = []


def query_data(query, chat_history):
    """Answer a question using context retrieved from the collection.

    The query is embedded with ``embeddings_client``, a vector search over the
    "embedding" field of ``collection`` returns up to 2 matches, and the
    ``text`` field of each match is joined into the prompt context that is
    sent to the ``claude`` LLM.

    Args:
        query: The user's question.
        chat_history: Prior conversation; inserted into the prompt as-is.

    Returns:
        The output of running the LLM chain on the assembled prompt inputs.
    """
    # Embed the question and retrieve the nearest stored chunks
    query_vector = embeddings_client.embed_query(query)
    search_stage = {
        '$search': {
            "vectorSearch": {
                "vector": query_vector,
                "path": "embedding",
                "similarity": "euclidean",
                "k": 2,
            }
        }
    }
    context_documents = [match['text'] for match in collection.aggregate([search_stage])]

    # Prompt that combines retrieved context, chat history and the question
    prompt = PromptTemplate(
        input_variables=["context", "query", "chat_history"],
        template="Given this text extracts:\n-----\n{context}\n-----\n and also consider the history of this chat {chat_history}\nPlease answer the following question: {query}",
    )
    chain = LLMChain(prompt=prompt, llm=claude)

    # Run the chain on the assembled inputs and hand back its answer
    return chain.run(
        {
            "context": "\n".join(context_documents),
            "query": query,
            "chat_history": chat_history,
        }
    )

#Uncomment output print function for debugging as required 
#print(query_data("What is the name of the company?", chat_history))

In [None]:
# Create a chat function - OpenAI - Omit if using Amazon Bedrock
# The code below is wrapped in a triple-quoted string, so it does not execute as
# written; remove the surrounding triple quotes to enable it.
# Note: inside the string, the final print(query_data(...)) call is not commented
# out, so it will run once the triple quotes are removed.

"""
chat_history = []

def query_data(query, chat_history):
    embedded_query = embeddings.embed_query(query)
    docs = collection.aggregate([{'$search': {"vectorSearch" : {"vector" : embedded_query, "path": "embedding", "similarity": "euclidean", "k": 2}}}])
    result = [doc['text'] for doc in docs]

    # Create a PromptTemplate for the user's question
    question_prompt_template = PromptTemplate(
        input_variables=["context", "query", "chat_history"],
        template="Given this text extracts:\n-----\n{context}\n-----\n and also consider the history of this chat {chat_history}\nPlease answer the following question: {query}",
    )

    # Create an LLMChain
    llm_chain = LLMChain(prompt=question_prompt_template, llm=llm)

    # Get the user's question and context documents
    question = query
    context_documents = result

    # Prepare the input for the LLMChain
    input_data = {
        "context": "\n".join(context_documents),
        "query": question,
        "chat_history": chat_history,
    }

    # Run the LLMChain
    output = llm_chain.run(input_data)
    
    return output

#Uncomment output print function for debugging as required 
print(query_data("What is the name of the company?", chat_history))

"""

In [None]:
# Present the chatbot using Gradio.
# Builds a Blocks layout with a Markdown title and a ChatInterface that is given
# query_data as its handler, then launches the app.
with gr.Blocks() as demo:
    gr.Markdown(
    """
    # Amazon DocumentDB Powered Chatbot Demo
    """)
    # query_data must already be defined by one of the chat-function cells above
    gr.ChatInterface(query_data)

# Start the Gradio app
demo.launch()