In [None]:
# Install required libraries
!pip3.8 install pymongo langchain gradio

In [None]:
# Load the required libraries
import os

import gradio as gr
import pymongo
from langchain.chains import LLMChain
from langchain.document_loaders.text import TextLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings

In [None]:
# Set up a connection to your Amazon DocumentDB (MongoDB compatibility) cluster
# and select the database and collection used by this demo.
#
# Credentials are read from environment variables so they never have to be saved
# inside the notebook. The placeholder defaults are kept only as a fallback: either
# export DOCDB_URI / DOCDB_USERNAME / DOCDB_PASSWORD (and optionally DOCDB_CA_FILE)
# before starting Jupyter, or replace the placeholders below.
client = pymongo.MongoClient(
    os.environ.get("DOCDB_URI", "<Amazon DocumentDB database cluster connection string>"),
    port=27017,
    username=os.environ.get("DOCDB_USERNAME", "<username>"),
    password=os.environ.get("DOCDB_PASSWORD", "<password>"),
    retryWrites=False,
    tls=True,  # boolean instead of the string 'true'
    # Path to the CA bundle; check that it matches where you downloaded the file.
    tlsCAFile=os.environ.get("DOCDB_CA_FILE", "/home/ec2-user/global-bundle.pem"),
)
db = client.ragdemo
collection = db.rag

In [None]:
# Set up your OpenAI key.
# Prefer exporting OPENAI_API_KEY before launching Jupyter so the secret is never
# stored in the notebook; the placeholder is only a fallback for quick local edits.
my_key = os.environ.get("OPENAI_API_KEY", "<your Open AI key>")
embeddings = OpenAIEmbeddings(openai_api_key=my_key)

In [None]:
# Load your .txt file by putting in the relevant path (any .txt file of your choice works)
TRANSCRIPT_PATH = '/home/ec2-user/sample_files/transcript.txt'

loader = TextLoader(TRANSCRIPT_PATH)
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(data)

# Using the MongoDB LangChain integration, as DocumentDB is compatible with the MongoDB insert API.
#
# Ingestion is guarded so that re-running this cell (e.g. Restart & Run All) does not
# embed and insert the same chunks a second time, which would make the vector search
# return duplicate context.
# NOTE(review): the guard assumes the vector store writes each chunk's `source`
# metadata as a top-level field of the stored document. If that is not the case the
# lookup simply finds nothing and the cell ingests exactly as before.
already_ingested = collection.find_one({"source": TRANSCRIPT_PATH}, {"_id": 1}) is not None

if already_ingested:
    # Reuse the existing documents; just wrap the collection in a vector store object.
    vectorStore = MongoDBAtlasVectorSearch(collection=collection, embedding=embeddings)
else:
    vectorStore = MongoDBAtlasVectorSearch.from_documents(documents=docs, embedding=embeddings, collection=collection)

In [None]:
# Build an HNSW vector search index on the "embedding" field.
# (An ivfflat index is an alternative.)

# Index settings passed through as `vectorOptions`. The similarity metric here is the
# same one the search stage in the chat function asks for.
# "dimensions" is presumably the length of the vectors produced by `embeddings` --
# confirm it for the embedding model you use.
HNSW_INDEX_OPTIONS = {
    "type": "hnsw",
    "similarity": "euclidean",
    "dimensions": 1536,
    "m": 16,
    "efConstruction": 64,
}

collection.create_index(
    [("embedding", "vector")],
    vectorOptions=HNSW_INDEX_OPTIONS,
    name="my_index",
)

In [None]:
# Instantiate the OpenAI LLM. No model name is passed, so the LangChain wrapper's
# default model is used; temperature is set to 0.
llm = OpenAI(
    temperature=0,
    openai_api_key=my_key,
)

In [None]:
# Create a chat function

chat_history = []

# Reply returned when the user submits a blank message.
EMPTY_QUERY_REPLY = "Please enter a question."


def _format_chat_history(chat_history):
    """Render the chat history as a plain-text transcript for the prompt.

    Accepts the shapes a chat UI commonly supplies: a list of
    ``[user_message, assistant_message]`` pairs, or a list of
    ``{"role": ..., "content": ...}`` dicts. Anything else is stringified as-is.
    """
    if not chat_history:
        return "(no previous messages)"
    if isinstance(chat_history, str):
        return chat_history

    lines = []
    for turn in chat_history:
        if isinstance(turn, dict):
            role = str(turn.get("role", "user")).capitalize()
            lines.append(f"{role}: {turn.get('content', '')}")
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            user_msg, bot_msg = turn
            if user_msg is not None:
                lines.append(f"User: {user_msg}")
            if bot_msg is not None:
                lines.append(f"Assistant: {bot_msg}")
        else:
            lines.append(str(turn))
    return "\n".join(lines)


def query_data(query, chat_history, k=2):
    """Answer ``query`` using the most similar stored chunks as context.

    Args:
        query: The user's question.
        chat_history: Previous turns of the conversation (supplied by the chat UI).
        k: Number of nearest chunks to retrieve from the collection. Defaults to 2.

    Returns:
        The LLM's answer as returned by the chain, or a short prompt asking for a
        question when ``query`` is empty or whitespace-only.
    """
    # A blank message has nothing to embed or answer; reply without calling any API.
    if not query or not str(query).strip():
        return EMPTY_QUERY_REPLY

    # Embed the question and run a vector search over the "embedding" field.
    embedded_query = embeddings.embed_query(query)
    matches = collection.aggregate([
        {
            "$search": {
                "vectorSearch": {
                    "vector": embedded_query,
                    "path": "embedding",
                    "similarity": "euclidean",
                    "k": k,
                }
            }
        }
    ])
    # Skip hits that carry no text instead of failing with a KeyError.
    context_documents = [doc["text"] for doc in matches if doc.get("text")]

    # Create a PromptTemplate for the user's question
    question_prompt_template = PromptTemplate(
        input_variables=["context", "query", "chat_history"],
        template="Given this text extracts:\n-----\n{context}\n-----\n and also consider the history of this chat {chat_history}\nPlease answer the following question: {query}",
    )

    # Create an LLMChain
    llm_chain = LLMChain(prompt=question_prompt_template, llm=llm)

    # Prepare the input for the LLMChain
    input_data = {
        "context": "\n".join(context_documents),
        "query": query,
        # A readable transcript rather than the raw Python repr of the history object.
        "chat_history": _format_chat_history(chat_history),
    }

    # Run the LLMChain
    output = llm_chain.run(input_data)

    return output

In [None]:
# Serve the chatbot through a small Gradio app: a Markdown header followed by a
# chat interface that calls `query_data` for each message.
PAGE_HEADER = """
    # Amazon DocumentDB Powered Chatbot Demo
    """

demo = gr.Blocks()
with demo:
    gr.Markdown(PAGE_HEADER)
    gr.ChatInterface(fn=query_data)

demo.launch()