In [None]:
!pip install IProgress ipywidgets langchain_community langchain_openai opensearch_py tqdm unstructured networkx

Start a local OpenSearch node by running:

```bash
podman run -d -p 9200:9200 -p 9600:9600 --name elastic -e "discovery.type=single-node" -e "plugins.security.disabled=true" opensearchproject/opensearch:latest
```

# Knowledge Base Chatbot - LangChain + OpenSearch

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.document_loaders import DirectoryLoader
from tqdm import tqdm
from getpass import getpass

# Connection settings are prompted for at run time (input is not echoed), so
# nothing is hard-coded in the notebook.
openai_api_key = getpass("OpenAI API Key: ")
es_url = getpass("Opensearch URL: ")
index_name = getpass("Opensearch Index Name: ")

# Embedding model handle; it is passed to the vector store below.
hf = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=openai_api_key,
)

# Vector-store handle bound to the chosen index.
# NOTE(review): ssl_verify=False presumably disables TLS certificate checks;
# acceptable for a local dev node, confirm before pointing at a real cluster.
db = OpenSearchVectorSearch(
    opensearch_url=es_url,
    index_name=index_name,
    embedding_function=hf,
    ssl_verify=False,
)

## Data Modeling

### Ingest

In [None]:
# Load the text files found under `pages` and flatten them into `batchtext`,
# a list with one entry per non-blank line ("paragraph") of every document.
loader = DirectoryLoader('pages', glob="./*.txt", show_progress=True, recursive=True)
data = loader.load()
batchtext = []
for doc in tqdm(data, desc="Processing documents"):
    # Each document is assumed to contain paragraphs separated by \n.
    # Blank / whitespace-only lines are skipped: they carry no information, and
    # embedding them would only add empty entries to the index.
    batchtext.extend(p for p in doc.page_content.split('\n') if p.strip())
# Kept as a separate name for backward compatibility; always equals len(batchtext).
count = len(batchtext)
print(f"Total paragraphs: {count}")

### Embedding

In [None]:
# Embed the paragraphs and index them in fixed-size chunks so progress can be
# reported and no single request carries the whole corpus.
total_paragraphs = len(batchtext)
chunk_size = 1000
# The context manager closes the progress bar even if a chunk fails.
with tqdm(total=total_paragraphs, desc="Embedding paragraphs") as pbar:
    for i in range(0, total_paragraphs, chunk_size):
        chunk = batchtext[i:i+chunk_size]
        # Use the already-configured store instead of the `from_texts` classmethod:
        # `from_texts` builds a brand-new store/client on every call and would not
        # inherit the settings `db` was created with (e.g. ssl_verify=False).
        # bulk_size must be >= the number of texts in a chunk.
        db.add_texts(chunk, bulk_size=4000)
        pbar.update(len(chunk))

### Inference

In [None]:
# Label shown in the banner of the interactive session.
topic = "Sinapto srl technology consulting knowledge base"

# Prompt (in Italian): the assistant answers only from the supplied {context}
# and says it does not know when the answer is not there.
template_informed = """
Sono un assistente e custode di una knowledge base testuale. Rispondo alle domande basandomi sul contesto fornito. Se non conosco la risposta, dico che non lo so.
Conosco il contesto: {context}
Quando mi viene chiesto: {question}
la mia risposta, basata solo sulle informazioni del contesto, è: """

prompt_informed = PromptTemplate(
    input_variables=["context", "question"],
    template=template_informed,
)

# Chain that fills the prompt and sends it to the chat model (temperature 0).
llm_chain_informed = LLMChain(
    llm=ChatOpenAI(openai_api_key=openai_api_key, temperature=0, model="gpt-4o"),
    prompt=prompt_informed,
)

def ask_a_question(question, k=1000):
    """Answer `question` from paragraphs retrieved out of the vector store.

    Args:
        question: The user's question. It is used both as the similarity-search
            query and as the `question` slot of the prompt.
        k: Maximum number of similar paragraphs to request as context.
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        Whatever `llm_chain_informed.run` returns for the composed prompt.
    """
    similar_docs = db.similarity_search(question, k=k)
    # Join with newlines so adjacent paragraphs are not fused into one run of
    # text (the last word of one glued to the first word of the next).
    informed_context = "\n".join(doc.page_content for doc in similar_docs)
    return llm_chain_informed.run(context=informed_context, question=question)

#### Run

In [None]:
# Interactive question/answer session. The loop ends when the user types
# "exit"/"quit", sends EOF, or interrupts the kernel.
print(f'"{topic}": advanced search')
print('(type "exit" or "quit", or interrupt the kernel, to stop)')
while True:
    try:
        question = input("User Question >> ").strip()
    except (EOFError, KeyboardInterrupt):
        # Leave the session quietly instead of ending the cell with a traceback.
        break
    if question.lower() in {"exit", "quit"}:
        break
    if not question:
        # Blank input: do not spend a retrieval + LLM call on an empty question.
        continue
    response = ask_a_question(question)
    print(f"\tQuestion: {question}")
    print(f"\tAnswer  : {response}")