# Chat with html pages using LLM
This jupyter notebook reads html pages into Chroma DB 
and then uses LangChain with LLM as chat agent


In [1]:
!pip3 install -r requirements.txt

ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


In [2]:
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain_community.llms import Ollama
from langchain_community.llms.openai import OpenAI
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.memory import ConversationBufferMemory
# import streamlit as st
import os
import time

In [28]:
DOC_FOLDER = r".\kiray.com.sg"
MODE = "local"  #local or remote
RAG_CONFIGS = [
    {   "name":"remote",
        "base_url":"http://192.168.1.66:1234/v1",
        "model":"mistral",
        "api_key":"123"
    },
    {
        "name":"local",
        "base_url":"http://127.0.0.1:11434",
        "model":"mistral",
    }
]
CONFIG = [c for c in RAG_CONFIGS if c["name"] == MODE][0]


## Add document along with unique IDs so we don't include the same documents again

In [14]:
embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory=os.path.join(DOC_FOLDER, "db"),
                     embedding_function=embedding)             

all_docs = vectorstore.get()
print(all_docs)


{'ids': ['contact (1)', 'contact (10)', 'contact (11)', 'contact (12)', 'contact (13)', 'contact (14)', 'contact (15)', 'contact (16)', 'contact (17)', 'contact (2)', 'contact (3)', 'contact (4)', 'contact (5)', 'contact (6)', 'contact (7)', 'contact (8)', 'contact (9)', 'eyebrow-eyelash-forte (1)', 'eyebrow-eyelash-forte (10)', 'eyebrow-eyelash-forte (11)', 'eyebrow-eyelash-forte (12)', 'eyebrow-eyelash-forte (13)', 'eyebrow-eyelash-forte (14)', 'eyebrow-eyelash-forte (15)', 'eyebrow-eyelash-forte (16)', 'eyebrow-eyelash-forte (17)', 'eyebrow-eyelash-forte (18)', 'eyebrow-eyelash-forte (2)', 'eyebrow-eyelash-forte (3)', 'eyebrow-eyelash-forte (4)', 'eyebrow-eyelash-forte (5)', 'eyebrow-eyelash-forte (6)', 'eyebrow-eyelash-forte (7)', 'eyebrow-eyelash-forte (8)', 'eyebrow-eyelash-forte (9)', 'facial-forte (1)', 'facial-forte (10)', 'facial-forte (11)', 'facial-forte (12)', 'facial-forte (13)', 'facial-forte (14)', 'facial-forte (15)', 'facial-forte (16)', 'facial-forte (17)', 'facial-f

In [15]:

docs = []
ids = []
# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len
)

def find_ids(f, list_ids):
    for n in list_ids:
        if f in n:
            print(f"found {f} in {n}")
            return True
        else:
            print(f"not found {f} in {n}")
            return False

for file in os.listdir(DOC_FOLDER):
    if file.endswith(".html"):
        filename =os.path.splitext(file)[0]
        found = [f for f in all_docs['ids'] if filename in f]
        if found == []:
            print(f"cannot find {filename}")
            loader = TextLoader(os.path.join(DOC_FOLDER, file))
            doc = loader.load()
            splitted = text_splitter.split_documents(doc)
            print(splitted)
            i = 0
            for split in splitted:
                i += 1
                docs.append(split)
                ids.append(f"{filename} ({str(i)})")
                print(f"added {file}")
        else:
            print(f"found {filename}")

if not ids == []:
    db = vectorstore.from_documents(documents=docs, embedding=embedding, ids=ids, persist_directory=os.path.join(DOC_FOLDER, "db"))
    db.persist()
    all_docs = db.get()
    print(all_docs['ids'])
else:
    db = vectorstore
    print("nothing added")

retriever = db.as_retriever()



found contact
found eyebrow-eyelash-forte
found facial-forte
found fine-hair-forte
found index
found latest
found promotion
found promotions
found testimonials
found treatments
nothing added


In [16]:
db.similarity_search_with_score("treatment", 6)

[(Document(page_content='<li>Restores skin\'s youthfulness</li>\n                                <li>Leaving your skin firmer, fuller and re-sculpt, taking years off your actual age</li>\n                            </ul>\n                        </li>\n                        <div class="clear"></div>\n                    </ul>\n                    \n                    <div class="clear"></div>\n                    \n                    <ul class="dark">\n                        <li class="content">\n                            <h2>Pure O<sub>2</sub></h2>\n                            <p>Pure O<sub>2</sub> Infusion therapy involves infusing of pure oxygen to the skin as well as inhalation of pure oxygen.</p>\n                            <p><strong>Benefits:</strong></p>\n                            <ul>\n                                <li>Energise tired, dull and stressed skin</li>\n                                <li>Increase cells regeneration</li>', metadata={'source': '.\\kiray.c

In [30]:


if MODE == "local":
    llm = Ollama(
        base_url=CONFIG["base_url"],
        model=CONFIG["model"],
        temperature=0,
        verbose=True,
        callback_manager=CallbackManager(
            [StreamingStdOutCallbackHandler()]),
        )
else:
    llm = OpenAI(
        base_url=CONFIG["base_url"],
        model=CONFIG["model"],
        openai_api_key=CONFIG["api_key"],
        temperature=0.7,
        verbose=True,
        callback_manager=CallbackManager(
            [StreamingStdOutCallbackHandler()]),
        )


In [31]:
template = """Answer the following question based only on the provided context. Do not use other information other than the context. If the question is not contained in the context, just say "Not found":

<context>
{context}
</context>

Question: {question}"""


prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=template,
    )


In [32]:
qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type='stuff',
            retriever=retriever,
            verbose=True,
            chain_type_kwargs={
                "verbose": True,
                "prompt": prompt,
            })

In [35]:
user_input = "What skin treatments do you offer?"
response = qa_chain(user_input)
print(response)



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mAnswer the following question based only on the provided context. Do not use other information other than the context. If the question is not contained in the context, just say "Not found":

<context>
<li class="image">&#8220;</li>
                        <li><h1>The freckles have reduced tremendously and the skin complexion seems to have improved, too.</h1></li>
                        <div class="clear"></div>
                    </ul>
                    <div class="clear"></div>
                    <p>I used to have oily and sensitive complexion but not anymore! My skin has shown great improvement after Kiray's introduction of <a href="equipment.html#maya" target="_blank">Maya Treatment</a> to me. Based on my personal experience, my skin appears to be firmer, livelier and the open pores are gone. Results

In [10]:
import os
for root, dirs, files in os.walk(r".\obsidian"):
    for file in files:
        if file.endswith(".md"):
            print(os.path.join(root, file))

.\obsidian\Amazon EKS.md
.\obsidian\Amazon Feature Store.md
.\obsidian\Diary\2022-12-01.md
.\obsidian\Diary\2022-12-02.md
.\obsidian\Diary\2022-12-03.md
.\obsidian\Diary\2022-12-04.md
.\obsidian\Diary\2022-12-05.md
.\obsidian\Diary\2022-12-06.md
.\obsidian\Diary\2022-12-07.md
.\obsidian\Diary\2022-12-08.md
.\obsidian\Diary\2022-12-09.md
.\obsidian\Diary\2022-12-10.md
.\obsidian\Diary\2022-12-14.md
.\obsidian\Diary\2022-12-16.md
.\obsidian\Diary\2022-12-17.md
.\obsidian\Diary\2022-12-20.md
.\obsidian\Diary\2022-12-21.md
.\obsidian\Diary\2022-12-23.md
.\obsidian\Diary\2022-12-25.md
.\obsidian\Diary\2022-12-26.md
.\obsidian\Diary\2022-12-27.md
.\obsidian\Diary\2023-01-03.md
.\obsidian\Diary\2023-01-06.md
.\obsidian\Diary\2023-01-13.md
.\obsidian\Diary\2023-02-11.md
.\obsidian\Diary\2023-02-12.md
.\obsidian\Diary\2023-02-25.md
.\obsidian\Diary\2023-02-26.md
.\obsidian\Diary\2023-02-28.md
.\obsidian\Diary\2023-03-02.md
.\obsidian\Diary\2023-03-05.md
.\obsidian\Diary\2023-03-09.md
.\obsidian