<a href="https://colab.research.google.com/github/lamld203844/chat-any/blob/main/chat_any.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# System flow

In [1]:
!pip install -r requirements.txt

## Load file
- load website

In [3]:

# -------------------------------------------
# Load data from a website via Llamaindex Loader
#
# -------------------------------------------

from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from IPython.display import Markdown, display
import os

url = 'https://cinnamon.is/en/company/'
loader = SimpleWebPageReader(html_to_text=True)
docs = loader.load_data([url])
docs


[Document(id_='https://cinnamon.is/en/', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='![cinnamon](https://cinnamon.is/en/wp-\ncontent/themes/Cinnamon-2017-en/images/logo.png)\n\n  * [English](https://cinnamon.is/en/ "English")[日本語](https://cinnamon.is/ "日本語")[Tiếng Việt](https://cinnamon.is/vi/ "Tiếng Việt")[繁體中文](https://cinnamon.is/tw/ "繁體中文")\n\n  * HOME\n  * PRODUCT\n  * NEWS\n  * [COMPANY](https://cinnamon.is/en/company/)\n  * [RECRUITING](https://cinnamon.is/en/recruiting/)\n  * CONTACT\n\n  * HOME\n  * PRODUCT\n  * NEWS\n  * [COMPANY](https://cinnamon.is/en/company/)\n  * [RECRUITING](https://cinnamon.is/en/recruiting/)\n  * CONTACT\n\n  * [English](https://cinnamon.is/en/ "English")[日本語](https://cinnamon.is/ "日本語")[Tiếng Việt](https://cinnamon.is/vi/ "Tiếng Việt")[繁體中文](https://cinnamon.is/tw/ "繁體中文")\n\n# Extend human potential with AI\n\n## At Cinnamon we are working to make a world  \nwhere human creativ

## Chunking and creating embeddings model

- Download model for run locally embedding models (optional)

In [None]:
!git lfs install
!git clone https://huggingface.co/BAAI/bge-small-en

- Load model

In [27]:
# --------------------------------------------
# Chunking and create embeddings
# Automatic via llama index VectorStoreIndex
# --------------------------------------------

# # Optional
# os.environ["HF_HOME"] = '/workspaces/chat-any/weights/'

from torch import cuda 
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

def load_embedding_model(
    model_name: str = "BAAI/bge-small-en",
    device: str = "cuda" if cuda.is_available() else "cpu"
) -> HuggingFaceBgeEmbeddings:
    model_kwargs = {"device": device}
    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embedding_model = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    return embedding_model

# setting up the embedding model
lc_embedding_model = load_embedding_model()
embed_model = LangchainEmbedding(lc_embedding_model)

### Sanity check embedding model
embedding = lc_embedding_model.embed_query('Hello, world')
embedding = embedding[:10]
print(f'Embedding: {embedding}')



Embedding: [-0.02927730418741703, -0.009784802794456482, -6.592511635972187e-05, -0.049279142171144485, 0.02880450338125229, 0.009868807159364223, 0.011057917959988117, 0.04655618220567703, -0.012723397463560104, 0.0009546040673740208]


In [28]:
from llama_index.core import Settings

# ====== Create vector store and upload data ======
Settings.embed_model = lc_embedding_model
index = VectorStoreIndex.from_documents(docs, show_progress=True)
# TODO try async index creation for faster emebdding generation & persist it to memory!
# index = VectorStoreIndex(docs, use_async=True)

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 235.70it/s]
Generating embeddings: 100%|██████████| 2/2 [00:00<00:00,  2.74it/s]


## Load llm

In [35]:
# setting up the llm
import os
from dotenv import load_dotenv
load_dotenv()
from llama_index.llms.gemini import Gemini

google_api = os.environ['GEMINI_API']
llm = Gemini(model_name="models/gemini-pro", api_key=google_api)

# Sanity check llm
resp = llm.complete("Hello, world")
print(resp)

Hello, world! I am a large language model, trained by Google.


## Prompt template

In [36]:

# ====== Setup a query engine ======
Settings.llm = llm
query_engine = index.as_query_engine(similarity_top_k=4)

# ---------------------------------------
# Customise prompt template + augmenting
# ---------------------------------------

from llama_index.core import PromptTemplate

qa_prompt_tmpl_str = (
  "You are a formal, friendly and supportive assistant for question answering from given website (Answer questions in complete sentences).\n"
  "Answer the question using the following information delimited by triple brackque.:\n\n"
  "```\n{context_str}\n```"
  "Question: {query_str}\n"
  "\nYou can format ouput in a aesthetic way. Remember: Don't say based on information provided or something like that"
  "\nIn case you don't know the answer or any exception occur, say 'I don't know!'"
)

qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

In [37]:
# ---------- Chatting -----------
from IPython.display import Markdown, display
response = query_engine.query('What is this website about?')
display(Markdown(str(response)))

This website is about Cinnamon, an artificial intelligence technology startup that provides AI products and services to businesses.

# Wrapping

In [38]:
import os

os.environ["HF_HOME"] = "/workspaces/chat-any/weights"
os.environ["TORCH_HOME"] = "/workspaces/chat-any/weights"

import gc
import re # website url validation
import uuid # unique id for each session
import nest_asyncio # allows nested access to the event loop
nest_asyncio.apply()

from torch import cuda
from dotenv import load_dotenv
load_dotenv() # Load Gemini API


from llama_index.core import Settings
from llama_index.core import PromptTemplate
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from llama_index.llms.gemini import Gemini

In [43]:
# setup embedding model

def load_embedding_model(
    model_name: str = "BAAI/bge-small-en",
    device: str = "cuda" if cuda.is_available() else "cpu"
) -> HuggingFaceBgeEmbeddings:
    model_kwargs = {"device": device}
    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embedding_model = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    return embedding_model

lc_embedding_model = load_embedding_model()
embed_model = LangchainEmbedding(lc_embedding_model)

# setup llm
from dotenv import load_dotenv
load_dotenv()
from llama_index.llms.gemini import Gemini

google_api = os.environ['GEMINI_API']
llm = Gemini(model_name="models/gemini-pro", api_key=google_api)

In [44]:
import re 

def validate_website_url(url):

    url_pattern = re.compile(
        r'http[s]?://'  # http:// or https://
        r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|'  # domain...
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'  # ...or percent-encoded characters
        r'(?:\:[0-9]{1,5})?'  # optional port number
        r'(?:/[a-zA-Z0-9$-_@.&+!*\\(\\),=%]*)*'  # path
        r'(?:\?[a-zA-Z0-9$-_@.&+!*\\(\\),=%]*)?'  # query string
        r'(?:#[a-zA-Z0-9$-_@.&+!*\\(\\),=%]*)?'  # fragment
    )
    return bool(url_pattern.match(url))

In [45]:
def setup_query_engine(website_url):
    if validate_website_url(website_url):
        try:
            # -------------------------------------------
            # Load data from a website via Llamaindex Loader
            # -------------------------------------------
            loader = SimpleWebPageReader()
            docs = loader.load_data([website_url])

            # ---- Create vector store and upload data ---
            # Chunking and create embeddings
            # Automatic via llama index VectorStoreIndex
            # --------------------------------------------
            Settings.embed_model = embed_model
            index = VectorStoreIndex.from_documents(docs, show_progress=True)

            # ====== Setup a query engine ======
            Settings.llm = llm
            query_engine = index.as_query_engine(similarity_top_k=4)

            # ====== Customise prompt template ======
            qa_prompt_tmpl_str = (
                "You are a formal, friendly and supportive assistant for question answering from given website (Answer questions in complete sentences).\n"
                "Answer the question using the following information delimited by triple brackque.:\n\n"
                "```\n{context_str}\n```"
                "Question: {query_str}\n"
                "\nYou can format ouput in a aesthetic way. Remember: Don't say based on information provided or something like that"
                "\nIn case you don't know the answer or any exception occur, say 'I don't know!'"
            )
            qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

            query_engine.update_prompts(
                {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
            )
            # ======= Complete setting up !!!! ========
            if docs:
                print("Data loaded successfully!!")
                print("Ready to chat!!")
            else:
                print("No data found, check if the repository is not empty!")
            
            return query_engine
        except Exception as e:
                print(f"An error occurred: {e}")
    else:
        print('Invalid github repo, try again!')
        return None

In [46]:
url = 'https://cinnamon.is/en/company/'
query_engine = setup_query_engine(url)

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 32.04it/s]
Generating embeddings: 100%|██████████| 13/13 [00:04<00:00,  2.61it/s]

Data loaded successfully!!
Ready to chat!!





In [47]:
# ---------- Chatting -----------
from IPython.display import Markdown, display
response = query_engine.query('role of Dr. Hajime Hotta in company')
display(Markdown(str(response)))

Dr. Hajime Hotta is the Co-CEO & Founder of Cinnamon Inc.

# GUI with Streamlit


In [48]:
!pip install -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[K[?25hm#################[0m[100;90m⠂[0m) ⠧ reify:yargs-parser: [32;40mhttp[0m [35mfetch[0m GET 200 https://registry.[0m[K
added 22 packages in 5s

3 packages are looking for funding
  run `npm fund` for details
[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m 
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m New [33mminor[39m version of npm available! [31m10.5.2[39m -> [32m10.8.0[39m
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m Changelog: [36mhttps://github.com/npm/cli/releases/tag/v10.8.0[39m
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m Run [32mnpm install -g npm@10.8.0[39m to update!
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m 
[0m

In [1]:
%%writefile app.py

import os

# # Optional
# os.environ["HF_HOME"] = "/workspaces/chat-any/weights"

import gc
import re # website url validation
import uuid # unique id for each session
import nest_asyncio # allows nested access to the event loop
nest_asyncio.apply()

import streamlit as st
from torch import cuda
# from dotenv import load_dotenv
# load_dotenv() # Load Gemini API


from llama_index.core import Settings
from llama_index.core import PromptTemplate
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from llama_index.llms.gemini import Gemini


# ---------- Init + Helper function ----------------

# os.environ['HF_HOME'] = '\lit-chat_with_code_RAG\weights' # for run embedding model locally

# setting up the embedding model
def load_embedding_model(
    model_name: str = "BAAI/bge-small-en",
    device: str = "cuda" if cuda.is_available() else "cpu"
) -> HuggingFaceBgeEmbeddings:
    model_kwargs = {"device": device}
    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embedding_model = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    return embedding_model

lc_embedding_model = load_embedding_model()
embed_model = LangchainEmbedding(lc_embedding_model)

# setting up session
if "id" not in st.session_state:
    st.session_state.id = uuid.uuid4()
    st.session_state.file_cache = {}

session_id = st.session_state.id
client = None

# setting up the llm
from dotenv import load_dotenv
load_dotenv()
llm = Gemini(model_name="models/gemini-pro", api_key=os.environ['GOOGLE_API_KEY'])

# helper func
def reset_chat():
    st.session_state.messages = []
    st.session_state.context = None
    gc.collect() # free up memory

def validate_website_url(url):

    url_pattern = re.compile(
        r'http[s]?://'  # http:// or https://
        r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|'  # domain...
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'  # ...or percent-encoded characters
        r'(?:\:[0-9]{1,5})?'  # optional port number
        r'(?:/[a-zA-Z0-9$-_@.&+!*\\(\\),=%]*)*'  # path
        r'(?:\?[a-zA-Z0-9$-_@.&+!*\\(\\),=%]*)?'  # query string
        r'(?:#[a-zA-Z0-9$-_@.&+!*\\(\\),=%]*)?'  # fragment
    )
    return bool(url_pattern.match(url))

# ---------- End helper function ----------------

with st.sidebar:
    # Input for URL
    website_url = st.text_input("URL")

    # Button to load and process url
    load_button = st.button("Load")

    message_container = st.empty()  # Placeholder for dynamic messages

    if load_button and website_url:
        if validate_website_url(website_url):
            with st.spinner(f"Loading website..."):
                try:
                    # -------------------------------------------
                    # Load data from a website via Llamaindex Loader
                    # -------------------------------------------
                    loader = SimpleWebPageReader()
                    docs = loader.load_data([website_url])

                    # ---- Create vector store and upload data ---
                    # Chunking and create embeddings
                    # Automatic via llama index VectorStoreIndex
                    # --------------------------------------------
                    Settings.embed_model = embed_model
                    index = VectorStoreIndex.from_documents(docs)

                    # ====== Setup a query engine ======
                    Settings.llm = llm
                    query_engine = index.as_query_engine(similarity_top_k=4) # TODO
                    # query_engine = index.as_query_engine(streaming=True, similarity_top_k=4) # TODO

                    # ====== Customise prompt template ======
                    qa_prompt_tmpl_str = (
                        "You are a formal, friendly and supportive assistant for question answering from given website (Answer questions in complete sentences).\n"
                        "Answer the question using the following information delimited by triple brackque.:\n\n"
                        "```\n{context_str}\n```"
                        "Question: {query_str}\n"
                        "\nYou can format ouput in a aesthetic way. Remember: Don't say based on information provided or something like that"
                        "\nIn case you don't know the answer or any exception occur, say 'I don't know!'"
                    )
                    qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

                    query_engine.update_prompts(
                        {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
                    )
                    # ======= Complete setting up !!!! ========
                    if docs:
                        message_container.success("Data loaded successfully!!")
                    else:
                        message_container.write(
                            "No data found, check if the repository is not empty!"
                        )
                    st.session_state.query_engine = query_engine

                except Exception as e:
                    st.error(f"An error occurred: {e}")
                    st.stop()

                st.success("Ready to Chat!")
        else:
            st.error('Invalid url')
            st.stop()

col1, col2 = st.columns([6, 1])

with col1:
    st.header(f"Chat with any website")

with col2:
    st.button("Clear ↺", on_click=reset_chat)


# Initialize chat history
if "messages" not in st.session_state:
    reset_chat()


# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])


# Accept user input
if prompt := st.chat_input("What's up?"):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})
    # Display user message in chat message container
    with st.chat_message("user"):
        st.markdown(prompt)

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        message_placeholder = st.empty()

        query_engine = st.session_state.query_engine
        full_response = query_engine.query(prompt)
        # # TODO: Simulate stream of response with milliseconds delay
        # full_response = ""
        # streaming_response = query_engine.query(prompt)

        # for chunk in streaming_response.response_gen:
        #     full_response += chunk
        #     message_placeholder.markdown(full_response + "▌")

        message_placeholder.markdown(full_response)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": full_response})

Writing app.py


In [5]:
!streamlit run app.py &>/content/logs.txt & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

/bin/bash: streamlit: command not found
4.194.153.202
