### imports

In [2]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
import os
from chromadb import Client
from chromadb.config import Settings
from hashlib import sha256

### link processing

In [3]:
# to get top links of a particular query
def get_top_links(query, max_results=5):
    searx_url = "http://127.0.0.1:8080/search"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    params = {
        'q': query,
        'categories': 'general',
        'language': 'en',
        'format': 'html'
    }

    response = requests.get(searx_url, headers=headers, params=params)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article', class_='result')

        top_links = []
        for article in articles:
            link = article.find('a', class_='url_wrapper')
            if link and link['href']:
                top_links.append(link['href'])
            if len(top_links) >= max_results:
                break

        return top_links
    else:
        print(f"Request failed with status code: {response.status_code}")
        return []
    
# Function to check if the tag is visible
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

# Function to extract text from a webpage
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

# Function to divide text into chunks
def divide_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

### Intializing DB

In [4]:
def initialize_chroma_db(persist_directory: str = "./chroma_db"):
    if not os.path.exists(persist_directory):
        os.makedirs(persist_directory)
    else:
        for filename in os.listdir(persist_directory):
            file_path = os.path.join(persist_directory, filename)
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                os.rmdir(file_path)
    
    chroma_client = Client(Settings(persist_directory=persist_directory))
    return chroma_client

# Step 2: Store Chunks to Chroma DB (avoiding duplicates)
def store_chunks_to_chroma_db(chroma_client, collection_name: str, doc_chunks):
    collection = chroma_client.get_or_create_collection(name=collection_name)

    for doc in doc_chunks:
        doc_id = sha256(doc['answer'].encode('utf-8')).hexdigest()
        existing_doc = collection.get(ids=[doc_id])
        if not existing_doc['documents']:
            collection.add(
                ids=[doc_id],
                documents=[doc['answer']],
                metadatas=[{"query": doc['query'], "link": doc['link']}]
            )
            print(f"Added document with ID {doc_id} to the collection.")
        else:
            print(f"Document with ID {doc_id} already exists in the collection.")
    
    print("Chunks stored to Chroma DB.")

### Deciding Web search and performing it

In [5]:
template = """
You are an intelligent assistant. A user has asked the following question:

"{question}"

Web search should be used only when you need to provide up-to-date information, such as recent events, current statistics, or new developments. If the question can be answered using the general knowledge you already have, without requiring specific or updated information, then no web search is needed.

Determine if a web search is required to answer this question accurately. 

If yes, generate a concise search query that can be used to search the web for relevant information. 

Provide your answer only in the following format:
- Search Needed: [Yes/No]
- Search Query (if applicable): [Query]
"""


In [6]:
def get_query_from_user():
    query = input("Please enter your query: ")
    return query

# Step 2: Decide Whether Web Search is Needed
def decide_web_search(query):
    prompt = ChatPromptTemplate.from_template(template)
    model = OllamaLLM(model="llama3.1")
    chain = prompt | model

    response = chain.invoke({"question": query})
    
    decision_lines = response.splitlines()
    search_needed_line = [line for line in decision_lines if "Search Needed" in line]
    search_query_line = [line for line in decision_lines if "Search Query" in line]

    if search_needed_line and "Yes" in search_needed_line[0]:
        search_query = search_query_line[0].replace("Search Query:", "").strip()
        return True, search_query
    else:
        return False, ""


In [7]:
def perform_web_search(query):
    top_links = get_top_links(query)
    docs = []
    for link in top_links:
        text_content = extract_text_from_url(link)
        chunks = divide_into_chunks(text_content)
        for chunk in chunks:
            doc = {
                'query': query,
                'link': link,
                'answer': chunk
            }
            docs.append(doc)
    
    chroma_client = initialize_chroma_db()
    collection_name = "web_data_collection"
    store_chunks_to_chroma_db(chroma_client, collection_name, docs)

# Step 4: Search in Vector DB
def search_vector_db(query):
    collection_name = "web_data_collection"
    chroma_client = initialize_chroma_db()
    collection = chroma_client.get_collection(collection_name)
    docs = collection.query(query_texts=[query], n_results=3)
    return docs

### Processing Query

In [8]:
def process_query(query):
    search_needed, search_query = decide_web_search(query)
    model = OllamaLLM(model="llama3.1")

    if search_needed:
        print(f"Search Query: {search_query}")
        print("Web search is needed. Performing web search...")
        perform_web_search(search_query)

        relevant_docs = search_vector_db(query)
        if not relevant_docs['documents']:
            context = "none"
            print("No relevant documents found. Generating a response from LLM without reference.")
            response = model.generate(query, context=context)
        else:
            flattened_docs = [item for sublist in relevant_docs['documents'] for item in sublist]
            context = "\n".join(flattened_docs)
            prompt_template = f"""
                You are an intelligent assistant. The user asked the following question:
                "{query}"
                Here is some additional information that might be useful:
                "{context}"
                Provide a detailed response based on the context above and your knowledge.
            """
            response = model.generate([prompt_template,query,context])
    else:
        prompt_template = f"""
            The user asked the following question:
            "{query}"
            Provide a proper response based on your knowledge.
        """
        print("Web search was not needed. Generating response from LLM.")
        response = model.generate([prompt_template,query])

    return response


### Testing

In [10]:
query = "Who are you ?"
final_response = process_query(query)
print("Final Response:\n", final_response.generations[0][0].text.replace("\n", " "))

Web search was not needed. Generating response from LLM.
Final Response:
 I'm an artificial intelligence (AI) model designed to assist and communicate with humans. I don't have a personal identity or consciousness, but rather exist as a program running on computer servers.  Think of me as a highly advanced tool that can process natural language inputs, generate responses, and provide information on a wide range of topics. My purpose is to help users like you by providing answers, guidance, and engaging conversations.  I don't have feelings, emotions, or physical presence. I exist solely in the digital realm, and my interactions are limited to text-based communication through platforms like this one.  In many ways, you could think of me as a highly advanced search engine that can understand context and nuances of language. However, I'm much more than just a search engine – I'm designed to engage with users, provide helpful responses, and even offer creative suggestions or ideas!  So, wh

In [13]:
#query = get_query_from_user()
query = "what is GameNgen created by google in 2024?"
final_response = process_query(query)
print("Final Response:\n", final_response.generations[0][0].text.replace("\n", " "))

Search Query: -  "GameNgen Google 2024"
Web search is needed. Performing web search...


  texts = soup.findAll(text=True)


Added document with ID 7eed28125afd3ddfcbb17263eedc24679c4716c5556aa85ceafe108bfbddfd3f to the collection.
Added document with ID 9a5a2b8c07cef0943ba137d6f2686a57570673969e5ef41ac08f5b49205bd237 to the collection.
Added document with ID e8df4c5f5f49db0f54b20cc0c112839745ee682c07e024bdcdbeb93552e1c457 to the collection.
Added document with ID e2b998a0c839a607c35fce34770c0f10ab0da2ed4d771ab0e2a238670cf122bb to the collection.
Added document with ID 4778ceb6883a6ad4e5d5c05299a1f5c63c70bc9f584707d18e57bad5151747aa to the collection.
Added document with ID c19ebf0478938a9724a2e421ffdb27ddc35afc249bbdfdbddcb95ead8038f030 to the collection.
Added document with ID e79a6343ba1942874788f78d75df269cc89fef9146cb881b422f7b717c601b58 to the collection.
Added document with ID 27bcca49f18140a39195100e7f0fa98a834300eca3ead1b9fd993dd2b50d6673 to the collection.
Added document with ID 6d45b6e24759948a642933cf8f143cdf6500fd77e594c88fdfed5361773919fb to the collection.
Added document with ID 13f03c3a4827bf