## Web scraping

In [14]:
import requests
from bs4 import BeautifulSoup
import html2text


def get_data_from_website(url):
    """Download a web page and return its text as Markdown plus basic metadata.

    Args:
        url: Address of the page to fetch, as a string.

    Returns:
        A ``(text, metadata)`` tuple. ``text`` is the page content converted
        by ``html2text``; ``metadata`` is a dict with the keys ``title``,
        ``url``, ``description`` and ``keywords``. Returns ``None`` (after
        printing "Server error") when the server answers with HTTP 500.
    """
    # Local import so the block does not depend on another cell's imports.
    from urllib.parse import urlparse

    # Get response from the server
    response = requests.get(url)
    if response.status_code == 500:
        print("Server error")
        return None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Removing js and css code
    for tag in soup(["script", "style"]):
        tag.extract()

    # Extract text in markdown format
    converter = html2text.HTML2Text()
    converter.images_to_alt = True
    converter.body_width = 0
    converter.single_line_break = True
    text = converter.handle(str(soup))

    # Page title: fall back to a slug built from the URL when the page has no
    # usable <title>. ``url`` is a plain string, so it has to be parsed first.
    title_tag = soup.title
    title_text = title_tag.string if title_tag is not None else None
    if title_text is not None:
        page_title = title_text.strip()
    else:
        parsed_url = urlparse(url)
        page_title = parsed_url.path.strip("/").replace("/", "-") or parsed_url.netloc

    # Each meta value is read from its own tag; a missing tag or a missing
    # ``content`` attribute falls back to a string default so the metadata
    # never contains None.
    description_tag = soup.find("meta", attrs={"name": "description"})
    keywords_tag = soup.find("meta", attrs={"name": "keywords"})
    description = (description_tag.get("content") if description_tag is not None else None) or page_title
    keywords = (keywords_tag.get("content") if keywords_tag is not None else None) or ""

    metadata = {'title': page_title,
                'url': url,
                'description': description,
                'keywords': keywords}

    return text, metadata


## Data cleaning

In [15]:
import re
from langchain.text_splitter import MarkdownTextSplitter
from langchain.docstore.document import Document


# Data Cleaning functions

def merge_hyphenated_words(text):
    """Rejoin words that were split by a hyphen immediately followed by a line break."""
    hyphen_break = re.compile(r"(\w)-\n(\w)")
    return hyphen_break.sub(r"\1\2", text)


def fix_newlines(text):
    """Replace every isolated line break with a space; runs of two or more are kept."""
    lone_newline = re.compile(r"(?<!\n)\n(?!\n)")
    return lone_newline.sub(" ", text)


def remove_multiple_newlines(text):
    """Collapse each run of two or more line breaks into a single line break."""
    newline_run = re.compile(r"\n{2,}")
    return newline_run.sub("\n", text)


def clean_text(text):
    """Run the cleaning steps in order: de-hyphenate, unwrap lines, collapse blank lines."""
    dehyphenated = merge_hyphenated_words(text)
    unwrapped = fix_newlines(dehyphenated)
    return remove_multiple_newlines(unwrapped)


## Text to docs, then chunks

In [16]:
def text_to_docs(text, metadata):
    """Split text into Markdown-aware chunks and wrap each chunk in a ``Document``.

    Args:
        text: The text to split.
        metadata: Dict of page metadata attached to every chunk. ``None`` is
            treated as an empty dict.

    Returns:
        A list of ``Document`` objects, one per chunk, in split order.
    """
    text_splitter = MarkdownTextSplitter(chunk_size=2048, chunk_overlap=128)
    # Each document gets its own copy of the metadata so that changing one
    # chunk's metadata later cannot silently change all the others.
    return [
        Document(page_content=chunk, metadata=dict(metadata or {}))
        for chunk in text_splitter.split_text(text)
    ]


def get_doc_chunks(text, metadata):
    """Clean the raw text, then split it into document chunks carrying ``metadata``."""
    return text_to_docs(clean_text(text), metadata)


## Chroma DB client and vectorizer

In [17]:
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain.chains import ConversationalRetrievalChain
import os

class ChromaClient:
    """Singleton holder for the Chroma vector store used by this notebook.

    The first ``ChromaClient()`` call builds the store and caches the instance
    on the class; later calls return that same instance. The store itself is
    available as the ``client`` attribute.
    """

    # The one shared instance; stays None until initialisation has succeeded.
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            instance = super().__new__(cls)
            instance._initialize()
            # Cache only after _initialize() succeeded. Caching first would
            # leave a half-built instance (without ``client``) behind if the
            # embedding model or the store could not be created.
            cls._instance = instance
        return cls._instance

    def _initialize(self):
        """Create the embedding function and the persistent Chroma collection."""
        embedding_function = OllamaEmbeddings(model="llama3.1:latest")
        self.client = Chroma(
            collection_name="website_data",
            embedding_function=embedding_function,
            persist_directory="data/chroma"
        )

def get_chroma_client():
    """Return the vector store held by the shared ``ChromaClient`` instance."""
    holder = ChromaClient()
    return holder.client

def store_docs(url):
    """Scrape ``url``, chunk its text, and add the chunks to the vector store.

    Failures are reported with a printed message instead of being raised, so
    one bad page does not stop the caller.

    Args:
        url: Address of the page to scrape and store.

    Returns:
        None.
    """
    try:
        scraped = get_data_from_website(url)
        if scraped is None:
            # The scraper returns None when the server reported an error.
            print(f"Error storing documents: no content retrieved from {url}")
            return
        text, metadata = scraped

        docs = get_doc_chunks(text, metadata)
        if not docs:
            print(f"Error storing documents: no text chunks extracted from {url}")
            return

        # Create the persist directory if needed, then make sure it is
        # writable. Checking access on a directory that does not exist yet
        # would always fail on a first run.
        persist_directory = "data/chroma"
        os.makedirs(persist_directory, exist_ok=True)
        if not os.access(persist_directory, os.W_OK):
            raise PermissionError(f"Cannot write to directory: {persist_directory}")

        vector_store = get_chroma_client()
        vector_store.add_documents(docs)
        # NOTE(review): newer Chroma releases may persist automatically;
        # confirm whether this explicit call is still required.
        vector_store.persist()
    except Exception as e:
        print(f"Error storing documents: {e}")


## Chat Prompt

In [18]:
from langchain.prompts import (
    SystemMessagePromptTemplate,
    PromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate
)

# System prompt template. The {context} and {chat_history} placeholders are
# declared as input variables of the PromptTemplate built in get_prompt().
system_prompt_template = """
You are a knowledgeable support agent.

Your role is to assist users by answering their queries based on the information provided from a specific website. Ensure your responses are accurate, informative, and derived directly from the given context. Avoid making up information or providing incorrect details.

Use the context below to answer the user's question. If the context doesn't fully address the query, acknowledge the limitation and guide the user accordingly.

----------------
{context}
{chat_history}
Follow-up question:
"""

# Human prompt template: the user's {question} followed by an "Answer:" cue.
human_prompt_template = "{question}\nAnswer:"


def get_prompt():
    """Build the chat prompt: a system message followed by a human message.

    Returns:
        A ``ChatPromptTemplate`` taking ``context``, ``question`` and
        ``chat_history`` as input variables.
    """
    # Plain templates first, then the message wrappers around them.
    system_template = PromptTemplate(
        template=system_prompt_template,
        input_variables=['context', 'chat_history'],
        template_format='f-string',
        validate_template=False,
    )
    human_template = PromptTemplate(
        template=human_prompt_template,
        input_variables=['question'],
        template_format='f-string',
        validate_template=False,
    )

    message_templates = [
        SystemMessagePromptTemplate(prompt=system_template),
        HumanMessagePromptTemplate(prompt=human_template),
    ]

    return ChatPromptTemplate(
        messages=message_templates,
        input_variables=['context', 'question', 'chat_history'],
    )


## Chain and Response function

In [19]:
def make_chain():
    """Wire the chat model, the vector-store retriever and the prompt into one chain.

    Returns:
        The object produced by ``ConversationalRetrievalChain.from_llm``.
    """
    llm = ChatOllama(model="llama3.1:latest")
    store = get_chroma_client()
    chat_prompt = get_prompt()

    return ConversationalRetrievalChain.from_llm(
        llm,
        retriever=store.as_retriever(search_type="mmr", verbose=True),
        combine_docs_chain_kwargs={"prompt": chat_prompt},
        return_source_documents=False,
        rephrase_question=False,
        verbose=False,
    )

def get_response(question, chat_history=None):
    """Answer a question using the retrieval chain built by ``make_chain``.

    Args:
        question: The user's question.
        chat_history: Optional earlier conversation turns, passed to the chain
            under the ``chat_history`` key. Defaults to an empty history.

    Returns:
        The value stored under ``answer`` in the chain's result.
    """
    chain = make_chain()
    # ``invoke`` replaces calling the chain object directly, which LangChain
    # has deprecated.
    response = chain.invoke({
        "question": question,
        "chat_history": chat_history if chat_history is not None else [],
    })
    return response['answer']

## Loading Website into DB

In [20]:
# Scrape this page, split it into chunks, and add them to the vector store.
# store_docs() prints a message instead of raising if anything fails.
store_docs("https://pypi.org/project/chromadb/")

In [21]:
# Inspect what is currently stored in the collection.
vector_store = get_chroma_client()
lis = vector_store.get(include=['embeddings','metadatas','documents'])
print("Number of documents:",len(lis['documents']))
if lis['documents']:
    # Show content, metadata and embedding of the SAME (first) entry; reading
    # the embedding from index 1 would mismatch the other two and fail on a
    # collection that holds a single document.
    print("Content:\n",lis['documents'][0])
    print("\n\nMetadata:", lis['metadatas'][0])
    print("\n\nEmbeddings:", "Length:", len(lis['embeddings'][0]),"\nEmbedding Vector:",lis['embeddings'][0])

Number of documents: 58
Content:
 Skip to main content [ .us ](/ref=nav_logo) [ Delivering to Lebanon 66952  Update location  ]() Books __ Select the department you want to search in All Departments Alexa Skills Amazon Devices Amazon One Medical Amazon Pharmacy Amazon Resale Appliances Apps & Games Arts, Crafts & Sewing Audible Books & Originals Automotive Parts & Accessories Baby Beauty & Personal Care Books CDs & Vinyl Cell Phones & Accessories Clothing, Shoes & Jewelry Women Men Girls Boys Baby Collectibles & Fine Art Computers Credit and Payment Cards Digital Music Electronics Garden & Outdoor Gift Cards Grocery & Gourmet Food Handmade Health, Household & Baby Care Home & Business Services Home & Kitchen Industrial & Scientific Just for Prime Kindle Store Luggage & Travel Gear Luxury Stores Magazine Subscriptions Movies & TV Musical Instruments Office Products Pet Supplies Premium Beauty Prime Video Smart Home Software Sports & Outdoors Subscribe & Save Subscription Boxes Tools & H

In [22]:
# Ask a question; get_response() builds the retrieval chain and returns its answer.
response = get_response("What is this website all about")
print("Answer:", response)

Answer: Unfortunately, based on the provided context, I'm unable to determine what this website is specifically about. The snippet appears to be a fragment of Amazon's book page, listing various books with their titles, authors, ratings, and prices. However, without more information or access to the full website content, it's challenging to provide a comprehensive answer.

If you could provide more context or access to the full website content, I'd be happy to try and assist you further.


In [23]:
from duckduckgo_search import DDGS
import json

def perform_web_search(query, num_results=5):
    """Search DuckDuckGo (lite backend) and return the hits as simple dicts.

    Args:
        query: Search phrase.
        num_results: Maximum number of results to request.

    Returns:
        A list of dicts with the keys ``title``, ``link`` and ``snippet``.
    """
    with DDGS() as ddgs:
        return [
            {
                "title": hit["title"],
                "link": hit["href"],
                "snippet": hit["body"],
            }
            for hit in ddgs.text(query, backend="lite", max_results=num_results)
        ]

def get_search_results(query, num_results=5):
    """Run a web search and return the hits as a JSON string indented by 4 spaces."""
    hits = perform_web_search(query, num_results)
    serialized = json.dumps(hits, indent=4)
    return serialized

def process_and_store_results(query, num_results=5):
    """Search the web for ``query`` and pass every result link to ``store_docs``.

    Returns:
        A summary message stating how many links were processed.
    """
    hits = perform_web_search(query, num_results)

    for hit in hits:
        store_docs(hit['link'])

    processed = len(hits)
    return f"Processed {processed} links and stored their content in the vector database."

# Example usage: show the raw search hits for a query as JSON.
query = "top 10 selling books in amazon"
search_results_json = get_search_results(query)
print("Search Results:\n", search_results_json)

# Process and store the content of the links returned by the search
process_message = process_and_store_results(query)
print(process_message)


Search Results:
 [
    {
        "title": "Amazon Best Sellers: Best Books",
        "link": "https://www.amazon.com/Best-Sellers-Books/zgbs/books",
        "snippet": "64 offers from $2.45. #6. Hello, Baby Animals: A Black-and-White Board Book for Babies That Helps Visual Development (High-Contrast Books) duopress labs. 6,781. Board book. 88 offers from $3.19. #7. I Love You Like No Otter: A Funny and Sweet Animal Board Book for Babies and Toddlers this Christmas (Punderland)"
    },
    {
        "title": "Most Read Fiction | Amazon Charts",
        "link": "https://www.amazon.com/charts",
        "snippet": "Week of October 6, 2024. The Top 20 Most Sold & Most Read Books of the Week. charts rank books according to the number of copies sold and pre-ordered through Amazon.com, Audible.com, Amazon Books stores, and books read through digital subscription programs (once a customer has read a certain percentage - roughly the length of a free reading ..."
    },
    {
        "title": "Am

In [24]:
# Query the chain again after the search results were passed to store_docs().
response = get_response("hey , can you tell me the top 10 selling books in amazon")
print("Answer:", response)

Answer: However, Amazon's bestseller list changes frequently, and it's not publicly available to access the exact numbers. But I can give you a snapshot of the current top-selling books on Amazon based on various categories. Keep in mind that these rankings are subject to change.

Here are the top 10 selling books across multiple categories on Amazon:

**Note:** The sales figures are approximate and sourced from Amazon's bestseller list, which is updated hourly.

1. **"The Nightingale" by Kristin Hannah**: Historical Fiction
	* Average customer review: 4.7/5 stars (over 22,000 reviews)
2. **"The Return" by Nicholas Sparks**: Romance
	* Average customer review: 4.6/5 stars (over 12,000 reviews)
3. **"It Ends with Us" by Colleen Hoover**: Romance
	* Average customer review: 4.7/5 stars (over 10,000 reviews)
4. **"Where the Crawdads Sing" by Delia Owens**: Fiction
	* Average customer review: 4.6/5 stars (over 9,000 reviews)
5. **"Bridgerton: The Duke and I" by Julia Quinn**: Historical Ro