<a href="https://colab.research.google.com/github/arijitmazumdar/colab-repo/blob/main/website_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install dependencies

In [None]:
!pip install aiofiles
!pip install aiohappyeyeballs
!pip install aiohttp
!pip install aiosignal
!pip install altair
!pip install annotated-types
!pip install annoy
!pip install anthropic
!pip install anyio
!pip install appdirs
!pip install attrs
!pip install audioread
!pip install beautifulsoup4
!pip install blinker
!pip install cachetools
!pip install certifi
!pip install cffi
!pip install charset-normalizer
!pip install click
!pip install contourpy
!pip install cryptography
!pip install cycler
!pip install dataclasses-json
!pip install datasets
!pip install decorator
!pip install defusedxml
!pip install dill
!pip install distro
!pip install faiss-cpu
!pip install fastapi
!pip install ffmpy
!pip install filelock
!pip install fonttools
!pip install frozenlist
!pip install fsspec
!pip install gensim
!pip install gitdb
!pip install GitPython
!pip install gradio
!pip install gradio_client
!pip install h11
!pip install httpcore
!pip install httpx
!pip install huggingface-hub
!pip install idna
!pip install importlib_resources
!pip install Jinja2
!pip install jiter
!pip install joblib
!pip install jsonpatch
!pip install jsonpointer
!pip install jsonschema
!pip install jsonschema-specifications
!pip install kiwisolver
!pip install langchain
!pip install langchain-anthropic
!pip install langchain-community
!pip install langchain-core
!pip install langchain-openai
!pip install langchain-google-genai
!pip install langchain-text-splitters
!pip install langsmith
!pip install lazy_loader
!pip install librosa
!pip install llvmlite
!pip install lxml
!pip install markdown-it-py
!pip install MarkupSafe
!pip install marshmallow
!pip install matplotlib
!pip install mdurl
!pip install mpmath
!pip install msgpack
!pip install multidict
!pip install multiprocess
!pip install mypy-extensions
!pip install narwhals
!pip install nest-asyncio
!pip install networkx
!pip install node2vec
!pip install numba
!pip install numpy
!pip install openai
!pip install orjson
!pip install packaging
!pip install pandas
!pip install pdfminer.six
!pip install pdfplumber
!pip install pillow
!pip install pinecone
!pip install pinecone-plugin-inference
!pip install pinecone-plugin-interface
!pip install platformdirs
!pip install pooch
!pip install protobuf
!pip install pyarrow
!pip install pycparser
!pip install pydantic
!pip install pydantic-settings
!pip install pydantic_core
!pip install pydeck
!pip install pydub
!pip install Pygments
!pip install pyparsing
!pip install PyPDF2
!pip install pypdfium2
!pip install pysbd
!pip install python-dateutil
!pip install python-dotenv
!pip install python-multipart
!pip install pytz
!pip install PyYAML
!pip install ragas
!pip install referencing
!pip install regex
!pip install requests
!pip install rich
!pip install rpds-py
!pip install ruff
!pip install safetensors
!pip install scikit-learn
!pip install scipy
!pip install semantic-version
!pip install sentence-transformers
!pip install shellingham
!pip install six
!pip install smart-open
!pip install smmap
!pip install sniffio
!pip install soundfile
!pip install soupsieve
!pip install soxr
!pip install SQLAlchemy
!pip install starlette
!pip install streamlit
!pip install sympy
!pip install tabulate
!pip install tenacity
!pip install threadpoolctl
!pip install tiktoken
!pip install tokenizers
!pip install toml
!pip install tomlkit
!pip install torch
!pip install torchvision
!pip install tornado
!pip install tqdm
!pip install transformers
!pip install typer
!pip install typing-inspect
!pip install typing_extensions
!pip install tzdata
!pip install urllib3
!pip install uvicorn
!pip install websockets
!pip install wrapt
!pip install xxhash
!pip install yarl
!pip install fitz
!pip install pytesseract
!pip install pymupdf


Import modules

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
import numpy as np
import time
import random
import tempfile
from langchain_community.document_loaders import BSHTMLLoader
from langchain.memory import ConversationBufferMemory
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI


Load the OpenAI API key from Colab secrets

In [None]:
# Copy the value stored under the Colab secret name 'open_api_key' into the
# OPENAI_API_KEY environment variable.
# NOTE(review): the active LLM in this notebook is Gemini and the embeddings are
# HuggingFace, so this key only appears to be needed by the commented-out
# ChatOpenAI setup -- confirm whether this cell is still required.
os.environ["OPENAI_API_KEY"] = userdata.get('open_api_key')

Configuration variables

In [None]:
# Passed as chunk_size / chunk_overlap to CharacterTextSplitter in process_website.
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50
# MAX_TOKENS and MODEL_NAME are only referenced by the commented-out OpenAI and
# Ollama setups in the "Configure LLM" cell.
MAX_TOKENS = 15000
MODEL_NAME = "gpt-4o-mini"
# Sampling temperature handed to the LLM; assigned again (same value) in the
# "Configure LLM" cell.
TEMPERATURE = 0.4

Functions for cleanly fetching HTML content from a website

In [None]:
def scrape_website(url, timeout=30):
    """Download a page and return the text of its common content elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        A list of non-empty, stripped text snippets, one per matching element
        (paragraphs, headings, list items, spans and divs). If none of those
        elements carries text, the list holds the whole ``<body>`` text as a
        single entry. An empty list is returned when the request fails or the
        page has no text at all; a message is printed in both cases.

    Note:
        An element's text includes the text of its descendants, so nested
        elements (for example a ``<div>`` wrapping a ``<p>``) produce
        overlapping snippets.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error scraping the website: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove script and style elements so their source never ends up in the text
    for script in soup(["script", "style"]):
        script.decompose()

    # Get text from various elements, stripping each element only once
    stripped_texts = (
        elem.text.strip()
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div'])
    )
    content = [text for text in stripped_texts if text]

    # If no content found, try to get all text from body
    if not content:
        body = soup.find('body')
        if body:
            content = [body.get_text(separator='\n', strip=True)]

    if not content:
        print("Warning: No content found. The website might have unusual structure or require JavaScript.")
        return []

    return content

def clean_content(content_list):
    """Filter out snippets that are too short or look like site boilerplate.

    A snippet is kept only when it is longer than 20 characters and its
    lower-cased form contains none of the phrases 'sign up', 'sign in',
    'cookie' or 'privacy policy'. Order of the kept snippets is preserved.
    """
    unwanted_phrases = ('sign up', 'sign in', 'cookie', 'privacy policy')

    kept = []
    for snippet in content_list:
        if len(snippet) <= 20:
            continue
        lowered = snippet.lower()
        if any(phrase in lowered for phrase in unwanted_phrases):
            continue
        kept.append(snippet)
    return kept

def fetch_html(url, timeout=30):
    """Download a page and return its HTML as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with an error status (a message is printed in that case).
    """
    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching the website: {e}")
        return None

def process_website(url):
    """Fetch a page, load it as LangChain documents and split it into chunks.

    The HTML is written to a temporary file because BSHTMLLoader reads from a
    file path. The file is always removed again, even when loading fails.

    Args:
        url: Address of the page to process.

    Returns:
        The list of document chunks produced by CharacterTextSplitter using
        CHUNK_SIZE and CHUNK_OVERLAP.

    Raises:
        ValueError: If fetch_html returned no content for ``url``.
    """
    # Imported locally so the block does not depend on an extra top-level import.
    from bs4 import FeatureNotFound

    html_content = fetch_html(url)
    if not html_content:
        raise ValueError("No content could be fetched from the website.")

    # Use a temporary file to store the HTML content. The encoding is pinned to
    # UTF-8 on both the write and the read side so the round trip does not
    # depend on the platform's default locale encoding.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.html', encoding='utf-8') as temp_file:
        temp_file.write(html_content)
        temp_file_path = temp_file.name

    try:
        try:
            # Try to use BSHTMLLoader with default settings (which uses 'lxml')
            loader = BSHTMLLoader(temp_file_path, open_encoding='utf-8')
            documents = loader.load()
        except (ImportError, FeatureNotFound):
            # BeautifulSoup reports a missing parser with FeatureNotFound rather
            # than ImportError, so both are treated as "lxml unavailable".
            print("'lxml' is not installed. Falling back to built-in 'html.parser'.")
            # If 'lxml' is not available, use the built-in 'html.parser'
            loader = BSHTMLLoader(temp_file_path, open_encoding='utf-8', bs_kwargs={'features': 'html.parser'})
            documents = loader.load()
    finally:
        # Clean up the temporary file whether or not loading succeeded
        os.unlink(temp_file_path)

    print(f"\nNumber of documents loaded: {len(documents)}")
    if documents:
        print("Sample of loaded content:")
        print(documents[0].page_content[:200] + "...")
        print(f"Metadata: {documents[0].metadata}")

    text_splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    texts = text_splitter.split_documents(documents)
    print(f"Number of text chunks after splitting: {len(texts)}")
    return texts

Utility method to print embeddings

In [None]:
def print_sample_embeddings(texts, embeddings):
    """Print a preview of the first chunk and of its embedding vector.

    Args:
        texts: Sequence of objects exposing a ``page_content`` string.
        embeddings: Object exposing ``embed_query(text)``.

    Only the first element of ``texts`` is embedded. The text preview is cut
    to 200 characters (followed by "...") when longer, the first ten vector
    components are shown, and the vector's shape is reported. When ``texts``
    is empty a short notice is printed instead.
    """
    if not texts:
        print("No texts available for embedding sample.")
        return

    first_chunk = texts[0].page_content
    vector = embeddings.embed_query(first_chunk)

    print("\nSample Text:")
    if len(first_chunk) > 200:
        preview = first_chunk[:200] + "..."
    else:
        preview = first_chunk
    print(preview)

    print("\nSample Embedding (first 10 dimensions):")
    leading_dims = np.array(vector[:10])
    print(leading_dims)

    full_vector = np.array(vector)
    print(f"\nEmbedding shape: {full_vector.shape}")

Configure LLM

In [None]:
# This cell builds the module-level objects used by the rest of the notebook:
# `llm` (the chat model), `PROMPT` (the QA prompt) and `memory` (chat history).

# Alternative backend, currently disabled:
# #Set up OpenAI language model
# from langchain_openai import ChatOpenAI

# llm = ChatOpenAI(
#     model_name=MODEL_NAME,
#     temperature=TEMPERATURE,
#     max_tokens=MAX_TOKENS
# )

# Configuration variables
# (re-assigns TEMPERATURE with the same value as the configuration cell)
TEMPERATURE = 0.4

# Set up Google Gemini language model
# The API key is read from the Colab secret named 'GOOGLE_API_KEY'.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=TEMPERATURE, google_api_key=userdata.get('GOOGLE_API_KEY'))


# Set up the retrieval-based QA system with a simplified prompt template
# The template has exactly two placeholders: {context} and {question}.
template = """Context: {context}

Question: {question}

Answer the question concisely based only on the given context. If the context doesn't contain relevant information, say "I don't have enough information to answer that question."

But, if the question is generic, then go ahead and answer the question, example what is a electric vehicle?
"""

# PromptTemplate is already imported in the import cell; repeated here so this
# cell can be re-run on its own.
from langchain.prompts import PromptTemplate
# PROMPT is used by rag_pipeline (to print the prompt) and by the QA chain.
PROMPT = PromptTemplate(
    template=template, input_variables=["context", "question"]
)

# ConversationBufferMemory is likewise already imported in the import cell.
from langchain.memory import ConversationBufferMemory
# Passed to RetrievalQA in the main cell.
# NOTE(review): the template has no {chat_history} placeholder, so the stored
# history does not appear to reach the model -- confirm whether that is intended.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Alternative backend, currently disabled:
# # Set up Ollama language model
# from langchain_community.llms import Ollama
# llm = Ollama(model="tinyllama:latest", temperature=TEMPERATURE, num_predict=MAX_TOKENS)

Setup RAG pipeline

In [None]:

def rag_pipeline(query, qa_chain, vectorstore):
    """Answer ``query`` with ``qa_chain`` after printing retrieval diagnostics.

    Args:
        query: The user's question.
        qa_chain: Chain invoked with ``{"query": query}``; its ``'result'``
            entry is returned.
        vectorstore: Store queried for the three best-scoring chunks, which
            are printed together with their scores.

    Returns:
        The ``'result'`` value of the chain's response.

    NOTE(review): the prompt printed here is rebuilt locally from the three
    chunks found above. ``qa_chain`` is invoked with the query only, so the
    prompt it actually sends may differ from the printed one -- confirm.
    """
    scored_hits = vectorstore.similarity_search_with_score(query, k=3)

    print("\nTop 3 most relevant chunks:")
    context_parts = []
    for rank, hit in enumerate(scored_hits, start=1):
        chunk, chunk_score = hit
        print(f"{rank}. Relevance Score: {chunk_score:.4f}")
        print(f"   Content: {chunk.page_content[:200]}...")
        print()
        context_parts.append(chunk.page_content + "\n\n")
    context = "".join(context_parts)

    # Print the full prompt
    full_prompt = PROMPT.format(context=context, question=query)
    print("\nFull Prompt sent to the model:")
    print(full_prompt)
    separator = "=" * 50
    print("\n" + separator + "\n")

    answer = qa_chain.invoke({"query": query})
    return answer['result']

Setup main method

In [None]:
if __name__ == "__main__":
    # Interactive driver: ask for a website, index it, then answer queries
    # about it until the user types 'new' (another website) or 'quit'.
    print("Welcome to the Enhanced Web Scraping RAG Pipeline.")

    # The embedding model is created on first use and then reused for every
    # website instead of being re-instantiated for each URL.
    embeddings = None

    # Set when the user types 'quit' at the query prompt. A flag is used
    # instead of exit(): exit() is an interactive-session helper, and under a
    # notebook kernel it requests a kernel shutdown rather than simply leaving
    # the loops.
    quit_requested = False

    while not quit_requested:
        # Strip surrounding whitespace so inputs such as 'quit ' are recognised.
        url = input("Please enter the URL of the website you want to query (or 'quit' to exit): ").strip()
        if url.lower() == 'quit':
            print("Exiting the program. Goodbye!")
            break

        try:
            print("Processing website content...")
            texts = process_website(url)

            if not texts:
                print("No content found on the website. Please try a different URL.")
                continue

            print("Creating embeddings and vector store...")
            if embeddings is None:
                from langchain_community.embeddings import HuggingFaceEmbeddings
                embeddings = HuggingFaceEmbeddings()

            print_sample_embeddings(texts, embeddings)

            vectorstore = FAISS.from_documents(texts, embeddings)

            qa = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=vectorstore.as_retriever(),
                memory=memory,
                chain_type_kwargs={"prompt": PROMPT}
            )

            print("\nRAG Pipeline initialized. You can now enter your queries.")
            print("Enter 'new' to query a new website or 'quit' to exit the program.")

            while True:
                user_query = input("\nEnter your query: ").strip()
                command = user_query.lower()
                if command == 'quit':
                    print("Exiting the program. Goodbye!")
                    quit_requested = True
                    break
                if command == 'new':
                    break
                if not user_query:
                    # Nothing to ask; prompt again instead of calling the model.
                    continue

                result = rag_pipeline(user_query, qa, vectorstore)
                print(f"RAG Response: {result}")

        except Exception as e:
            # Top-level boundary of the interactive loop: report the problem
            # and go back to the URL prompt instead of crashing the session.
            print(f"An error occurred: {e}")
            print("Please try a different URL or check your internet connection.")