In [1]:
import os
import glob
import time
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import Ollama
from langchain.schema import get_document_prompt
from langgraph import LLGraph
from langgraph.utilities import is_template
from openwebui import openwebui  # Assuming openwebui is installed and accessible


ModuleNotFoundError: No module named 'langchain'

In [2]:
import requests
from bs4 import BeautifulSoup


ModuleNotFoundError: No module named 'requests'

In [None]:
# Name of the embedding model. Not referenced by the cells shown in this
# part of the notebook.
EMBEDDING_MODEL = "nomic-embed-text"
# Name of the chat model; intentionally left blank here.
CHAT_MODEL = ""

In [None]:

def scrape_documentation_site(base_url, timeout=10):
    """
    Collects the text of every page linked from a documentation index page.

    Fetches ``base_url``, resolves each ``<a href>`` against it, and downloads
    every distinct http(s) target once, in the order the links first appear.

    Args:
        base_url: URL of the index page whose links are followed.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The text of all successfully fetched linked pages, joined with
        newlines. The index page's own text is only included when the page
        links to itself.

    Raises:
        requests.RequestException: If the index page cannot be fetched or
            answers with an HTTP error status.
    """
    from urllib.parse import urldefrag, urljoin, urlparse

    # Step 1: Fetch the main documentation page. Fail loudly here: without the
    # index there is nothing to crawl, and an error page must not be scraped.
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Step 2: Resolve the links into a de-duplicated, ordered list of targets.
    seen_urls = set()
    section_urls = []
    for link in soup.find_all('a', href=True):  # Adjust selector as needed
        # urljoin leaves absolute URLs untouched and resolves relative ones;
        # dropping the fragment makes "/page" and "/page#intro" one download.
        section_url, _fragment = urldefrag(urljoin(base_url, link['href'].strip()))

        # mailto:, javascript:, tel:, ... cannot be fetched over HTTP.
        if urlparse(section_url).scheme not in ('http', 'https'):
            continue
        if section_url in seen_urls:
            continue

        seen_urls.add(section_url)
        section_urls.append(section_url)

    # Step 3: Visit each page and collect its text.
    all_text_content = []
    for section_url in section_urls:
        try:
            section_response = requests.get(section_url, timeout=timeout)
            section_response.raise_for_status()
        except requests.RequestException as exc:
            # One dead link should not discard everything gathered so far.
            print(f"Skipping {section_url}: {exc}")
            continue

        section_soup = BeautifulSoup(section_response.text, 'html.parser')
        all_text_content.append(section_soup.get_text())

    return "\n".join(all_text_content)

# Example usage. Note that the scrape runs as soon as this cell executes.
base_documentation_url = "https://example.com/documentation"
documentation_content = scrape_documentation_site(base_url=base_documentation_url)

In [None]:

# ----------------------- Configuration -----------------------

# Model name handed to both Ollama(...) and OllamaEmbeddings(...) below.
# NOTE(review): this value looks like a Hugging Face repo id rather than an
# Ollama tag -- confirm it matches a model shown by `ollama list`.
OLLAMA_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"

# Directory passed to Chroma as persist_directory.
VECTORDB_DIR = "web_data"

# Page loaded by scrape_website(); placeholder, replace with the real target.
WEBSITE_URL = "https://www.example.com"


# ----------------------- Functions -----------------------

def scrape_website(url):
    """
    Load a web page through WebBaseLoader.

    Best-effort: any exception raised while loading is reported on stdout
    and an empty list is returned instead of propagating the error.

    Args:
        url: Address handed to WebBaseLoader.

    Returns:
        Whatever ``WebBaseLoader(url).load()`` produced, or ``[]`` on failure.
    """
    try:
        loaded_pages = WebBaseLoader(url).load()
        print(f"Successfully scraped {url}")
    except Exception as err:
        print(f"Error scraping {url}: {err}")
        return []
    return loaded_pages


def chunk_documents(documents, chunk_size=1000, chunk_overlap=100):
    """
    Split documents with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to hand to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The splitter's ``split_documents`` result.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_documents(documents)
    return chunks


def create_vectorstore(documents):
    """
    Build a Chroma store from documents, persisted under VECTORDB_DIR.

    Embeddings come from OllamaEmbeddings configured with OLLAMA_MODEL.

    Args:
        documents: Documents forwarded to ``Chroma.from_documents``.

    Returns:
        The store returned by ``Chroma.from_documents``.
    """
    store_options = {
        "documents": documents,
        "embedding": OllamaEmbeddings(model=OLLAMA_MODEL),
        "persist_directory": VECTORDB_DIR,
    }
    return Chroma.from_documents(**store_options)


def create_retrieval_qa_chain(vectorstore, llm):
    """
    Build a "stuff"-type RetrievalQA chain over the given vector store.

    Args:
        vectorstore: Store whose ``as_retriever()`` result feeds the chain.
        llm: Language model passed to the chain as ``llm``.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type``.
    """
    retriever = vectorstore.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )
    return qa_chain


def create_langgraph(retrieval_qa_chain, llm):
    """
    Wraps a RetrievalQA chain in a single-node LangGraph workflow.

    The graph has one node, ``"retrieval_qa"``, which forwards the question
    to the chain and stores the chain's answer in the graph state.

    Args:
        retrieval_qa_chain: Chain invoked with ``{"query": <question>}`` whose
            output mapping holds the answer under ``"result"``.
        llm: Unused; kept so existing callers do not need to change. The
            chain already carries the model it answers with.

    Returns:
        The compiled graph. Invoke it with ``{"query": "<question>"}``; the
        resulting state holds the answer under ``"answer"``.
    """
    # Imported locally so this function does not depend on the notebook's
    # import cell, which references names LangGraph does not provide.
    from typing import TypedDict

    from langgraph.graph import END, StateGraph

    class QAState(TypedDict):
        query: str
        answer: str

    def answer_question(state):
        # Node functions return only the keys of the state they update.
        output = retrieval_qa_chain.invoke({"query": state["query"]})
        return {"answer": output["result"]}

    workflow = StateGraph(QAState)
    workflow.add_node("retrieval_qa", answer_question)
    workflow.set_entry_point("retrieval_qa")
    workflow.add_edge("retrieval_qa", END)
    return workflow.compile()


def initialize_chatbot():
    """
    Initializes the chatbot process: scrapes, creates vectorstore, and sets up the Langgraph.

    Reads WEBSITE_URL, VECTORDB_DIR and OLLAMA_MODEL from module scope.

    Returns:
        A ``(langgraph, llm, vectorstore)`` tuple, or ``None`` when
        ``scrape_website`` yields no documents. Callers must check for
        ``None`` before unpacking.
    """
    # 1. Scrape the website
    documents = scrape_website(WEBSITE_URL)

    if not documents:
        print("No documents scraped.  Exiting.")
        return None

    # 2./3. Load the existing vectorstore, or chunk and embed the scrape
    if os.path.exists(VECTORDB_DIR):
        print("Vectorstore already exists.  Loading...")
        # The Chroma constructor takes `embedding_function`; `embedding` is
        # the keyword of Chroma.from_documents only.
        # Note: the freshly scraped documents are not added to an existing store.
        vectorstore = Chroma(
            persist_directory=VECTORDB_DIR,
            embedding_function=OllamaEmbeddings(model=OLLAMA_MODEL),
        )
    else:
        print("Creating new vectorstore...")
        # Chunk only here: the chunks are unused when a store already exists.
        chunked_documents = chunk_documents(documents)
        vectorstore = create_vectorstore(chunked_documents)

    # 4. Initialize Ollama LLM
    llm = Ollama(model=OLLAMA_MODEL)

    # 5. Create RetrievalQA Chain
    retrieval_qa_chain = create_retrieval_qa_chain(vectorstore, llm)

    # 6. Create Langgraph
    langgraph = create_langgraph(retrieval_qa_chain, llm)

    return langgraph, llm, vectorstore


# ----------------------- Main Execution -----------------------

if __name__ == "__main__":
    try:
        # initialize_chatbot() returns None instead of a 3-tuple when nothing
        # was scraped, so check before unpacking.
        chatbot = initialize_chatbot()

        if chatbot is None:
            print("Chatbot was not initialized; skipping OpenWebUI registration.")
        else:
            langgraph, llm, vectorstore = chatbot

            print("Chatbot initialized. Ready to use with OpenWebUI.")

            # Optional: Print some info
            print(f"Using Ollama model: {OLLAMA_MODEL}")
            print(f"Vectorstore directory: {VECTORDB_DIR}")

            # Register the graph under the name it should have in OpenWebUI.
            # NOTE(review): `openwebui.register_langgraph` is assumed to exist
            # (see the import's own comment) -- confirm against the installed package.
            openwebui.register_langgraph(langgraph, "web_chatbot")

    except Exception as e:
        print(f"An error occurred: {e}")