# This is an example notebook

Example Notebook

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
from typing import List
import time

def extract_nav_urls(homepage_url: str) -> List[str]:
    """
    Extract all navigation URLs from a webpage's navigation elements using Selenium.
    
    Args:
        homepage_url (str): The URL of the webpage to analyze
        
    Returns:
        List[str]: List of absolute URLs found in navigation elements
        
    Raises:
        WebDriverException: If there's an error with the browser automation
        ValueError: If the URL is invalid
    """
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run in headless mode
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--window-size=1920,1080')
    
    # Add random user agent
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    nav_urls = set()
    driver = None
    
    try:
        # Initialize the driver
        driver = webdriver.Chrome(options=chrome_options)
        
        # Set page load timeout
        driver.set_page_load_timeout(20)
        
        # Load the page
        driver.get(homepage_url)
        
        # Wait for the page to load and nav elements to be present
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        
        # Add small delay to ensure dynamic content loads
        time.sleep(2)
        
        # Find all navigation elements
        nav_elements = driver.find_elements(By.CSS_SELECTOR, 'nav, header, [class*="nav"], [class*="menu"], [id*="nav"], [id*="menu"]')
        
        for nav in nav_elements:
            # Find all links within navigation elements
            links = nav.find_elements(By.TAG_NAME, 'a')
            for link in links:
                try:
                    href = link.get_attribute('href')
                    if href:
                        # Convert relative URLs to absolute URLs
                        absolute_url = urljoin(homepage_url, href)
                        # Skip javascript links and anchor links
                        if absolute_url.startswith('http') and '#' not in absolute_url:
                            nav_urls.add(absolute_url)
                except Exception:
                    continue
        
        return list(nav_urls)
    
    except Exception as e:
        raise Exception(f"Error extracting navigation URLs: {str(e)}")
    
    finally:
        if driver:
            driver.quit()

In [4]:
result=extract_nav_urls("https://www.codewithharry.com")

In [5]:
result

['https://www.codewithharry.com/shop/',
 'https://www.codewithharry.com/refund/',
 'https://www.codewithharry.com/',
 'https://www.codewithharry.com/blog/',
 'https://www.codewithharry.com/terms/',
 'https://www.codewithharry.com/videos/',
 'https://www.codewithharry.com/my-gear/',
 'https://www.codewithharry.com/work/',
 'https://www.codewithharry.com/notes/',
 'https://www.codewithharry.com/tutorials/',
 'https://www.codewithharry.com/contact/',
 'https://www.codewithharry.com/privacy/']

In [6]:
from pydantic import BaseModel, Field, AnyUrl
from typing import List

class UrlClassify(BaseModel):
    "classfication of url based on its type and usage"
    desc_urls: List[AnyUrl] = Field(
        ..., 
        description="A list of URLs that provide descriptions, company info, blogs, or other non-product/service-related content.",
        example=["https://example.com/about-us", "https://example.com/contact"]
    )
    product_service_urls: List[AnyUrl] = Field(
        ..., 
        description="A list of URLs related to products or services offered by the business.",
        example=["https://example.com/shop", "https://example.com/service-web-development"]
    )


In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain_core.documents.base import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from typing import List



print(load_dotenv())
import os 
os.environ['GOOGLE_API_KEY']=os.getenv("GOOGLE_API_KEY")
model_name=os.getenv("MODEL")

llm=ChatGoogleGenerativeAI(model=model_name)
# Create FAISS vectorstore for product information
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

ImportError: cannot import name 'D' from 'langchain_core.documents.base' (c:\Users\AkshayKumarBM\miniconda3\Lib\site-packages\langchain_core\documents\base.py)

In [8]:
web_classfier=llm.with_structured_output(UrlClassify)

In [9]:
result=web_classfier.invoke("['https://www.codewithharry.com/work/', 'https://www.codewithharry.com/contact/', 'https://www.codewithharry.com/my-gear/', 'https://www.codewithharry.com/refund/', 'https://www.codewithharry.com/blog/', 'https://www.codewithharry.com/tutorials/', 'https://www.codewithharry.com/shop/', 'https://www.codewithharry.com/privacy/', 'https://www.codewithharry.com/', 'https://www.codewithharry.com/notes/', 'https://www.codewithharry.com/terms/', 'https://www.codewithharry.com/videos/']")

In [10]:
result

UrlClassify(desc_urls=[AnyUrl('https://www.codewithharry.com/work/'), AnyUrl('https://www.codewithharry.com/contact/'), AnyUrl('https://www.codewithharry.com/my-gear/'), AnyUrl('https://www.codewithharry.com/refund/'), AnyUrl('https://www.codewithharry.com/blog/'), AnyUrl('https://www.codewithharry.com/privacy/'), AnyUrl('https://www.codewithharry.com/'), AnyUrl('https://www.codewithharry.com/notes/'), AnyUrl('https://www.codewithharry.com/terms/')], product_service_urls=[AnyUrl('https://www.codewithharry.com/tutorials/'), AnyUrl('https://www.codewithharry.com/shop/'), AnyUrl('https://www.codewithharry.com/videos/')])

In [11]:
from langchain_community.document_loaders import UnstructuredURLLoader

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
desc_loader=UnstructuredURLLoader(urls=result.desc_urls)
Desc_data=desc_loader.load()

spitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
Final_Desc_data=spitter.split_documents(Desc_data)
Final_Desc_data


[Document(metadata={'source': AnyUrl('https://www.codewithharry.com/work/')}, page_content='</> CodeWithHarry\n\nMenu\n\nHome\n\nCourses\n\nTutorial\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With Us\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\nPYTHON\n\nPHP\n\nREACT JS\n\nHome\n\nCourses\n\nTutorial\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\nPYTHON\n\nPHP\n\nREACT JS\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With Us\n\nWe are hiring!\n\nWe are looking for freelance Developers, Subtitle writers, Content writers and Video editors. If you think you are fit for the role. Submit the form. Make sure you have a valid email so we can contact you back in case your application gets selected. Cheers!\n\nApply Now!\n\nWe will be in touch soon!\n\nCodeWithHarryManaged by CWH Solutions'),
 Document(metadata={'source': AnyUrl('https://www.codewithharry.com/contact/')}, page_content='</> CodeWithHarry\n\nMenu\n\nHome\n\nCourses\n\nTutorial\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With 

In [13]:
prod_loader=UnstructuredURLLoader(urls=result.product_service_urls)
Prod_data=prod_loader.load()

from langchain_text_splitters import RecursiveCharacterTextSplitter
spitter=RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
Final_Prod_data=spitter.split_documents(Prod_data)
Final_Prod_data


[Document(metadata={'source': AnyUrl('https://www.codewithharry.com/tutorials/')}, page_content='</> CodeWithHarry\n\nMenu\n\nHome\n\nCourses\n\nTutorial\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With Us\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\nPYTHON\n\nPHP\n\nREACT JS\n\nHome\n\nCourses\n\nTutorial\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\nPYTHON\n\nPHP\n\nREACT JS\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With Us\n\nTutorials\n\nPython Tutorial\n\nC Tutorial\n\nC++ Tutorial\n\nJava Tutorial\n\nHTML Tutorial\n\nCSS Tutorial\n\nJavaScript Tutorial\n\nPHP Tutorial\n\nReact JS Tutorial\n\nCodeWithHarryManaged by CWH Solutions'),
 Document(metadata={'source': AnyUrl('https://www.codewithharry.com/shop/')}, page_content='</> CodeWithHarry\n\nMenu\n\nHome\n\nCourses\n\nTutorial\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With Us\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\nPYTHON\n\nPHP\n\nREACT JS\n\nHome\n\nCourses\n\nTutorial\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\n

In [14]:
# Convert product_info to Documents
def convert_product_info_to_documents(product_info: List[dict]) -> List[Document]:
    documents = []
    for produt in product_info:
        # Convert product dict to string for embedding
        product_str = produt
        doc = Document(
            page_content=str(product_str),

        )
        documents.append(doc)
    return documents

In [None]:
from functools import reduce
def merge_list_products(lp1, lp2):
	return List_product(products=lp1.products + lp2.products)

class ProductService(BaseModel):
    name : str = Field(None ,description="name of the product or service")
    description : str = Field(None ,description="description of the product or service")
    price : float = Field(None, description="price of the product or service")
    specifications : str = Field(None ,description="specifications of the product or service")
    features : str = Field(None, description="features of the product or service")
    
class List_product(BaseModel):
    products: List[ProductService]
prod_formate_llm=llm.with_structured_output(List_product)

product_info=[]
for docs in Final_Prod_data[:3]:
    product_info.append(prod_formate_llm.invoke(docs.page_content))

result=reduce(merge_list_products, product_info)



In [None]:
#if you want to update the product or service information do it here , you also delete from the list 
result.products

[ProductService(name='Handwritten C Notes', description=None, price=99.0, specifications='covering fundamentals, loops, arrays, and more.', features=None),
 ProductService(name='Handwritten C++ Notes', description=None, price=99.0, specifications='covering classes, inheritance, and polymorphism.', features=None),
 ProductService(name='Handwritten HTML Notes', description=None, price=99.0, specifications='covering tags, elements, and attributes.', features=None),
 ProductService(name='CSS Ebook', description=None, price=99.0, specifications='flexbox, grid, and animations', features=None),
 ProductService(name='Handwritten JavaScript Notes', description=None, price=99.0, specifications='covering variables, functions, loops, and ES6+ features.', features=None),
 ProductService(name='Custom Error Solving!', description=None, price=99999.0, specifications='We will solve your custom errors and provide you with a summary of the solution.', features=None),
 ProductService(name='Sigma Web Devel

In [104]:

product_docs = convert_product_info_to_documents(result.products)

In [102]:
product_docs

[Document(metadata={}, page_content="name='Handwritten C Notes' description=None price=99.0 specifications='covering fundamentals, loops, arrays, and more.' features=None"),
 Document(metadata={}, page_content="name='Handwritten C++ Notes' description=None price=99.0 specifications='covering classes, inheritance, and polymorphism.' features=None"),
 Document(metadata={}, page_content="name='Handwritten HTML Notes' description=None price=99.0 specifications='covering tags, elements, and attributes.' features=None"),
 Document(metadata={}, page_content="name='CSS Ebook' description=None price=99.0 specifications='flexbox, grid, and animations' features=None"),
 Document(metadata={}, page_content="name='Handwritten JavaScript Notes' description=None price=99.0 specifications='covering variables, functions, loops, and ES6+ features.' features=None"),
 Document(metadata={}, page_content="name='Custom Error Solving!' description=None price=99999.0 specifications='We will solve your custom er

In [16]:
product_vectorstore = FAISS.from_documents(
    documents=product_docs,
    embedding=embeddings
)

# Save the vector store
product_vectorstore.save_local("vectors/product_info_index")


In [17]:
product_vectorstore = FAISS.from_documents(
    documents=Final_Desc_data,
    embedding=embeddings
)
product_vectorstore.save_local("vectors/description_index")


In [18]:
prod_vector=FAISS.load_local("vectors/product_info_index", embeddings,allow_dangerous_deserialization=True)
product_retreiver=prod_vector.as_retriever()

In [19]:
desc_vector=FAISS.load_local("vectors/description_index", embeddings,allow_dangerous_deserialization=True)
desc_retreiver=desc_vector.as_retriever()

In [30]:
product_retreiver.invoke("hi")

[Document(id='3f4ff364-0803-496f-8d8f-9e4410d94168', metadata={}, page_content="name='CodeWithHarry' description='CodeWithHarry Managed by CWH Solutions' price=0.0 specifications='Tutorials, Python Tutorial, C Tutorial, C++ Tutorial, Java Tutorial, HTML Tutorial, CSS Tutorial, JavaScript Tutorial, PHP Tutorial, React JS Tutorial' features='HTML, CSS, JS, C, C++, JAVA, PYTHON, PHP, REACT JS'"),
 Document(id='99071055-4c81-4fa1-b10c-9e7e172e8211', metadata={}, page_content="name='Ultimate JavaScript Course' description='This latest JavaScript course comes with premium curriculum that covers everything from basics to advance. On top of that, you will get my handwritten notes of JS for completely free. What are you waiting for? Just Enroll Buddy' price=0.0 specifications='basics to advance' features='premium curriculum, handwritten notes'"),
 Document(id='636abeb0-5865-4507-a904-61258ed1a1d7', metadata={}, page_content="name='Handwritten C Notes' description='Master C programming with thes

In [21]:
desc_retreiver.invoke("hi")

[Document(id='eeae9155-554e-473e-a0d8-b36a00a38287', metadata={'source': AnyUrl('https://www.codewithharry.com/contact/')}, page_content='</> CodeWithHarry\n\nMenu\n\nHome\n\nCourses\n\nTutorial\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With Us\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\nPYTHON\n\nPHP\n\nREACT JS\n\nHome\n\nCourses\n\nTutorial\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\nPYTHON\n\nPHP\n\nREACT JS\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With Us\n\nFeel free to contact me!\n\nCodeWithHarryManaged by CWH Solutions'),
 Document(id='991e8d54-eecc-4ced-9f31-d6318f2e25d1', metadata={'source': AnyUrl('https://www.codewithharry.com/')}, page_content='Mohit KumarWeb Developer\n\nFor everyone who wants to level up their #Coding and #Dev skills - seriously, this channel is for you! Both basic and advanced stacks are covered on this channel, and one can learn according to his skill levels. And the icing on the cake is, everything is available for free.\n\nRakesh ShettyWe

In [None]:
class SelectRetriverRatio(BaseModel):
    """Analyze the user query and return a ratio based on its nature. 
    If the query is more product/service-related, the ratio is closer to 1. 
    If it is more descriptive/informational, the ratio is closer to 0."""
    
    ratio: float = Field(None description="Ratio indicating the strength of product/service (1) vs. description (0)")


In [24]:
retriever_selecter=llm.with_structured_output(SelectRetriverRatio)

SelectRetriverRatio(ratio=0.0)

In [None]:
#When the user query ask the question about the in chatbot
# User side 
from langchain_core.prompts import ChatPromptTemplate


In [66]:
def retrieve_documents_based_on_ratio(ratio: float, Query: str) -> List[Document]:


    num_desc_docs = int((1 - ratio) * 10)
    num_prod_docs = 10 - num_desc_docs

    desc_docs = desc_retreiver.invoke(Query,)[:num_desc_docs]
    prod_docs = product_retreiver.invoke(Query)[:num_prod_docs]

    return desc_docs + prod_docs

query="do you have python notes " 
ratio=retriever_selecter.invoke(query).ratio
print(ratio)
documents = retrieve_documents_based_on_ratio(ratio,query)
print(documents)


0.0
[Document(id='c8e9261c-c3e9-4f3d-8fed-59ac374eab6d', metadata={'source': AnyUrl('https://www.codewithharry.com/notes/')}, page_content='</> CodeWithHarry\n\nMenu\n\nHome\n\nCourses\n\nTutorial\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With Us\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\nPYTHON\n\nPHP\n\nREACT JS\n\nHome\n\nCourses\n\nTutorial\n\nHTML\n\nCSS\n\nJS\n\nC\n\nC++\n\nJAVA\n\nPYTHON\n\nPHP\n\nREACT JS\n\nBlog\n\nNotes\n\nContact\n\nMy Gear\n\nWork With Us\n\nDownload Notes by CodeWithHarry\n\nPython Notes\n\nDownload Notes Here\n\nC Notes\n\nDownload Notes Here\n\nAndroid Notes\n\nDownload Notes Here\n\nJava Notes\n\nDownload Notes Here\n\nHTML Notes\n\nDownload Notes Here\n\nCSS Notes\n\nDownload Notes Here\n\nDSA Notes\n\nNotes in progress\n\nJavascript Notes\n\nDownload Notes Here\n\nDownload Cheatsheets by CodeWithHarry\n\nPython CheatSheet\n\nDownload Cheatsheet Here\n\nC CheatSheet\n\nDownload CheatSheet Here\n\nC++ Cheatsheet\n\nDownload Cheatsheet Here\n\nJava

In [None]:
prompt_template=ChatPromptTemplate.from_messages(
"Act as Customer Support Manager \
Your task is to respond to the following customer query: {query} \
 Provide the most relevant information based on the query and keep the massge on point. \
 You have access to the following documents: {documents}"

)

In [72]:
chain =prompt_template | llm 
respone_massage =chain.invoke({"query":query,"documents":documents})
final_result=respone_massage.content