In [33]:
# NOTE(review): presumably needed so the AsyncHtmlLoader.load() call in a later
# cell can run inside the event loop the notebook kernel already owns -- confirm.
import nest_asyncio

# Must run before any cell that starts its own asyncio work.
nest_asyncio.apply()

In [11]:
import requests
from urllib.parse import urljoin, urlparse
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import re

def normalize_url(url):
    """Return ``url`` with an http(s) scheme and a trailing slash.

    Surrounding whitespace is removed, ``https://`` is prepended when the value
    does not already start with ``http://`` or ``https://`` (compared
    case-insensitively), and ``/`` is appended when it is missing.

    Args:
        url: A bare host such as ``"example.com"`` or a full URL.

    Returns:
        The normalized URL string.
    """
    # The value usually comes straight from input(), so drop stray spaces/newlines.
    url = url.strip()
    # URL schemes are case-insensitive: "HTTPS://host" already has a scheme and
    # must not be turned into "https://HTTPS://host".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    # The trailing slash makes urljoin() treat the URL as a directory base.
    if not url.endswith("/"):
        url += "/"
    return url

def fetch_urls_from_sitemap_index(sitemap_index_url, timeout=10):
    """Download a sitemap index and return the child sitemap URLs it lists.

    Args:
        sitemap_index_url: URL of the XML document to download.
        timeout: Value forwarded to ``requests.get(..., timeout=...)`` so a
            stalled server cannot block the call indefinitely.

    Returns:
        A list with the stripped ``<loc>`` text of every ``<sitemap>`` element
        directly under the document root. Entries without a usable ``<loc>``
        are skipped. An empty list is returned (after printing the error) when
        the request fails or the body is not well-formed XML.
    """
    namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    try:
        response = requests.get(sitemap_index_url, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except (requests.RequestException, ET.ParseError) as e:
        print(f"Error fetching sitemap index: {e}")
        return []

    urls = []
    for sitemap in root.findall(f"{namespace}sitemap"):
        # findtext() yields None for a missing <loc> and "" for an empty one,
        # so neither case can raise AttributeError or add a blank entry.
        loc = (sitemap.findtext(f"{namespace}loc") or "").strip()
        if loc:
            urls.append(loc)
    return urls

def fetch_urls_from_sitemap(sitemap_url, timeout=10):
    """Download an XML sitemap and return the page URLs it lists.

    Args:
        sitemap_url: URL of the XML document to download.
        timeout: Value forwarded to ``requests.get(..., timeout=...)`` so a
            stalled server cannot block the call indefinitely.

    Returns:
        A list with the stripped ``<loc>`` text of every ``<url>`` element
        directly under the document root. Entries without a usable ``<loc>``
        are skipped. An empty list is returned (after printing the error) when
        the request fails or the body is not well-formed XML.
    """
    namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    try:
        response = requests.get(sitemap_url, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except (requests.RequestException, ET.ParseError) as e:
        print(f"Error fetching sitemap: {e}")
        return []

    urls = []
    for entry in root.findall(f"{namespace}url"):
        # findtext() yields None for a missing <loc> and "" for an empty one,
        # so neither case can raise AttributeError or add a blank entry.
        loc = (entry.findtext(f"{namespace}loc") or "").strip()
        if loc:
            urls.append(loc)
    return urls

def fetch_urls_from_html_sitemap(html_sitemap_url, timeout=10):
    """Collect the http(s) links found on an HTML sitemap page.

    Every ``<a href=...>`` on the page is resolved against ``html_sitemap_url``
    so that relative links are kept. Empty hrefs, same-page anchors (``#...``)
    and non-http(s) targets such as ``mailto:`` are skipped. Each collected
    URL is also printed.

    Args:
        html_sitemap_url: URL of the HTML page to download.
        timeout: Value forwarded to ``requests.get(..., timeout=...)`` so a
            stalled server cannot block the call indefinitely.

    Returns:
        A list of absolute URLs in document order (duplicates are kept). An
        empty list is returned (after printing the error) when the request
        fails.
    """
    print("Looking for html sitemap...")
    try:
        response = requests.get(html_sitemap_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching HTML sitemap: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    urls = []
    for a_tag in soup.find_all('a', href=True):
        raw_href = a_tag['href'].strip()
        # An empty href or a bare fragment only points back at this page.
        if not raw_href or raw_href.startswith('#'):
            continue
        # Relative links ("/about", "docs/intro") become absolute here;
        # absolute links pass through urljoin() unchanged.
        href = urljoin(html_sitemap_url, raw_href)
        if href.startswith(('http://', 'https://')):
            urls.append(href)
            print(href)
    return urls

def crawl_website(start_url, timeout=10, max_pages=None):
    """Crawl pages reachable from ``start_url`` and return the same-host links found.

    Pages are requested in discovery order (breadth-first). Every
    ``<a href=...>`` is resolved against the page it appears on, its
    ``#fragment`` is dropped, and it is kept only when its host matches the
    host of ``start_url``. Pages that fail to download are skipped.

    Args:
        start_url: URL where the crawl begins; its host defines the site.
        timeout: Value forwarded to ``requests.get(..., timeout=...)`` so a
            stalled server cannot block the crawl indefinitely.
        max_pages: Maximum number of pages to request, or ``None`` (default)
            to keep going until no unvisited same-host link is left.

    Returns:
        A list of the discovered same-host URLs, each listed once, in
        discovery order. ``start_url`` itself is not included.
    """
    print("Found URLs, now crawling websites...")
    # Parsed once here instead of once per link.
    site_host = urlparse(start_url).netloc
    seen = {start_url}
    queue = [start_url]
    urls = []
    position = 0  # index of the next queued page to request

    while position < len(queue):
        if max_pages is not None and position >= max_pages:
            break
        current_url = queue[position]
        position += 1

        try:
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for a_tag in soup.find_all('a', href=True):
            try:
                # Dropping the fragment stops "page#a" and "page#b" from
                # being crawled as two different pages.
                target = urlparse(urljoin(current_url, a_tag['href']))._replace(fragment="")
            except ValueError:
                # A malformed href (e.g. a broken IPv6 literal) must not abort the crawl.
                continue
            if target.netloc != site_host:
                continue
            full_url = target.geturl()
            if full_url in seen:
                continue
            seen.add(full_url)
            queue.append(full_url)
            urls.append(full_url)

    return urls

def discover_all_urls(website_url, crawl=True):
    """Gather URLs for a site from its XML sitemap, HTML sitemap and a link crawl.

    Steps, in order:
      1. ``<site>/sitemap.xml`` is read as a sitemap index; each child sitemap
         it lists is then read for page URLs. When the index yields nothing,
         ``sitemap.xml`` itself is read as a plain sitemap.
      2. ``<site>/sitemap`` is read as an HTML sitemap.
      3. Unless ``crawl`` is false, the site is crawled from its root page.

    Args:
        website_url: Host or URL of the site; passed through ``normalize_url``.
        crawl: Set to ``False`` to skip step 3, which is the slowest stage.
            Defaults to ``True``.

    Returns:
        A sorted list of the distinct URLs found. Sorting keeps the result
        stable between runs; empty or ``None`` entries are dropped.
    """
    website_url = normalize_url(website_url)
    urls = set()

    # Only /sitemap.xml is probed; it may be an index or a plain sitemap.
    sitemap_index_url = urljoin(website_url, "sitemap.xml")
    sitemap_urls = fetch_urls_from_sitemap_index(sitemap_index_url)
    if not sitemap_urls:
        # Not an index (or unreadable): retry the same file as a plain sitemap.
        sitemap_urls = [sitemap_index_url]

    for sitemap_url in sitemap_urls:
        urls.update(fetch_urls_from_sitemap(sitemap_url))

    # Check for HTML sitemap
    html_sitemap_url = urljoin(website_url, "sitemap")
    urls.update(fetch_urls_from_html_sitemap(html_sitemap_url))

    # Crawl the website
    if crawl:
        urls.update(crawl_website(website_url))

    # A set has no stable order, and a None entry would make sorted() raise.
    return sorted(url for url in urls if url)




In [12]:
if __name__ == "__main__":
    # Prompt for the target site, then run the full discovery pipeline.
    # `all_urls` stays at notebook scope because the loader cell reads it.
    website = input("Enter the website (e.g., inzint.com): ")
    all_urls = discover_all_urls(website)

    # Report the failure case first; otherwise list one URL per line.
    if not all_urls:
        print("No URLs found or an error occurred.")
    else:
        print("All URLs found on the website:")
        for url in all_urls:
            print(url)

Error fetching sitemap index: 500 Server Error: Internal Server Error for url: https://www.uhbvn.org.in/sitemap.xml
Error fetching sitemap: 500 Server Error: Internal Server Error for url: https://www.uhbvn.org.in/sitemap.xml
Looking for html sitemap.
https://www.uhbvn.org.in/web/portal/about-uhbvn
https://www.uhbvn.org.in/web/portal/vision
https://www.uhbvn.org.in/web/portal/mission
https://www.uhbvn.org.in/web/portal/board-of-directors
https://www.uhbvn.org.in/web/portal/board-committee
https://www.uhbvn.org.in/web/portal/organization-structure
https://www.uhbvn.org.in/web/portal/annual-financial-statements
https://www.uhbvn.org.in/web/portal/csr-activities
https://www.uhbvn.org.in/web/portal/statistics1
https://www.uhbvn.org.in/web/portal/tender-notice
https://www.uhbvn.org.in/web/portal/technical-spec
https://www.uhbvn.org.in/web/portal/item-wise-rates
https://www.uhbvn.org.in/staticContent/documents/LSREC.pdf
https://www.uhbvn.org.in/staticContent/documents/Empaneled_Firms.pdf
htt

KeyboardInterrupt: 

# LLM setup and page loading (Ollama + LangChain)

In [None]:
from langchain_community.llms import Ollama

# LLM handle configured for the "mistral" model.
# NOTE(review): `llm` is not referenced by any other cell visible in this notebook.
llm = Ollama(model="mistral")

In [None]:
from langchain_community.embeddings import OllamaEmbeddings
# Embedding handle configured for the "nomic-embed-text" model.
# NOTE(review): `embeddings` is not referenced by any other cell visible in this notebook.
embeddings = OllamaEmbeddings(model="nomic-embed-text")


In [None]:
from langchain_community.document_loaders import AsyncHtmlLoader

# Requires `all_urls` from the URL-discovery cell; run that cell first on a fresh kernel.
# NOTE(review): `all_urls` can contain non-HTML targets (the captured run lists .pdf
# links) -- confirm the loader copes with them or filter beforehand.
urls = all_urls
loader = AsyncHtmlLoader(urls)
docs = loader.load()

In [None]:
from langchain_community.document_transformers import Html2TextTransformer

# Run the transformer over `docs` from the loader cell; the result is kept under a
# new name so the loaded documents remain available unchanged.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

In [None]:
# Print every transformed document in full.
# NOTE(review): this can produce a very large output for big sites; consider
# showing only a slice (e.g. docs_transformed[:3]) before sharing the notebook.
for doc in docs_transformed:
    print(doc)