In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [6]:
# def get_all_links(base_url):
#     visited = set()
#     to_visit = [base_url]
#     all_links = []

#     while to_visit:
#         url = to_visit.pop(0)
#         if url in visited:
#             continue
#         visited.add(url)

#         try:
#             response = requests.get(url)
#             response.raise_for_status()
#             soup = BeautifulSoup(response.text, 'html.parser')
#             for link in soup.find_all('a', href=True):
#                 href = link['href']
#                 full_url = urljoin(base_url, href)
#                 if urlparse(full_url).netloc == urlparse(base_url).netloc:
#                     if full_url not in visited:
#                         to_visit.append(full_url)
#                     all_links.append(full_url)
#         except requests.RequestException:
#             continue

#     return all_links

def crawl_domain(start_url, timeout=10):
    """Breadth-first crawl of all same-host pages reachable from ``start_url``.

    Args:
        start_url: Entry point of the crawl. ``http://`` is prepended when the
            value has no ``http://`` / ``https://`` scheme.
        timeout: Seconds passed to ``requests.get`` so that one unresponsive
            page cannot stall the whole crawl.

    Returns:
        list[str]: Unique URLs (fragment removed) that answered with HTTP 200,
        in the order they were crawled.
    """
    from collections import deque
    from urllib.parse import urldefrag

    # Only add a scheme when one is really missing; a bare host such as
    # 'httpbin.org' starts with 'http' but still needs one.
    if not start_url.startswith(('http://', 'https://')):
        start_url = 'http://' + start_url
    start_url = urldefrag(start_url)[0]

    domain = urlparse(start_url).netloc
    crawled_urls = []          # pages fetched successfully, in crawl order
    seen = {start_url}         # every URL ever queued, successful or not
    to_crawl = deque([start_url])

    while to_crawl:
        current_url = to_crawl.popleft()
        try:
            response = requests.get(current_url, timeout=timeout)
            if response.status_code != 200:
                # Already recorded in `seen`, so it will not be requested again.
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            crawled_urls.append(current_url)
            print(f"Crawled: {current_url}")

            for link in soup.find_all('a'):
                href = link.get('href')
                if not href:
                    continue
                # '#section' variants address the same document; drop the fragment.
                full_url = urldefrag(urljoin(current_url, href))[0]
                parts = urlparse(full_url)
                if (
                    parts.scheme in ('http', 'https')
                    and parts.netloc == domain
                    and full_url not in seen
                ):
                    seen.add(full_url)
                    to_crawl.append(full_url)
        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next URL.
            print(f"Error crawling {current_url}: {e}")

    return crawled_urls

In [7]:
# Crawl the whole site starting from its home page, then list every URL that
# crawl_domain() reported as successfully crawled.
site_url = 'https://manjushreefinance.com.np'
links = crawl_domain(site_url)
for link in links:
    print(link)

Crawled: https://manjushreefinance.com.np
Crawled: https://manjushreefinance.com.np/page/company-profile
Crawled: https://manjushreefinance.com.np/page/capital-structure
Crawled: https://manjushreefinance.com.np/team/board-of-directors
Crawled: https://manjushreefinance.com.np/team/management
Crawled: https://manjushreefinance.com.np/team/information-officer
Crawled: https://manjushreefinance.com.np/team/compliance-officer
Crawled: https://manjushreefinance.com.np/deposit
Crawled: https://manjushreefinance.com.np/loan
Crawled: https://manjushreefinance.com.np/digital-banking
Crawled: https://manjushreefinance.com.np/other-services
Crawled: https://manjushreefinance.com.np/rates/interest-rate
Crawled: https://manjushreefinance.com.np/rates/fee-and-charges
Crawled: https://manjushreefinance.com.np/rates/base-and-spread-rate
Crawled: https://manjushreefinance.com.np/reports/basel-reports
Crawled: https://manjushreefinance.com.np/reports/quarterly-reports
Crawled: https://manjushreefinance

In [5]:
import pandas as pd

In [9]:
# Put the crawled URLs into a single-column ('URL') DataFrame.
df = pd.DataFrame(data=links, columns=['URL'])

In [10]:
# Remove repeated URLs; inplace=True mutates `df` rather than returning a new frame.
df.drop_duplicates(inplace=True)

In [13]:
# Save the URL list as CSV without the index column.
# NOTE(review): presumably the ./Links directory must already exist — confirm.
df.to_csv('./Links/Manjushree.csv', index=False)

In [15]:
def fetch_page_content(url, timeout=10):
    """Download a page and return its title and paragraph text.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` so a hanging server
            cannot block the caller indefinitely.

    Returns:
        dict | None: ``{'url', 'title', 'content'}`` where ``content`` is the
        text of every ``<p>`` element joined by newlines, or ``None`` when the
        request fails or returns an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Keep the "None on failure" contract, but say why the page was skipped.
        print(f"Failed to fetch {url}: {exc}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # `.string` is None for an empty <title> or one with nested markup, so
    # collect the text explicitly and fall back to the placeholder when empty.
    title_tag = soup.title
    title = title_tag.get_text(strip=True) if title_tag else ''
    if not title:
        title = 'No Title'

    paragraphs = [p.get_text() for p in soup.find_all('p')]
    return {
        'url': url,
        'title': title,
        'content': '\n'.join(paragraphs)
    }

In [16]:
def organize_website_contents(base_url, links):
    """Fetch every link and map each URL to its extracted page content.

    Args:
        base_url: Accepted for interface compatibility; not used by the body.
        links: Iterable of page URLs to fetch with ``fetch_page_content``.

    Returns:
        dict: ``{url: content}`` for every link whose fetch returned a truthy
        result; links that yield nothing are left out.
    """
    # Lazily pair each URL with its fetch result, then keep only the hits.
    fetched = ((page_url, fetch_page_content(page_url)) for page_url in links)
    return {page_url: page for page_url, page in fetched if page}

In [17]:
# Download every crawled page and collect its title and paragraph text, keyed by URL.
organized_contents = organize_website_contents(site_url, links)

In [5]:
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage

In [3]:
from dotenv import load_dotenv, find_dotenv

In [4]:
# Locate a .env file and load its variables into the process environment.
# NOTE(review): presumably this supplies the API key for the LLM client — confirm.
load_dotenv(find_dotenv())

True

In [7]:
from bs4 import BeautifulSoup
import requests

In [2]:
# from langchain_groq import ChatGroq
from langchain_nvidia_ai_endpoints import ChatNVIDIA

In [6]:
# llm = ChatGroq(
#     model='llama-3.1-70b-versatile',
#     max_tokens=None,
#     max_retries=3,
#     timeout=None,
#     temperature=0.4
# )

# llm = ChatGroq(
#     model='llama3-groq-70b-8192-tool-use-preview',
#     max_tokens=None,
#     max_retries=3,
#     timeout=None,
#     temperature=0.2
# )

# Chat model that the extraction chain sends the page HTML to.
# A low temperature is used; max_tokens and timeout are left unset (None).
llm = ChatNVIDIA(
    model='meta/llama-3.1-405b-instruct',
    max_tokens=None,
    max_retries=3,
    temperature=0.1,
    timeout=None
)

In [8]:
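# System prompt for the scraper bot. Literal JSON braces are doubled ({{ }}) so the
# prompt template does not interpret them as input variables.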
system_template = """
# HTML Content Scraper Bot

You are an AI assistant designed to extract and structure content from HTML. Your task is to analyze the provided HTML body and return clean, highly structured data in a single, comprehensive JSON format. Follow these guidelines:

1. Input: You will receive the HTML body of a web page.

2. Output: Provide a single JSON object containing all structured data extracted from the HTML.

3. Extraction Rules:
   - Identify and extract the main content, ignoring navigation menus, footers, and sidebars.
   - Extract the page title and any global metadata.
   - Identify and extract all products or main sections, including their titles, descriptions, features, associated images, and any hyperlinks or attachment links.
   - Preserve the hierarchical structure of the content.
   - Extract all relevant hyperlinks, including their text and URLs.
   - Identify and extract any attachment links, such as PDFs or other downloadable files.

4. Data Cleaning:
   - Remove any HTML tags from the extracted text, except for hyperlinks which should be preserved in a structured format.
   - Decode HTML entities (e.g., &amp; to &, &quot; to ").
   - Trim leading and trailing whitespace from all extracted text.
   - Normalize whitespace within text (replace multiple spaces with a single space).

5. Output Structure:
   Provide the extracted data in the following JSON format:

   {{
     "title": "string",
     "metadata": {{
       "author": "string or null",
       "date": "string or null",
       "tags": ["string"]
     }},
     "products": [
       {{
         "title": "string",
         "description": "string",
         "features": [
           {{
             "type": "string (e.g., 'unordered')",
             "items": [
               {{
                 "text": "string",
                 "links": [
                   {{
                     "text": "string",
                     "url": "string"
                   }}
                 ]
               }}
             ]
           }}
         ],
         "images": [
           {{
             "url": "string",
             "alt_text": "string"
           }}
         ],
         "attachments": [
           {{
             "name": "string",
             "url": "string"
           }}
         ],
         "links": [
           {{
             "text": "string",
             "url": "string"
           }}
         ]
       }}
     ],
     "global_links": [
       {{
         "text": "string",
         "url": "string"
       }}
     ]
   }}

6. Error Handling:
   - If you encounter any issues parsing the HTML or extracting content, include an "errors" field in the JSON output with relevant error messages.

7. Additional Notes:
   - Ensure all relevant information, including hyperlinks and attachment links, is captured in a single JSON structure.
   - If certain fields are not present for some products, include them as null or empty arrays/objects as appropriate.
   - Preserve the original order of content elements as they appear in the HTML.

Remember, your goal is to provide a single, comprehensive, and highly structured JSON that accurately represents all the main content of the web page, including all relevant links and attachments, making it easy for further processing or analysis.
"""

# Wrap the system instructions in a system-message template.
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)

# Human message template; {html_content} is the only input variable and is
# filled with the page HTML when the chain is invoked.
human_template = "HTML content to scrape:\n{html_content}\n\nPlease provide a single, comprehensive, and highly structured JSON output based on the above HTML content, following the structure and guidelines provided. Include all relevant hyperlinks and attachment links."
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# Combine both messages, system first, into the chat prompt used by the chain.
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

In [9]:
# Pipe the formatted chat prompt into the model; invoked with {'html_content': ...}.
chain = chat_prompt | llm

In [10]:
import pandas as pd
# Load the previously saved link list; the CSV is read by its 'links' column.
linksDf = pd.read_csv('./Links/siteLinks.csv')
links=linksDf['links'].tolist()

In [18]:
# Print the position(s) of the branches-list page within `links`.
for i, val in enumerate(links):
    if val == 'https://www.nicasiabank.com/branches-list/':
        print(i)

26


In [26]:
# Resume with the links that follow the branches-list page. The slice is taken
# from a fresh copy of the full list and derived from the URL itself, so the
# cell is idempotent: re-running it no longer drops another 27 links each time.
SKIP_THROUGH_URL = 'https://www.nicasiabank.com/branches-list/'
all_links = linksDf['links'].tolist()
links = all_links[all_links.index(SKIP_THROUGH_URL) + 1:]

In [11]:
import time
import random

In [12]:
# Results keyed by page URL; values are the LLM output text or an error marker.
siteData = {}

In [30]:
# Placeholder entry for the branches-list page, which is not sent to the LLM here.
siteData['https://www.nicasiabank.com/branches-list/'] = 'Reduce length of the page/handle later'

In [31]:
# Fetch every page and ask the LLM chain to turn its HTML into structured JSON.
# Outcomes are stored in `siteData` under the page URL:
#   - the model's response text on success,
#   - the exception text when the page is too long for the model,
#   - 'Error: <exception>' when all retry attempts fail,
#   - 'Error' when the page itself cannot be downloaded.
intervals = [10, 20, 30]   # back-off in seconds; the last value is reused once exhausted
max_attempts = 5           # upper bound so a persistent failure cannot loop forever
request_timeout = 30       # seconds before giving up on an unresponsive server

for link in links:
    key = f'{link}'
    try:
        resp = requests.get(link, timeout=request_timeout)
        resp.raise_for_status()
    except requests.RequestException:
        print("Failed to Extract information.")
        siteData[key] = 'Error'
        continue

    soup = BeautifulSoup(resp.text, 'html.parser')
    for attempt in range(max_attempts):
        try:
            botResp = chain.invoke({
                'html_content': str(soup)
            })
            siteData[key] = botResp.content
            break  # Success: stop retrying
        except Exception as e:
            # An over-long page will never succeed on retry; record and move on.
            if 'reduce the length of the messages' in str(e):
                siteData[key] = f'{e}'
                break
            if attempt == max_attempts - 1:
                print(f"Failed to invoke chain for {link}: {e}. Giving up after {max_attempts} attempts.")
                siteData[key] = f'Error: {e}'
                break
            # Sleep for exactly the interval that is reported in the message.
            wait = intervals[min(attempt, len(intervals) - 1)]
            print(f"Failed to invoke chain for {link}: {e}. Retrying in {wait} seconds...")
            time.sleep(wait)

In [32]:
import pickle

# Persist the collected results so the scrape does not have to be repeated.
# NOTE(review): presumably ./SiteData must already exist — confirm.
with open('./SiteData/data.pickle', 'wb') as f:
    pickle.dump(siteData, f)

In [37]:
# Write each page's LLM output to its own numbered text file.
# The encoding is set explicitly: the platform default may not be UTF-8, and
# model output containing non-ASCII text would then fail with UnicodeEncodeError.
for i, (url, text) in enumerate(siteData.items()):
    with open(f'./SiteLLMParsedText/{i}_text.txt', 'w', encoding='utf-8') as f:
        f.write(text)

In [38]:
### Note: sending raw HTML to the LLM is not a proper way to parse pages; it reaches the token limit very quickly and is not resource-efficient.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from bs4.element import Tag, NavigableString

In [2]:
# Reload the saved link list; at this point `links` is a DataFrame.
links = pd.read_csv('./Links/siteLinks.csv')

In [3]:
# Replace the DataFrame with a plain list of its 'links' column.
# Re-running this cell alone fails because `links` is then already a list.
links = links['links'].to_list()

In [4]:

class TreeNode:
    """One node of a simplified HTML tree.

    A node is either an element (``tag`` set) or a piece of text (``text`` set);
    ``children`` holds the nested nodes in document order.
    """

    def __init__(self, tag=None, text=None):
        self.tag = tag
        self.text = text
        self.children = []

    def __repr__(self, level=0):
        """Render the subtree as indented pseudo-HTML, two spaces per level."""
        pad = " " * (level * 2)
        is_element = bool(self.tag)

        # Opening line: the tag for elements, the raw text otherwise.
        rows = [f"{pad}<{self.tag}>" if is_element else f"{pad}{self.text}"]
        rows.extend(child.__repr__(level + 1) for child in self.children)
        if is_element:
            rows.append(f"{pad}</{self.tag}>")
        return "\n".join(rows)

def html_to_tree(html):
    """Parse an HTML string with the lxml parser and convert its <html> element
    into a TreeNode hierarchy via ``parse_element``."""
    document = BeautifulSoup(html, 'lxml')
    root_element = document.html
    return parse_element(root_element)

def parse_element(element):
    """Recursively convert a BeautifulSoup node into a TreeNode.

    Args:
        element: A ``Tag``, a ``NavigableString``, or any other object.

    Returns:
        TreeNode | None: An element node (with converted children) for a
        ``Tag``; a text node holding the stripped text for a non-blank
        ``NavigableString``; ``None`` for whitespace-only text and for any
        other input, so such nodes never end up in the tree.
    """
    if isinstance(element, Tag):
        node = TreeNode(tag=element.name)
        for child in element.children:
            child_node = parse_element(child)
            # TreeNode instances are always truthy, so test for None explicitly.
            if child_node is not None:
                node.children.append(child_node)
        return node
    if isinstance(element, NavigableString):
        text = element.strip()
        # Formatting whitespace between tags carries no content; skip it
        # instead of emitting an empty text node.
        return TreeNode(text=text) if text else None
    return None

In [6]:
# Smoke test: download only the first link (the loop exits via `break`),
# raise on an HTTP error status, and print the page's tag tree.
for link in links:
    page = requests.get(link)
    page.raise_for_status()
    linkHTML = page.text
    print(html_to_tree(linkHTML))
    break

<html>
  <head>
    <meta>
    </meta>
    <title>
      NIC ASIA BANK LIMITED
    </title>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <link>
    </link>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <meta>
    </meta>
    <link>
    </link>
    <link>
    </link>
    <link>
    </link>
    <link>
    </link>
    <link>
    </link>
    <noscript>
    </noscript>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </script>
    <script>
    </scrip