In [2]:
!pip install langchain langchain-classic langchain-community langchain-huggingface langchain-unstructured unstructured unstructured-client "unstructured[html]" chromadb

Collecting langchain-classic
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-unstructured
  Downloading langchain_unstructured-1.0.1-py3-none-any.whl.metadata (3.2 kB)
Collecting unstructured
  Downloading unstructured-0.18.21-py3-none-any.whl.metadata (25 kB)
Collecting unstructured-client
  Downloading unstructured_client-0.42.6-py3-none-any.whl.metadata (23 kB)
Collecting chromadb
  Downloading chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-core<2.0.0,>=1.2.1 (from langchain)
  Downloading langchain_core-1.2.5-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-text-splitters<2.0.0,>=1.1.0 (from langchain-classic)
  Downloading langchain_text_splitters-1.1.0-

In [7]:
!unzip chroma_db.zip -d /content/chroma_db

Archive:  chroma_db.zip
   creating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/
  inflating: /content/chroma_db/chroma.sqlite3  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/index_metadata.pickle  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/link_lists.bin  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/header.bin  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/length.bin  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/data_level0.bin  


In [3]:
import requests
from bs4 import BeautifulSoup , Comment
import tempfile
import os
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_community.vectorstores import Chroma
import re



# Get the html content from the research center page 'https://www.iitism.ac.in/center'

In [4]:
def load_html(url, timeout=10, verify=False):
    """Download a page and return its body as text.

    Args:
        url: Address to fetch. A falsy value (``None`` or ``""``) returns an
            empty string without making a request.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds).
        verify: Value forwarded to ``requests.get`` as ``verify``. The default
            stays ``False`` so existing calls behave as before.
            NOTE(review): ``False`` disables TLS certificate verification;
            pass ``True`` unless the target host's certificate is known to
            fail validation.

    Returns:
        ``response.text`` on success, or ``""`` for a falsy ``url``.

    Raises:
        requests.exceptions.RequestException: for any request failure,
            including a 4xx/5xx status. The error is printed, then re-raised.
    """
    if not url:
        return ""

    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=timeout,
            verify=verify,
        )
        response.raise_for_status()  # raises HTTPError for 4xx/5xx
    except requests.exceptions.RequestException as e:
        print(f"Error loading {url}: {e}")
        raise

    return response.text


# Remove comments from html content

In [5]:
def remove_comments(soup):
  """Detach every HTML comment node from ``soup`` in place; returns ``None``."""
  def _is_comment(node):
    return isinstance(node, Comment)

  comment_nodes = soup.find_all(string=_is_comment)
  for node in comment_nodes:
    node.extract()

# Define the embedding model

In [8]:
from google.colab import userdata

# Embedding client for the vector store. The Hugging Face token is read from
# the Colab secret named HF_TOKEN rather than being written into the notebook.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

embedding_model = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=userdata.get('HF_TOKEN'),
    model=EMBEDDING_MODEL_NAME,
)

# define vector store

In [9]:
# Chroma collection stored under ./chroma_db; texts are embedded with
# `embedding_model` when they are added or searched.
vector_store = Chroma(
    collection_name="collection_research_guide",
    persist_directory="chroma_db",
    embedding_function=embedding_model,
)

  vector_store = Chroma(


# Build structured documents from the elements returned by UnstructuredHTMLLoader

In [10]:
def get_structured_docs(docs, url):
    """Turn loader elements into self-describing chunks ready for embedding.

    Elements are walked in order. An element whose ``category`` metadata is a
    title or header is not emitted; it only updates the running title/header
    context (a new title also clears the header). Ignored categories are
    dropped. Every other element with non-blank text becomes one ``Document``
    whose text is prefixed with the current title, header and source URL.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict that may hold
            a ``"category"`` string) and ``page_content`` (text).
        url: Source page address recorded on every chunk. A falsy value is
            stored as ``"Not available"``.

    Returns:
        list: One ``Document`` per kept element, in input order. Each has
        ``title``, ``header`` and ``url`` metadata keys.
    """
    not_available = "Not available"

    # Categories are compared in lower case.
    title_categories = {"title", "pagetitle", "main title"}
    header_categories = {
        "header", "header1", "header2", "header3",
        "header4", "header5", "header6", "subheading",
    }
    ignore_categories = {"figure", "image", "uncategorizedtext"}

    structured_docs = []
    current_title = None
    current_header = None
    source_url = url or not_available

    for d in docs:
        # A missing or None category is treated as ordinary content rather
        # than raising on ``None.lower()``.
        category = (d.metadata.get("category") or "").lower()
        text = (d.page_content or "").strip()

        # Blank elements are skipped before any category handling, so an empty
        # title never resets the running context.
        if not text:
            continue

        if category in title_categories:
            current_title = text
            current_header = None  # a new title starts a fresh header context
            continue
        if category in header_categories:
            current_header = text
            continue
        if category in ignore_categories:
            continue

        # Resolve the fallbacks before formatting: reusing the same quote type
        # inside an f-string is a SyntaxError before Python 3.12.
        title = current_title or not_available
        header = current_header or not_available

        structured_docs.append(
            Document(
                page_content=f" Title-{title} , heading-{header} , url-{source_url} \n\n description - {text}",
                metadata={
                    "title": title,
                    "header": header,
                    "url": source_url,
                },
            )
        )

    return structured_docs

# Prepare the centers array

In [11]:
html_content = load_html("https://www.iitism.ac.in/center")

soup = BeautifulSoup(html_content, "lxml")

# remove comments from html
remove_comments(soup)

# Each candidate is a "modal-content" div; its "modal-body" supplies the
# description and the first link in its "modal-footer" supplies the URL.
divs = soup.find_all("div", class_="modal-content")
centers = []

for div in divs:
    footer = div.find("div", class_="modal-footer")
    body = div.find("div", class_="modal-body")
    if footer is None or body is None:
        continue

    # Skip a footer with no link (or a link without href) instead of crashing
    # on None["href"] / KeyError.
    anchor = footer.find("a")
    if anchor is None or not anchor.has_attr("href"):
        continue

    centers.append({
        "link": anchor["href"],
        "desc": body.text.strip(),
    })



# Create documents from the centers array

In [13]:
# Positions within `centers` of the entries to turn into documents.
indexs = [3, 4, 6]

# One Document per selected entry: description as text, link as metadata.
docs = [
    Document(
        metadata={"url": centers[index]["link"]},
        page_content=centers[index]["desc"],
    )
    for index in indexs
]

In [14]:
# Show the documents built from the selected centres.
docs

[Document(metadata={'url': 'https://people.iitism.ac.in/~nvchccust/'}, page_content='Naresh Vashisht Centre for Hydrogen & CCUS Technology\nThe center formally started operating on 10th January 2023, the date on which the Memorandum of Association (MoA) was signed between the Indian Institute of Technology (Indian School of Mines) Dhanbad and the Vashisht Foundation.'),
 Document(metadata={'url': 'https://people.iitism.ac.in/~csm/'}, page_content='Centre for Societal Mission\nIn consonance with the ‘Unnat Bharat Abhiyan’ scheme initiated by Shri Narendra Modi, Hon’ble Prime Minister of India, a “Center of Societal Mission (CSM)” has been operational since 2015 at Indian Institute of Technology (Indian School of Mines) Dhanbad as a part of national initiative of Government of India to enable processes that connect institutes of higher education with local communities, to address the development and challenges of rural India through participatory processes and to utilize an appropriate t

In [15]:
# Add the centre documents one at a time so each URL is reported as soon as it
# has been stored.
for doc in docs:
    url = doc.metadata["url"]

    vector_store.add_documents([doc])
    print(f"Added {url} docs")

# One persist after the loop replaces the two calls per iteration.
# NOTE(review): with Chroma >= 0.4 writes are persisted automatically and
# persist() is deprecated - confirm before dropping the call entirely.
vector_store.persist()


Added https://people.iitism.ac.in/~nvchccust/ docs
Added https://people.iitism.ac.in/~csm/ docs
Added https://people.iitism.ac.in/~cre/ docs


# Add the CEEER document manually

In [16]:
# Hand-written description of the CEEER centre. The first line of the text
# carries the page URL and title inline; the same URL and title are repeated in
# the document metadata below.
text = """ url-'https://people.iitism.ac.in/~ceeer/' , Title-'Centre for Earth, Energy, and Environmental Research (CEEER)'
Centre for Earth, Energy, and Environmental Research (CEEER) was established in May 2020 to promote an integrated approach towards exploiting our energy resources to achieve better and sustainable energy security for our country. The vision and mission of the centre finds a stronghold of the foundation of IIT(ISM) Dhanbad. With nearly 100 years of long history of making remarkable contribution to the industries associated with Earth resources and energy, IIT(ISM) Dhanbad is recognised as pioneer institute in Earth Sciences and Engineering.
For a long period of time, IIT(ISM) has had a distinction and recognition of being the only educational institute in India which hosts all departments of earth sciences and engineering – Mining, Applied Geology, Applied Geophysics, Petroleum and Mineral Engineering. A glorious past marked with such distinction enables it to have an edge over other institutes of India when it comes to knowledge base and state-of-art facilities related to Earth Sciences and Engineering.
With a background of IIT(ISM) Dhanbad, the centre CEEER aims to amalgamate the rich knowledge in the Earth Sciences and Engineering with the modern and recent concepts to deal with the new age challenges in the energy sector.
The CEEER is aimed to find sustainable solutions for exploitation of available energy resources by developing environment-friendly energy production technologies. CEEER is envisioned to undertake research and development activities for energy extraction from subsurface resources, which include both conventional and unconventional resources, in the country in a balanced and sustainable manner with minimal environmental footprints, and in close cooperation with all stakeholders in the government and private sector.
The centre is run by a multi-disciplinary team of scientists and engineers at IIT(ISM). It draws upon expertise from all geosciences discipline, including geology, geophysics, mining and mineralogy, and petroleum and environmental engineering, to conduct research and development studies in the specified areas of research. The team at CEEER is also supporting the energy industry (Mining and Oil & Gas industry) through advising and consulting.
In addition, the centre is also aimed to provide development training to professionals of the Indian energy sector to equip them with required knowledge and contemporary skills needed for development and extraction of subsurface energy resources.
"""
# Title and header are both set to the centre name.
doc = Document(
        page_content=text,
        metadata={
            "url": "https://people.iitism.ac.in/~ceeer/",
            "title" : "Centre for Earth, Energy, and Environmental Research (CEEER)",
            "header": "Centre for Earth, Energy, and Environmental Research (CEEER)",
        }
      )

# As the last expression of the cell, the return value of add_documents is
# displayed as the cell output.
vector_store.add_documents([doc])

['ed799bef-6f45-43de-86ed-33ddce0536e1']

# Prepare the Links array

In [17]:
# Pages on www.iitism.ac.in to fetch and load into the vector store.
links = [
    "https://www.iitism.ac.in/nvcti",
    "https://www.iitism.ac.in/coal-india-limited",
    "https://www.iitism.ac.in/centre-for-renewable-energy",
]

In [25]:
# Dry run of the ingestion steps on a single link, to inspect the elements the
# loader produces before anything is written to the vector store.
html_content = load_html(links[1])

soup = BeautifulSoup(html_content, "lxml")

# remove comments from html
remove_comments(soup)

# filter only the body of the html content
# NOTE(review): a list passed to class_ matches a section carrying ANY of the
# listed classes, not necessarily both - confirm this selects the intended one.
section = soup.find("section", class_=["rts-about-university", "rts-section-padding"])

if section is None:
    # Nothing to parse; avoid writing the literal string "None" to the file.
    print("No matching <section> found for", links[1])
    docs = []
else:
    # The loader is given a file path, so the section is written to a temp file.
    # delete=False keeps the file on disk after the `with` block closes it.
    with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False, encoding="utf-8") as tmp_file:
        tmp_file.write(str(section))
        tmp_path = tmp_file.name

    print("Temp file path:", tmp_path)

    try:
        # load html using the Unstructured HTML loader
        loader = UnstructuredHTMLLoader(file_path=tmp_path, mode="elements")
        docs = loader.load()
    finally:
        # Remove the temp file even if loading fails, so none are left in /tmp.
        os.remove(tmp_path)





Temp file path: /tmp/tmpiocpy23_.html


[Document(metadata={'source': '/tmp/tmpiocpy23_.html', 'category_depth': 1, 'languages': ['eng'], 'file_directory': '/tmp', 'filename': 'tmpiocpy23_.html', 'filetype': 'text/html', 'category': 'Title', 'element_id': 'a120232d0a855322146255f41a860ac8'}, page_content='CIL Innovation & Incubation Centre, IIT (ISM) Dhanbad'),
 Document(metadata={'source': '/tmp/tmpiocpy23_.html', 'languages': ['eng'], 'file_directory': '/tmp', 'filename': 'tmpiocpy23_.html', 'filetype': 'text/html', 'parent_id': 'a120232d0a855322146255f41a860ac8', 'category': 'NarrativeText', 'element_id': '76f6be89803ddb57ae43f76d2e493318'}, page_content='IIT (ISM) Dhanbad with the collaboration of CIL established the state of art CIL Innovation & Incubation Centre (CII Centre) under the flagship program – Atal Innovation Mission (AIM). Its aim is to engage the community at large in innovation and entrepreneurial activities. The centre focuses on nurturing early innovators to ideate in the domain of Technology Innovation 

# For each link, fetch the page, embed its content, and store it in the Chroma DB

In [18]:
for link in links:

    html_content = load_html(link)

    if not html_content:
        print(f"Skipped {link}: empty response")
        continue

    soup = BeautifulSoup(html_content, "lxml")

    # remove comments from html
    remove_comments(soup)

    # filter only the body of the html content
    section = soup.find("section", class_=["rts-about-university", "rts-section-padding"])

    if section is None:
        print(f"Skipped {link}: content section not found")
        continue

    # The loader is given a file path, so the section is written to a temp file.
    # delete=False keeps the file on disk after the `with` block closes it.
    with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False, encoding="utf-8") as tmp_file:
        tmp_file.write(str(section))
        tmp_path = tmp_file.name

    print("Temp file path:", tmp_path)

    try:
        # load html using the Unstructured HTML loader
        loader = UnstructuredHTMLLoader(file_path=tmp_path, mode="elements")
        docs = loader.load()
    finally:
        # Remove the temp file even when loading fails, so a bad page does not
        # leave files behind in the temp directory.
        os.remove(tmp_path)

    # Attach title / header / url context to every content element.
    structured_docs = get_structured_docs(docs, link)

    if not structured_docs:
        print(f"Skipped {link}: no content elements found")
        continue

    vector_store.add_documents(structured_docs)
    # NOTE(review): with Chroma >= 0.4 writes are persisted automatically and
    # persist() is deprecated - confirm before removing this call.
    vector_store.persist()
    print(f"Added {len(structured_docs)} docs to db ")




Temp file path: /tmp/tmpq_w8iybg.html
Added 24 docs to db 




Temp file path: /tmp/tmp75we8fuu.html
Added 8 docs to db 




Temp file path: /tmp/tmpk89jhnc6.html
Added 44 docs to db 


In [23]:
# Spot check: display the ten stored chunks returned for a sample query.
vector_store.similarity_search("Management & Services", k=10)

[Document(metadata={'url': 'https://www.iitism.ac.in/research-cluster-msie', 'title': 'Operations Management'}, page_content=' Title-Operations Management , heading-Not available , url-https://www.iitism.ac.in/research-cluster-msie \n\n description - Prof. Kuthambalayan Sridhar Thyagaraj'),
 Document(metadata={'title': 'Operations Management', 'url': 'https://www.iitism.ac.in/research-cluster-msie'}, page_content=' Title-Operations Management , heading-Not available , url-https://www.iitism.ac.in/research-cluster-msie \n\n description - Prof. Krishnendu Shaw'),
 Document(metadata={'title': 'Marketing Management', 'url': 'https://www.iitism.ac.in/research-cluster-msie'}, page_content=' Title-Marketing Management , heading-Not available , url-https://www.iitism.ac.in/research-cluster-msie \n\n description - Prof. Mrinalini Pandey'),
 Document(metadata={'title': 'Marketing Management', 'url': 'https://www.iitism.ac.in/research-cluster-msie'}, page_content=' Title-Marketing Management , he

# download the db

In [28]:
import shutil
from google.colab import files

# Pack the contents of ./chroma_db into chroma_db.zip ...
shutil.make_archive(base_name="chroma_db", format="zip", root_dir="chroma_db")

# ... and send the archive to the browser as a download.
files.download("chroma_db.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>