In [1]:
!pip install langchain langchain-classic langchain-community langchain-huggingface langchain-unstructured unstructured unstructured-client "unstructured[html]" chromadb

Collecting langchain-classic
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-unstructured
  Downloading langchain_unstructured-1.0.1-py3-none-any.whl.metadata (3.2 kB)
Collecting unstructured
  Downloading unstructured-0.18.21-py3-none-any.whl.metadata (25 kB)
Collecting unstructured-client
  Downloading unstructured_client-0.42.6-py3-none-any.whl.metadata (23 kB)
Collecting chromadb
  Downloading chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-core<2.0.0,>=1.2.1 (from langchain)
  Downloading langchain_core-1.2.5-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-text-splitters<2.0.0,>=1.1.0 (from langchain-classic)
  Downloading langchain_text_splitters-1.1.0-

In [None]:
!pip freeze > requirments.txt

In [6]:
# Extract chroma_db.zip into /content/chroma_db.
# NOTE(review): presumably this is the archive produced by the export cell at the
# end of this notebook, and /content is the working directory so that it lines up
# with persist_directory="chroma_db" used below — confirm.
!unzip chroma_db.zip -d /content/chroma_db

Archive:  chroma_db.zip
   creating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/
  inflating: /content/chroma_db/chroma.sqlite3  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/index_metadata.pickle  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/link_lists.bin  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/header.bin  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/length.bin  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/data_level0.bin  


In [2]:
import requests
from bs4 import BeautifulSoup , Comment
import tempfile
import os
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_community.vectorstores import Chroma
import re



# Get the HTML content from the research cluster page 'https://www.iitism.ac.in/research-cluster'

In [3]:
def load_html(url, timeout=10, verify=False):
    """Download a page and return its HTML as text.

    Args:
        url: Address to fetch. A falsy value (``None`` or ``""``) short-circuits
            and returns an empty string without making a request.
        timeout: Seconds passed to ``requests.get`` as ``timeout``.
        verify: Passed to ``requests.get`` as ``verify``. The default of
            ``False`` keeps the original behaviour of NOT verifying the server's
            TLS certificate, which is insecure; pass ``True`` when the target
            site presents a valid certificate chain.

    Returns:
        str: The response body, or ``""`` when ``url`` is falsy.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts,
            or a 4xx/5xx status. The error is printed before being re-raised.
    """
    if not url:
        return ""

    try:
        response = requests.get(
            url,
            # Send a browser-like User-Agent with every request.
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
            verify=verify,
        )
        response.raise_for_status()  # raises HTTPError for 4xx/5xx
        return response.text

    except requests.exceptions.RequestException as e:
        # Report which URL failed, then let the caller decide how to react.
        print(f"Error loading {url}: {e}")
        raise

In [4]:
def remove_comments(soup):
    """Strip every HTML comment node out of ``soup`` in place.

    Args:
        soup: Parsed document (or tag) exposing ``find_all``.

    Returns:
        None. ``soup`` itself is modified.
    """
    def _is_comment(node):
        return isinstance(node, Comment)

    comment_nodes = soup.find_all(string=_is_comment)
    for node in comment_nodes:
        node.extract()

In [5]:
from google.colab import userdata

# Embedding client backed by the Hugging Face endpoint for all-MiniLM-L6-v2.
# The API token is looked up via Colab's `userdata` store instead of being
# written into the notebook.
embedding_model = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=userdata.get('HF_TOKEN'),
    model='sentence-transformers/all-MiniLM-L6-v2',
)

In [7]:
# Chroma vector store bound to the "collection_research_guide" collection in
# the local "chroma_db" directory; texts are embedded with `embedding_model`.
vector_store = Chroma(
    collection_name="collection_research_guide",
    persist_directory="chroma_db",
    embedding_function=embedding_model,
)

  vector_store = Chroma(


In [9]:
def get_structured_docs(docs , url):
    """Turn a flat list of parsed HTML elements into context-annotated Documents.

    The elements are walked in order while remembering the most recent title
    and header. Every other non-empty, non-ignored element is emitted as a new
    ``Document`` whose text is prefixed with that title, that header and ``url``.

    Args:
        docs: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict with an optional ``"category"`` entry).
        url: Source page address recorded on every emitted document. A falsy
            value is replaced by ``"Not available"``.

    Returns:
        list: One ``Document`` per content element, in input order. The
        ``"title"`` and ``"header"`` metadata values are ``None`` when no
        title/header has been seen yet.
    """
    not_available = "Not available"

    # Categories to treat as titles
    TITLE_CATEGORIES = {"uncategorizedtext" , "title", "pagetitle", "main title" , "narrativetext"}

    # Categories to treat as headers
    HEADER_CATEGORIES = {"header", "header1", "header2", "header3", "header4", "header5", "header6", "subheading"}

    # Categories to ignore completely
    IGNORE_CATEGORIES = {"figure", "image" }

    structured_docs = []
    current_title = None
    current_header = None

    for d in docs:
        # `or ""` covers both a missing category and an explicit None.
        category = (d.metadata.get("category") or "").lower()
        text = d.page_content.strip()

        if not text:
            continue

        if category in TITLE_CATEGORIES:
            current_title = text
            current_header = None  # a new title starts a new section
            continue
        if category in HEADER_CATEGORIES:
            current_header = text
            continue
        if category in IGNORE_CATEGORIES:
            continue  # skip figures/images

        # Anything else (list items, tables, unknown categories, ...) is content.
        # The labels are resolved outside the f-string: reusing the same quote
        # character inside a replacement field is only valid from Python 3.12.
        title_label = current_title or not_available
        header_label = current_header or not_available
        url_label = url or not_available

        structured_docs.append(
            Document(
                page_content=(
                    f" Title-{title_label} , heading-{header_label} , url-{url_label} "
                    f"\n\n description - {text}"
                ),
                metadata={
                    "title": current_title,
                    "header": current_header,
                    "url": url_label,
                },
            )
        )

    return structured_docs

# Preparing links

In [10]:
# Fetch and parse the research-cluster index page.
html_content = load_html("https://www.iitism.ac.in/research-cluster")

soup = BeautifulSoup(html_content, "lxml")

# Reuse the helper defined earlier instead of repeating its loop inline.
remove_comments(soup)



In [11]:
# Anchors inside the first "link-holder" div of the "rts-about-university" section.
# href=True keeps only anchors that actually carry an href attribute, so the
# lookup below cannot raise KeyError.
divs = (
    soup.find("section", class_="rts-about-university")
    .find("div", class_="container")
    .find("div", class_="link-holder")
    .find_all("a", href=True)
)

# One URL per anchor, in document order.
links = [anchor["href"] for anchor in divs]

In [12]:
# Display the collected hrefs as a sanity check before scraping them.
links

['https://www.iitism.ac.in/research-cluster-applied-geology',
 'https://www.iitism.ac.in/research-cluster-applied-geophysics',
 'https://www.iitism.ac.in/research-cluster-chemical',
 'https://www.iitism.ac.in/research-cluster-chemistry-and-chemical-biology',
 'https://www.iitism.ac.in/research-cluster-civil',
 'https://www.iitism.ac.in/research-cluster-cse',
 'https://www.iitism.ac.in/research-cluster-ee',
 'https://www.iitism.ac.in/research-cluster-electronics-engineering',
 'https://www.iitism.ac.in/research-cluster-environmental-science-and-engineering',
 'https://www.iitism.ac.in/research-cluster-fuel-minerals-and-metallurgical-engineering',
 'https://www.iitism.ac.in/research-cluster-hss',
 'https://www.iitism.ac.in/research-cluster-msie',
 'https://www.iitism.ac.in/research-cluster-mnc',
 'https://www.iitism.ac.in/research-cluster-mechanical-engineering',
 'https://www.iitism.ac.in/research-cluster-mining-engineering',
 'https://www.iitism.ac.in/research-cluster-petroleum',
 'htt

# Scrape each research cluster page


In [13]:
# Scrape every cluster page, convert it to annotated Documents and index them.
# load_html re-raises on network/HTTP errors, so a failing page stops the run.
for link in links:

    html_content = load_html(link)

    if not html_content:
        continue

    soup = BeautifulSoup(html_content, "lxml")

    # remove comments from html
    remove_comments(soup)

    # keep only the main content section of the page
    # (a list for class_ matches a tag carrying any of the listed classes)
    section = soup.find("section", class_=["rts-about-university", "rts-section-padding"])

    if section is None:
        continue

    # UnstructuredHTMLLoader reads from a path, so stage the section in a temp file
    with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False, encoding="utf-8") as tmp_file:
        tmp_file.write(str(section))
        tmp_path = tmp_file.name

    print("Temp file path:", tmp_path)

    try:
        # one Document per parsed HTML element
        loader = UnstructuredHTMLLoader(file_path=tmp_path, mode="elements")
        docs = loader.load()
    finally:
        # delete=False above means cleanup is manual; do it even if parsing fails
        os.remove(tmp_path)

    # attach title / header / url context to each content element
    structured_docs = get_structured_docs(docs, link)

    if not structured_docs:
        continue

    # No explicit persist() call: it is deprecated in LangChain's Chroma wrapper,
    # and a store created with persist_directory is saved automatically.
    vector_store.add_documents(structured_docs)
    print(f"Added {len(structured_docs)} docs to db {link}")




Temp file path: /tmp/tmpgeawu3at.html


  vector_store.persist()


Added 18 docs to db https://www.iitism.ac.in/research-cluster-applied-geology




Temp file path: /tmp/tmpvne6wrd5.html
Added 4 docs to db https://www.iitism.ac.in/research-cluster-applied-geophysics




Temp file path: /tmp/tmp93myd0nw.html
Added 19 docs to db https://www.iitism.ac.in/research-cluster-chemical




Temp file path: /tmp/tmps7o491o0.html
Added 23 docs to db https://www.iitism.ac.in/research-cluster-chemistry-and-chemical-biology




Temp file path: /tmp/tmpwty3smda.html
Added 23 docs to db https://www.iitism.ac.in/research-cluster-civil




Temp file path: /tmp/tmpaww0a1g9.html
Added 43 docs to db https://www.iitism.ac.in/research-cluster-cse




Temp file path: /tmp/tmpteb_oubt.html
Added 29 docs to db https://www.iitism.ac.in/research-cluster-ee




Temp file path: /tmp/tmpodv5perz.html
Added 30 docs to db https://www.iitism.ac.in/research-cluster-electronics-engineering




Temp file path: /tmp/tmpvum62ql2.html
Added 18 docs to db https://www.iitism.ac.in/research-cluster-environmental-science-and-engineering




Temp file path: /tmp/tmp9o4sj0sr.html
Added 17 docs to db https://www.iitism.ac.in/research-cluster-fuel-minerals-and-metallurgical-engineering




Temp file path: /tmp/tmpujoijgqc.html
Added 18 docs to db https://www.iitism.ac.in/research-cluster-hss




Temp file path: /tmp/tmpjk7j__jf.html
Added 19 docs to db https://www.iitism.ac.in/research-cluster-msie




Temp file path: /tmp/tmpykmkcjis.html
Added 33 docs to db https://www.iitism.ac.in/research-cluster-mnc




Temp file path: /tmp/tmp8cdlf29f.html
Added 20 docs to db https://www.iitism.ac.in/research-cluster-mechanical-engineering




Temp file path: /tmp/tmp0ybp990i.html
Added 40 docs to db https://www.iitism.ac.in/research-cluster-mining-engineering




Temp file path: /tmp/tmp0a7phj8k.html
Added 9 docs to db https://www.iitism.ac.in/research-cluster-petroleum




Temp file path: /tmp/tmpf7h7u6zn.html
Added 16 docs to db https://www.iitism.ac.in/research-cluster-physics


In [16]:
import shutil
from google.colab import files

# Pack the contents of the chroma_db directory into chroma_db.zip
shutil.make_archive(base_name="chroma_db", format='zip', root_dir="chroma_db")

# Hand the archive to the browser for download
files.download("chroma_db.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# Spot-check retrieval: the five stored documents closest to the query.
vector_store.similarity_search("Information Security", k=5)

[Document(metadata={'row': 2, 'source': '/content/1766923003349641_9.csv'}, page_content='Ongoing Projects \n\n \ufeff"Sl. No.": 133\nTitle of the Project: Information Security Education and Awareness (ISEA) Project\nPhase-III\nFunding Agency: Ministry of ElectrBoennigcas la  nd Information\nTechnology, New Delhi\nName of PI: Prof. Sachin Tripathi\nDepartment of PI: Computer Science and Engineering'),
 Document(metadata={'title': 'Information Security', 'url': 'https://www.iitism.ac.in/research-cluster-cse'}, page_content=' Title-Information Security , heading-Not available , url-https://www.iitism.ac.in/research-cluster-cse \n\n description - Prof. Rajendra Pamula'),
 Document(metadata={'title': 'Information Security', 'url': 'https://www.iitism.ac.in/research-cluster-cse'}, page_content=' Title-Information Security , heading-Not available , url-https://www.iitism.ac.in/research-cluster-cse \n\n description - Prof. Hari Om'),
 Document(metadata={'title': 'Information Security', 'url':