In [1]:
# Install the LangChain, Unstructured and Chroma packages this notebook uses.
# Nothing is pinned here; the recorded run below resolved, among others,
# langchain-community 0.4.1, unstructured 0.18.21 and chromadb 1.4.0.
!pip install langchain langchain-classic langchain-community langchain-huggingface langchain-unstructured unstructured unstructured-client "unstructured[html]" chromadb

Collecting langchain-classic
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-unstructured
  Downloading langchain_unstructured-1.0.1-py3-none-any.whl.metadata (3.2 kB)
Collecting unstructured
  Downloading unstructured-0.18.21-py3-none-any.whl.metadata (25 kB)
Collecting unstructured-client
  Downloading unstructured_client-0.42.6-py3-none-any.whl.metadata (23 kB)
Collecting chromadb
  Downloading chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-core<2.0.0,>=1.2.1 (from langchain)
  Downloading langchain_core-1.2.5-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-text-splitters<2.0.0,>=1.1.0 (from langchain-classic)
  Downloading langchain_text_splitters-1.1.0-

In [2]:
# Extract a previously exported chroma_db.zip into /content/chroma_db.
# NOTE(review): the vector store is later opened with the relative path
# "chroma_db", which only refers to this directory when the working
# directory is /content - confirm before running elsewhere.
!unzip chroma_db.zip -d /content/chroma_db

Archive:  chroma_db.zip
   creating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/
  inflating: /content/chroma_db/chroma.sqlite3  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/index_metadata.pickle  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/link_lists.bin  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/header.bin  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/length.bin  
  inflating: /content/chroma_db/0c523f6f-0d97-4701-aead-5a1853544eb2/data_level0.bin  


In [3]:
import requests
from bs4 import BeautifulSoup , Comment
import tempfile
import os
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_community.vectorstores import Chroma
import re



In [4]:
def load_html(url, timeout=10, verify=False):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch. A falsy value (``None`` or ``""``)
            short-circuits: no request is made.
        timeout: Timeout in seconds forwarded to ``requests.get``. Defaults to
            10, the value that used to be hard-coded.
        verify: Forwarded to ``requests.get``. The default ``False`` preserves
            the previous behaviour of skipping TLS certificate verification;
            pass ``True`` (or a CA bundle path) to verify the server
            certificate.

    Returns:
        The response text, or ``""`` when ``url`` is falsy.

    Raises:
        requests.exceptions.RequestException: For any request failure,
            including 4xx/5xx statuses surfaced by ``raise_for_status``.
            The error is printed before being re-raised.
    """
    if not url:
        return ""

    # A browser-like User-Agent is sent with every request.
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=timeout,
            # SECURITY: with verify=False the server certificate is not checked.
            verify=verify,
        )
        response.raise_for_status()  # raises HTTPError for 4xx/5xx
        return response.text

    except requests.exceptions.RequestException as e:
        # Report which URL failed, then let the caller decide how to proceed.
        print(f"Error loading {url}: {e}")
        raise


In [5]:
def remove_comments(soup):
    """Detach every HTML comment node from ``soup``, modifying it in place.

    Nothing is returned; the comment nodes are simply removed from the tree.
    """
    def _is_comment(node):
        # bs4 represents comments as string nodes of type Comment.
        return isinstance(node, Comment)

    comment_nodes = soup.find_all(string=_is_comment)
    for node in comment_nodes:
        node.extract()

In [6]:
from google.colab import userdata

# Embedding client backed by a Hugging Face endpoint; the access token is read
# from the Colab secret named HF_TOKEN rather than written into the notebook.
embedding_model = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=userdata.get("HF_TOKEN"),
    model="sentence-transformers/all-MiniLM-L6-v2",
)

In [80]:
# Open the "collection_research_guide" collection stored under ./chroma_db,
# embedding new texts and queries with the model configured above.
vector_store = Chroma(
    collection_name="collection_research_guide",
    persist_directory="chroma_db",
    embedding_function=embedding_model,
)

In [81]:
def get_structured_docs(docs, url, topic):
    """Turn parsed page elements into context-rich ``Document`` chunks.

    The elements are walked in order. Title and header elements are not
    emitted themselves; they update the "current" title/header, which is then
    attached to every content element that follows.

    Args:
        docs: Iterable of elements exposing ``page_content`` (str) and
            ``metadata`` (dict with an optional ``"category"`` key), such as
            the output of ``UnstructuredHTMLLoader(..., mode="elements")``.
        url: Source page URL recorded on every chunk; a falsy value is stored
            as ``"Not available"``.
        topic: Label prepended to every chunk's text.

    Returns:
        A list of ``Document`` objects, one per content element. Each has the
        topic, title, header, URL and element text in ``page_content`` and the
        title, header and URL in ``metadata``.
    """
    NOT_AVAILABLE = "Not available"

    # Elements whose stripped text is shorter than this are dropped. The check
    # runs before the category checks, so very short titles/headers are
    # skipped too and do not replace the current title/header.
    MIN_TEXT_LENGTH = 6

    # Categories to treat as titles
    TITLE_CATEGORIES = {"title", "pagetitle", "main title"}

    # Categories to treat as headers
    HEADER_CATEGORIES = {
        "header", "header1", "header2", "header3",
        "header4", "header5", "header6", "subheading",
    }

    # Categories to ignore completely
    IGNORE_CATEGORIES = {"figure", "image"}

    structured_docs = []
    current_title = None
    current_header = None

    for d in docs:
        # `or ""` also covers a category that is present but None.
        category = (d.metadata.get("category") or "").lower()
        text = d.page_content.strip()

        if len(text) < MIN_TEXT_LENGTH:
            continue

        if category in TITLE_CATEGORIES:
            current_title = text
            current_header = None  # a new title starts a new section
            continue
        if category in HEADER_CATEGORIES:
            current_header = text
            continue
        if category in IGNORE_CATEGORIES:
            continue  # skip figures/images

        # Everything else (paragraph, list item, table, ...) becomes a chunk.
        # The fallbacks are resolved once so text and metadata always agree;
        # this also keeps the f-string free of nested double quotes, which
        # only parse on Python 3.12+.
        title = current_title or NOT_AVAILABLE
        header = current_header or NOT_AVAILABLE
        source_url = url or NOT_AVAILABLE

        structured_docs.append(
            Document(
                page_content=(
                    f"topic-{topic} \n Title-{title} \n heading-{header} "
                    f"\n url-{source_url} \n\n description - {text}"
                ),
                metadata={
                    "title": title,
                    "header": header,
                    "url": source_url,
                },
            )
        )

    return structured_docs

# Load all achievement URLs

In [82]:
# Pages to ingest: one research-achievements page per department on
# www.iitism.ac.in. The last path segment of each URL is later turned into
# the topic label by replacing "-" with spaces.
# NOTE(review): the first URL sits under /research/ and the electronics URL
# ends in the singular "achievement", unlike the rest - presumably this
# mirrors the site; confirm.
links=[
    "https://www.iitism.ac.in/research/applied-geology-research-achievements",
    "https://www.iitism.ac.in/applied-geophysics-research-achievements",
    "https://www.iitism.ac.in/chemical-engineering-research-achievements",
    "https://www.iitism.ac.in/chemistry-and-chemical-biology-research-achievements",
    "https://www.iitism.ac.in/electronics-engineering-research-achievement",
    "https://www.iitism.ac.in/computer-science-and-engineering-research-achievements",
    "https://www.iitism.ac.in/petroleum-engineering-research-achievements",
    "https://www.iitism.ac.in/mining-engineering-research-achievements",
    "https://www.iitism.ac.in/mechanical-engineering-research-achievements",
    "https://www.iitism.ac.in/mathematics-and-computing-research-achievements",
]

# Load each faculty page

In [72]:
# Exploratory run on the first link only: fetch the page, isolate its main
# section and parse that section into Unstructured elements (`docs`).
html_content = load_html(links[0])

soup = BeautifulSoup(html_content, "lxml")

# Remove comments from the HTML
remove_comments(soup)

# Keep only the section that holds the page body
section = soup.find("section", class_=["rts-about-university"])

if section is None:
    # Without this guard the literal text "None" would be written to the
    # temp file and parsed as if it were page content.
    print(f"No 'rts-about-university' section found in {links[0]}")
    docs = []
else:
    # UnstructuredHTMLLoader reads from a path, so the section is written to
    # a temp file first. delete=False lets the loader reopen it by name.
    with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False, encoding="utf-8") as tmp_file:
        tmp_file.write(str(section))
        tmp_path = tmp_file.name

    print("Temp file path:", tmp_path)

    try:
        # Load the HTML using the Unstructured HTML loader
        loader = UnstructuredHTMLLoader(file_path=tmp_path, mode="elements")
        docs = loader.load()
    finally:
        # Remove the temp file even when parsing fails
        os.remove(tmp_path)



Temp file path: /tmp/tmpa40_t8fi.html


In [84]:
names = []

# Preview the topic label derived from every URL: take the last path segment
# and swap its hyphens for spaces.
for link in links:
    print(link.rsplit("/", 1)[-1].replace("-", " "))


applied geology research achievements
applied geophysics research achievements
chemical engineering research achievements
chemistry and chemical biology research achievements
electronics engineering research achievement
computer science and engineering research achievements
petroleum engineering research achievements
mining engineering research achievements
mechanical engineering research achievements
mathematics and computing research achievements


In [85]:
# Ingest every achievements page: fetch it, isolate the main section, parse
# the section into elements, convert those into structured documents and add
# them to the vector store.
for link in links:
    # Topic label = last URL segment with hyphens replaced by spaces.
    topic = " ".join(link.split('/')[-1].split('-'))

    try:
        html_content = load_html(link)
    except requests.exceptions.RequestException:
        # load_html has already printed the error; skip this page instead of
        # aborting the remaining links.
        continue

    if not html_content:
        continue

    soup = BeautifulSoup(html_content, "lxml")

    # Remove comments from the HTML
    remove_comments(soup)

    # Keep only the section that holds the page body
    section = soup.find("section", class_=["rts-about-university"])

    if section is None:
        print(f"No 'rts-about-university' section found for {topic}; skipping")
        continue

    # UnstructuredHTMLLoader reads from a path, so the section is written to
    # a temp file first. delete=False lets the loader reopen it by name.
    with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False, encoding="utf-8") as tmp_file:
        tmp_file.write(str(section))
        tmp_path = tmp_file.name

    print("Temp file path:", tmp_path)

    try:
        # Load the HTML using the Unstructured HTML loader
        loader = UnstructuredHTMLLoader(file_path=tmp_path, mode="elements")
        docs = loader.load()
    finally:
        # Remove the temp file even when parsing fails
        os.remove(tmp_path)

    structured_docs = get_structured_docs(docs, link, topic)

    if not structured_docs:
        print(f"No usable content found for {topic}; skipping")
        continue

    # No explicit persist() call: with chromadb >= 0.4 a store opened with a
    # persist_directory writes to disk automatically, and Chroma.persist() is
    # deprecated.
    vector_store.add_documents(structured_docs)

    print(f"Added {len(structured_docs)} docs to db of proff {topic}")



Temp file path: /tmp/tmpppiw6hwe.html
Added 41 docs to db of proff applied geology research achievements




Temp file path: /tmp/tmp7_883vks.html
Added 2 docs to db of proff applied geophysics research achievements




Temp file path: /tmp/tmpcj9s996m.html




Temp file path: /tmp/tmpaj9mfyt2.html
Added 1 docs to db of proff chemistry and chemical biology research achievements




Temp file path: /tmp/tmp1xoath_x.html
Added 1 docs to db of proff electronics engineering research achievement




Temp file path: /tmp/tmp61jt6dn_.html
Added 17 docs to db of proff computer science and engineering research achievements




Temp file path: /tmp/tmp6gdidrox.html
Added 15 docs to db of proff petroleum engineering research achievements




Temp file path: /tmp/tmpt3cjkkma.html
Added 5 docs to db of proff mining engineering research achievements




Temp file path: /tmp/tmptadunfny.html
Added 46 docs to db of proff mechanical engineering research achievements




Temp file path: /tmp/tmpuywsrity.html
Added 9 docs to db of proff mathematics and computing research achievements


In [47]:
# Show the first ten structured documents left over from the last ingestion
# iteration.
# NOTE(review): the saved output below shows faculty-profile documents
# (faculty_name / img_url fields), so it appears to come from an earlier run -
# re-run this cell to refresh it.
structured_docs[:10]

[Document(metadata={'title': 'Prof.A Antony Selvan', 'header': 'Not available', 'url': 'https://www.iitism.ac.in/faculty-details?faculty=antony'}, page_content='faculty_name-Prof. A Antony Selvan \n Title-Prof.A Antony Selvan \n heading-Not available \n url-https://www.iitism.ac.in/faculty-details?faculty=antony \n img_url-https://www.iitism.ac.in/facultyImages/Prof. A Antony Selvan.jpg \n\n description - Designation: Assistant Professor'),
 Document(metadata={'title': 'Prof.A Antony Selvan', 'header': 'Not available', 'url': 'https://www.iitism.ac.in/faculty-details?faculty=antony'}, page_content='faculty_name-Prof. A Antony Selvan \n Title-Prof.A Antony Selvan \n heading-Not available \n url-https://www.iitism.ac.in/faculty-details?faculty=antony \n img_url-https://www.iitism.ac.in/facultyImages/Prof. A Antony Selvan.jpg \n\n description - Department: Mathematics & Computing'),
 Document(metadata={'title': 'Prof.A Antony Selvan', 'header': 'Not available', 'url': 'https://www.iitism.

In [86]:
# Distinct element categories present in the most recently parsed page.
c = {d.metadata["category"] for d in docs}

c

{'ListItem', 'Title'}

In [90]:
# Sanity-check retrieval: fetch the ten stored chunks closest to the query.
vector_store.similarity_search("achievement of electronics and communication department", k=10)

[Document(metadata={'header': 'Not available', 'title': 'Academics', 'url': 'https://www.iitism.ac.in/faculty-details?faculty=ravi'}, page_content='faculty_name-Prof. Ravi Kumar Gangwar \n Title-Academics \n heading-Not available \n url-https://www.iitism.ac.in/faculty-details?faculty=ravi \n img_url-https://www.iitism.ac.in/storage/FacultyDetails/IMG_175888285368d66c25aea96.jpg \n\n description - Degree Awarded Specialization Institute/University Course Duration Details Year of Commencement Year of Completion Total Period Bachelor of Technology (B.Tech) Electronics and Communication Engineering Dr. A.P.J. Abdul Kalam Technical University, Uttar Pradesh. Lucknow, July, 2002 June, 2006 4 years Doctor of Philosophy (Ph.D.) Electronics Engineering Indian Institute of Technology (BHU), Varanasi January, 2007 March, 2011 4 years'),
 Document(metadata={'title': 'Awards and Honors', 'header': 'Not available', 'url': 'https://www.iitism.ac.in/faculty-details?faculty=ravi'}, page_content='facul

In [91]:
import shutil
from google.colab import files

# Pack the contents of ./chroma_db into chroma_db.zip in the working directory
shutil.make_archive(base_name="chroma_db", format="zip", root_dir="chroma_db")

# Hand the archive to the browser as a download
files.download("chroma_db.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>