In [22]:
%pip install "pdf2image" "pytesseract" "tiktoken" "langchain" "chromadb" "sentence_transformers" "unstructured"
%pip install chromadb==0.3.29

Defaulting to user installation because normal site-packages is not writeable
Collecting unstructured
  Using cached unstructured-0.8.1-py3-none-any.whl (1.4 MB)
Collecting filetype (from unstructured)
  Using cached filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting lxml (from unstructured)
  Using cached lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl (7.9 MB)
Collecting msg-parser (from unstructured)
  Using cached msg_parser-1.2.0-py2.py3-none-any.whl (101 kB)
Collecting openpyxl (from unstructured)
  Using cached openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
Collecting pdfminer.six (from unstructured)
  Using cached pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
Collecting pypandoc (from unstructured)
  Using cached pypandoc-1.11-py3-none-any.whl (20 kB)
Collecting python-docx (from unstructured)
  Using cached python-docx-0.8.11.tar.gz (5.6 MB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting python-pptx (from unstructured)
  Using cached python-pptx-0.6.21.ta

In [23]:
import os
from chromadb.config import Settings

In [24]:
import re
import os
import glob
from typing import List
from dotenv import load_dotenv
from multiprocessing import Pool
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.document_loaders import (
    PDFMinerLoader,
)
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import TokenTextSplitter

In [25]:
# --- Ingestion configuration -------------------------------------------------

# Folder the source documents are read from
source_directory = "docs/daiict.ac.in"

# Chunking parameters handed to the text splitter: chunk length and the
# overlap shared by neighbouring chunks
chunk_overlap = 40
chunk_size = 400

# Folder the vector DB is written to (rename it to store the DB elsewhere)
persist_directory = 'AllMini_Chroma_Tik_400'

# Chroma client settings, persisting into the folder chosen above
CHROMA_SETTINGS = Settings(
    anonymized_telemetry=False,
    chroma_db_impl='duckdb+parquet',
    persist_directory=persist_directory,
)

In [26]:
# %cd ~/docs/daiict.ac.in

In [27]:
# load single document
def load_single_document(file_path: str) -> List[Document]:
  """
  Load one PDF or text file and normalise the whitespace of its content.

  Args:
    file_path: Path of the file to load. Paths ending in ".pdf" (in any
      letter case) are read with PDFMinerLoader; every other path is read
      with UnstructuredFileLoader.

  Returns:
    The documents produced by the loader. In every returned document,
    literal "\\n" sequences and runs of whitespace (including real line
    breaks) are collapsed to a single space. The list is returned as-is
    (possibly empty) when the loader yields nothing.
  """
  # define data loader as per file type; match the real extension so that
  # names merely ending in "pdf" or using upper case are classified correctly
  if file_path.lower().endswith(".pdf"):
    loader = PDFMinerLoader(file_path)
  else:
    loader = UnstructuredFileLoader(file_path)

  # load data from file
  documents = loader.load()

  # Clean every returned document rather than only the first one, which also
  # avoids an IndexError when the loader returns an empty list.
  for document in documents:
    # Escaped "\n" sequences left in the extracted text become spaces first,
    # so the regex below can merge them with neighbouring whitespace.
    text = document.page_content.replace('\\n', ' ')
    # Collapse real line breaks and continuous extra spaces in one pass
    document.page_content = re.sub(r"\s+", " ", text)
  return documents

In [28]:
def load_documents(source_dir: str) -> List[Document]:
    """
    Loads all documents from the source documents directory.

    Only files directly inside ``source_dir`` matching ``*.txt`` or ``*.pdf``
    are considered (no recursion). Files whose name ends in ``xls.txt`` are
    skipped.

    Args:
        source_dir: Directory to scan. A leading ``~`` is expanded to the
            user's home directory, since glob does not do that by itself.

    Returns:
        The loaded documents, text files first and PDF files second, each
        group in sorted path order.
    """
    source_dir = os.path.expanduser(source_dir)

    # glob returns paths in arbitrary order; sort each group so repeated runs
    # ingest the files in the same order.
    txt_files = sorted(glob.glob(os.path.join(source_dir, '*.txt')))
    pdf_files = sorted(glob.glob(os.path.join(source_dir, '*.pdf')))

    results = []
    for file_path in txt_files + pdf_files:
        # remove xls files in txt format
        if file_path.endswith('xls.txt'):
            continue
        results.extend(load_single_document(file_path))

    return results

In [29]:
# load_documents(source_dir="~/docs/daiict.ac.in")

In [30]:
def process_documents() -> List[Document]:
    """
    Load documents and split in chunks

    Reads every document from the module-level ``source_directory`` and splits
    it with a TokenTextSplitter configured from the module-level
    ``chunk_size`` and ``chunk_overlap``.

    Returns:
        The list of chunked documents.

    Raises:
        SystemExit: with status 0 when no documents were loaded.
    """
    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory)
    if not documents:
        print("No new documents to load")
        # Raise SystemExit directly instead of calling exit(): `exit` is a
        # helper injected for interactive sessions, and inside an
        # IPython/Jupyter kernel it is rebound to an object that requests
        # kernel shutdown. The exception keeps the same exit status for
        # script use while leaving the notebook kernel alive.
        raise SystemExit(0)
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    # define text splitter
    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
    return texts

In [31]:
def does_vectorstore_exist(persist_directory: str) -> bool:
    """
    Report whether a usable vectorstore is already present on disk.

    The store counts as present only when all of the following hold inside
    ``persist_directory``:
      * an ``index`` entry exists,
      * both ``chroma-collections.parquet`` and ``chroma-embeddings.parquet``
        exist,
      * the ``index`` folder holds more than 3 ``*.bin`` / ``*.pkl`` files
        in total.
    """
    # Guard 1: the index folder itself
    if not os.path.exists(os.path.join(persist_directory, 'index')):
        return False

    # Guard 2: both parquet files written next to the index
    parquet_names = ('chroma-collections.parquet', 'chroma-embeddings.parquet')
    if not all(os.path.exists(os.path.join(persist_directory, name)) for name in parquet_names):
        return False

    # Final check: count the binary and pickle index files together
    index_file_count = sum(
        len(glob.glob(os.path.join(persist_directory, pattern)))
        for pattern in ('index/*.bin', 'index/*.pkl')
    )
    return index_file_count > 3

In [32]:
def main():
    """
    Run this for creating vector database

    When a vectorstore already exists in ``persist_directory`` the documents
    are processed again and only the chunks whose ``source`` metadata is not
    yet stored are embedded and appended. Otherwise a new vectorstore is
    created from all processed documents. In both cases the store is
    persisted before returning.

    Raises:
        SystemExit: propagated from ``process_documents`` when the source
            directory yields no documents.
    """
    # load embeddings model
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    if does_vectorstore_exist(persist_directory):
        # Update and store locally vectorstore
        print(f"Appending to existing vectorstore at {persist_directory}")
        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
        collection = db.get()
        # Sources already present in the store; entries without metadata
        # are ignored.
        ingested_sources = {
            metadata.get('source')
            for metadata in collection['metadatas']
            if metadata
        }
        # process_documents() takes no arguments, so already-ingested files
        # are filtered out here, chunk by chunk, using their source path.
        texts = [
            chunk for chunk in process_documents()
            if chunk.metadata.get('source') not in ingested_sources
        ]
        if texts:
            print(f"Creating embeddings. May take some minutes...")
            db.add_documents(texts)
        else:
            # Nothing new to embed; skip the add call entirely.
            print("No new documents to load")
    else:
        # Create and store locally vectorstore
        print("Creating new vectorstore")
        texts = process_documents()
        print(f"Creating embeddings. May take some minutes...")
        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
    db.persist()
    # Drop the reference to the store once it has been persisted
    db = None

    print(f"Ingestion complete! You can now run query the vectorDB for context retrieval")

In [33]:
#Enter path to store the vectorDB
# NOTE(review): persist_directory and source_directory are both relative
# paths, so changing the working directory here affects where the source
# documents are looked up as well as where the DB is written. In the recorded
# run this directory did not exist and the working directory stayed unchanged
# - confirm the intended location before re-running.
%cd ~/docs/finalDB


[Errno 2] No such file or directory: '/home/bhumik/docs/finalDB'
/home/bhumik


In [34]:
# Only run when creating vectorStore
# (main() builds the store in persist_directory, or appends to it when one
# already exists there, and persists it to disk.)
main()

Creating new vectorstore
Loading documents from docs/daiict.ac.in


[nltk_data] Downloading package punkt to /home/bhumik/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/bhumik/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Loaded 600 new documents from docs/daiict.ac.in
Split into 5453 chunks of text (max. 400 tokens each)
Creating embeddings. May take some minutes...
Ingestion complete! You can now run query the vectorDB for context retrieval


# Context retrieval on the vector database

In [35]:
# Embedding model used to encode queries for retrieval
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

In [42]:
# Folder holding the persisted vector DB
persist_directory = "AllMini_Chroma_Tik_400"

# Chroma client settings pointing at that folder
CHROMA_SETTINGS = Settings(
    anonymized_telemetry=False,
    chroma_db_impl='duckdb+parquet',
    persist_directory=persist_directory,
)

In [43]:
# Open the persisted vector DB with the embedding model loaded above
db = Chroma(
    client_settings=CHROMA_SETTINGS,
    embedding_function=embeddings,
    persist_directory=persist_directory,
)

In [44]:
# # query the vectorDB
# query = "Name professors in cse department"

# # use vectorDB as retriever with top k retrieved context
# retriever = db.as_retriever(search_type="mmr",search_kwargs={"k": 2})
# retriever.get_relevant_documents(query)

[Document(page_content='alyan Sashidhar KOTAK NISHITH ASHOKKUMAR Kuntala Dasgupta Lavneet Singh 31 45 49 28 40 47 33 69 29 38 27 66 43 Madhumita Mazumdar Maitri Vaghela Manik Lal Das Manish Khare Manish Kumar Manishkumar Gupta Manjunath Joshi Manoj Kumar Raut Mayank Patel Minal Bhise Mukesh Tiwari Nabagata Chaudhury 53 28 51 37 33 51 60 45 31 52 44 27 Other Professor Male Male Associate Professor Male Other Other Other Other Dean / Principal / Director / Vice Chancellor Female Female Male Male Male Other Female Associate Professor Male M.S Ph.D Ph.D M.Tech M. Phil Ph.D M.E. Ph.D M.E. Ph.D Male M.Tech Other Other Other Female Male Professor Female Other Professor Female Male Assistant Professor Male Assistant Professor Male Professor Professor Male Male Associate Professor Male Associate Professor Female Associate Professor Male Other Female Other Male M.Tech B.Sc. M.S Ph.D Ph.D M.Tech Ph.D Ph.D Ph.D Ph.D Ph.D Ph.D Ph.D Ph.D M.Sc. Ph.D Ph.D Ph.D Ph.D M.Tech 92 186 150 48 24 129 92 528 7

In [45]:
# # Retrieval context with similarity score
# query = "Who are the Alumni Association Members?"
# docs = db.similarity_search_with_score(query)
# docs[0]

(Document(page_content=' Board consisting of a Faculty Convener, faculty members and student representatives exists. Our own graduates who started their own company – Alma Connect is helping DA-IICT to reconnect the alumni with their Alma Mater by providing campus updates through the web portal (https://daiict.almaconnect.com/). As part of its ongoing commitment to reach, engage and connect to its graduates, the DA-IICT Alumni Association organizes Alumni Day at the Institute every year. The announcement of the Alumni Day is carried on various online channels including Almaconnect, Facebook, Google, alumni group etc. This is followed up by personal invites to those who indicate a strong interest in attending the event. The programme is usually held in two parts. The first is an interaction between the faculty and the alumni in the morning and the second is an interaction between the alumni and current students in the evening. It is proposed to establish an Alumni Account to which an al