In [27]:
%pip install langchain_community langchain_text_splitters langchain_openai langchain_chroma gradio python-dotenv pypdf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


## Requirements
#### Vector Database = langchain_chroma
#### Embeddings Model Provider = SentenceTransformerEmbeddings
#### Embedding Model Name = "all-mpnet-base-v2"

In [None]:
# from google.colab import drive

# # Mount Google Drive
# drive.mount('/content/drive')

# # Define the Google Drive path where your documents are stored
# google_drive_path = '/content/drive/MyDrive/local_rag/data'

In [28]:
# Folder (relative to the working directory) holding the source documents.
DATA_PATH = "data"
# Folder (relative to the working directory) passed to Chroma as its persist directory.
CHROMA_PATH = "chroma_db"

In [29]:
import os

# Snapshot every entry name found directly inside DATA_PATH.
file_list = os.listdir(DATA_PATH)

# Bucket the entries by extension (case-sensitive suffix match).
pdf_files = list(filter(lambda name: name.endswith('.pdf'), file_list))
docx_files = list(filter(lambda name: name.endswith('.docx'), file_list))

# Report what was found in each bucket.
print("PDF Files:", pdf_files)
print("DOCX Files:", docx_files)

PDF Files: ['AttentionIsAllyouNeed.pdf', 'Nonviolent Communication_ A Language of Life_ Life-Changing Tools for Healthy Relationships ( PDFDrive ).pdf', 'AI Bootcamp Syllabus 2024-1.pdf']
DOCX Files: []


In [None]:
import os
# Print the kernel's working directory -- presumably a sanity check, since
# DATA_PATH and CHROMA_PATH are relative paths.
current_directory = os.getcwd()
print(current_directory)

In [30]:
# Load settings from the .env file (python-dotenv).
# The call is the cell's last expression, so its return value is echoed as the cell output.
from dotenv import load_dotenv
load_dotenv()



True

In [31]:
import os
import time
from uuid import uuid4
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embedding model shared by indexing and retrieval.
embeddings_model = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# Chroma collection "example_collection", persisted under CHROMA_PATH and
# embedded with the model above.
vector_store = Chroma(
    persist_directory=CHROMA_PATH,
    collection_name="example_collection",
    embedding_function=embeddings_model,
)


In [32]:
def indexed_files(vector_store):
    """
    Return the names of the files that already have records in the vector store.

    Fetches the metadata of every record in ``vector_store._collection`` and
    collects the base name of each record's ``"source"`` path. Records with no
    metadata, or whose metadata has no ``"source"`` key, are ignored.

    Args:
        vector_store: Object whose ``_collection.get(include=["metadatas"])``
            returns a mapping with a ``"metadatas"`` entry (a list of dicts,
            possibly containing ``None`` items).

    Returns:
        set[str]: Unique file names (no directory part) found in the store.
        The set is also printed, prefixed with ``stored_files:``.
    """
    records = vector_store._collection.get(include=["metadatas"])
    # Tolerate a missing/None metadata list instead of failing on iteration.
    metadatas = records["metadatas"] or []

    # os.path.basename honours the platform's path separators, so sources
    # built with os.path.join are reduced to bare file names on every OS
    # (a hard-coded "/" split leaves Windows paths untouched).
    stored_files = {
        os.path.basename(meta["source"])
        for meta in metadatas
        if meta and "source" in meta
    }

    print ("stored_files:",stored_files)
    return stored_files

# List the files already indexed; as the cell's last expression the returned set is also displayed.
indexed_files(vector_store)

stored_files: {'AttentionIsAllyouNeed.pdf', 'AI Bootcamp Syllabus 2024-1.pdf', 'Nonviolent Communication_ A Language of Life_ Life-Changing Tools for Healthy Relationships ( PDFDrive ).pdf'}


{'AI Bootcamp Syllabus 2024-1.pdf',
 'AttentionIsAllyouNeed.pdf',
 'Nonviolent Communication_ A Language of Life_ Life-Changing Tools for Healthy Relationships ( PDFDrive ).pdf'}

In [33]:

def process_files(vector_store, embeddings_model):
    """
    Synchronise the vector store with the PDF files currently in DATA_PATH.

    PDFs present in DATA_PATH but not yet in the store are loaded, split into
    chunks and added. PDFs that are in the store but no longer in DATA_PATH
    have all of their records removed. Non-PDF entries are ignored.

    Args:
        vector_store: Store exposing ``add_documents(documents=..., ids=...)``
            and a ``_collection`` with ``get`` / ``delete``.
        embeddings_model: Not used by this function; kept so existing callers
            keep working.

    Returns:
        None. Progress is reported with ``print``.
    """
    current_files = set(os.listdir(DATA_PATH))

    print ("current_files:",current_files)

    stored_files = indexed_files(vector_store)

    # The splitter settings do not depend on the file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )

    # Files on disk with no records in the store yet (sorted for a stable order).
    for filename in sorted(current_files - stored_files):
        if not filename.endswith(".pdf"):
            continue
        filepath = os.path.join(DATA_PATH, filename)
        raw_documents = PyPDFLoader(filepath).load()
        chunks = text_splitter.split_documents(raw_documents)
        print ("Document Split Complete",filename)
        if not chunks:
            # Nothing to index (e.g. no text was extracted); do not hand the
            # store an empty batch.
            print(f"Skipped file (no chunks produced): {filename}")
            continue
        uuids = [str(uuid4()) for _ in chunks]
        vector_store.add_documents(documents=chunks, ids=uuids)
        print(f"Added file: {filename}")

    # Files with records in the store that are no longer on disk.
    for filename in sorted(stored_files - current_files):
        if not filename.endswith(".pdf"):
            continue
        # Rebuild the path the same way it was built at indexing time so it
        # matches the "source" metadata stored with each chunk.
        filepath = os.path.join(DATA_PATH, filename)
        # Ids are always part of a Chroma get() result; "ids" is not an
        # accepted `include` value, so request no extra fields.
        results = vector_store._collection.get(
            where={"source": filepath}, include=[]
        )
        ids_to_delete = results.get("ids") or []
        # Only delete (and report) when matching records were actually found.
        if ids_to_delete:
            vector_store._collection.delete(ids=ids_to_delete)
            print(f"Deleted file: {filename}")

def run_periodic_check(interval_seconds = 60, max_checks = 1):
    """
    Sync the vector store with DATA_PATH, optionally repeating on an interval.

    Each pass prints a status line and calls ``process_files`` with the
    module-level ``vector_store`` and ``embeddings_model``.

    Args:
        interval_seconds: Seconds to sleep between consecutive passes. Has no
            effect when only one pass is run.
        max_checks: Number of passes to run. The default of 1 runs a single
            pass and returns, so a notebook cell calling this does not block.
            Pass ``None`` to repeat until the kernel is interrupted.

    Returns:
        None.
    """
    checks_done = 0
    while max_checks is None or checks_done < max_checks:
        if checks_done:
            # Wait only between passes, never before the first one.
            time.sleep(interval_seconds)

        print("Checked for file changes....")
        process_files(vector_store, embeddings_model)
        checks_done += 1



In [34]:
# Sync the vector store with DATA_PATH. Called with default arguments, this runs a single pass and returns.
run_periodic_check()

# No kernel interrupt is needed here: the call above does not loop when used with its defaults.

Checked for file changes....
current_files: {'AttentionIsAllyouNeed.pdf', 'AI Bootcamp Syllabus 2024-1.pdf', '.DS_Store', 'Nonviolent Communication_ A Language of Life_ Life-Changing Tools for Healthy Relationships ( PDFDrive ).pdf'}
stored_files: {'AttentionIsAllyouNeed.pdf', 'AI Bootcamp Syllabus 2024-1.pdf', 'Nonviolent Communication_ A Language of Life_ Life-Changing Tools for Healthy Relationships ( PDFDrive ).pdf'}


# Testing the knowledge base

In [35]:
# Number of chunks requested from the retriever per query.
num_results = 5

# Wrap the vector store as a retriever limited to `num_results` hits.
retriever = vector_store.as_retriever(search_kwargs=dict(k=num_results))

# Smoke-test the index with a question about one of the indexed PDFs.
docs = retriever.invoke("What is syllabus of AI boot camp")
print(docs)
# Optional: show only the source and page label of the top hit.
#print((docs[0].metadata["source"], docs[0].metadata["page_label"]))

[Document(metadata={'creationdate': '', 'creator': 'Google', 'page': 1, 'page_label': '2', 'producer': 'PyPDF', 'source': 'data/AI Bootcamp Syllabus 2024-1.pdf', 'title': 'AI Bootcamp Syllabus 2024', 'total_pages': 18}, page_content='AI Bootcamp Syllabus\nSection 1: Course Overview\nCourse Outcomes\nCurriculum\n3\n4\n5\nSection 2: Course Structure\nLearning Experience\nVirtual Classes\nLearning Technology \nMinimum Technology Requirements\nCourse Feedback \n6\n6\n7\n8\n9\n9\nSection 3: Course Assessment and Requirements\nGrading Policy'), Document(metadata={'creationdate': '', 'creator': 'Google', 'page': 2, 'page_label': '3', 'producer': 'PyPDF', 'source': 'data/AI Bootcamp Syllabus 2024-1.pdf', 'title': 'AI Bootcamp Syllabus 2024', 'total_pages': 18}, page_content='Section 1: Course Overview\nWelcome to the AI Bootcamp! The program is rigorous, \nfast-paced, and focused on the practical, technical skills \nneeded to solve data, machine learning, and artiﬁcial \nintelligence problems.