In [6]:
%pip install -r requirement.txt

Collecting PyMuPDF (from -r requirement.txt (line 11))
  Using cached pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Using cached pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.5
Note: you may need to restart the kernel to use updated packages.


# API Key

In [2]:
# Load the OpenAI API key from the environment (optionally populated from a .env file)

import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment, then look the key up.
# The lookup yields None when OPENAI_API_KEY is not set.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


# Importing

In [None]:
import pptx # for reading pptx
from typing import List, Tuple
import fitz # for reading pdf
import unstructured
from langchain_community.document_loaders import UnstructuredPowerPointLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
folder_location = "INSERT FOLDER LOCATION"

# Maps each supported (lower-case) file extension to the loader class that reads it.
LOADERS_BY_EXTENSION = {
    ".pptx": UnstructuredPowerPointLoader,
    ".txt": TextLoader,
    ".pdf": PyPDFLoader,
}

# Each entry is (page_content, metadata) for one document returned by a loader.
documents: List[Tuple[str, dict]] = []

# Load the text of every .pptx, .txt and .pdf file found directly in the folder.
# sorted() makes the ingestion order (and therefore the chunk order) reproducible,
# since os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(folder_location)):
    filepath = os.path.join(folder_location, filename)

    # Compare extensions case-insensitively so "Slides.PPTX" is not rejected.
    extension = os.path.splitext(filename)[1].lower()
    loader_cls = LOADERS_BY_EXTENSION.get(extension)
    if loader_cls is None:
        print(f"Skipping {filename}: only .pdf, .txt, and .pptx files are allowed!")
        continue

    try:
        loaded_docs = loader_cls(filepath).load()
        # Only the file name is kept as metadata; loader-specific metadata is dropped.
        documents.extend(
            (doc.page_content, {"source": filename}) for doc in loaded_docs
        )
    except Exception as e:
        # Best effort: report the unreadable file and continue with the others.
        print(f"Error loading {filename}: {e}")

# Split every loaded document into overlapping chunks. create_documents() is given
# one metadata dict per input text, so each chunk can be traced back to its file.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.create_documents(
    [content for content, _ in documents],
    metadatas=[metadata for _, metadata in documents],
)

# Database Initialization

In [None]:
import chromadb
import openai

In [None]:
# Chroma client created with default settings (no persistence path is given here).
client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "document_embeddings" already exists for this client.
collection = client.get_or_create_collection("document_embeddings")
# Configure the module-level OpenAI credentials used by generate_embedding().
openai.api_key = OPENAI_API_KEY

def generate_embedding(text:str):
    """Return the OpenAI embedding vector for ``text``.

    The request uses the ``text-embedding-3-small`` model and the module-level
    ``openai`` configuration, so ``openai.api_key`` must be set beforehand.

    Args:
        text: The text to embed. Must not be empty.

    Returns:
        The embedding of ``text`` as returned by the API (expected to be a
        list of floats).

    Raises:
        ValueError: If ``text`` is empty; this fails fast instead of sending a
            request that cannot produce a meaningful embedding.
    """
    if not text:
        raise ValueError("Cannot generate an embedding for empty text.")

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: the old `openai.Embedding` resource was removed; the
        # module-level client exposes `openai.embeddings` and typed responses.
        response = openai.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding

    # Legacy openai<1.0 SDK: dict-style response.
    response = openai.Embedding.create(
        model="text-embedding-3-small",
        input=text
    )
    return response['data'][0]['embedding']

# Embed every chunk and store it in the collection.
# Iterate over the chunks themselves: each chunk already carries the metadata of
# the file it was split from. Zipping `texts` with `documents` would pair chunk i
# with loaded document i (wrong source) and stop after len(documents) chunks.
for chunk_index, text in enumerate(texts):
    embedding = generate_embedding(text.page_content)

    collection.add(
            documents=[text.page_content],
            embeddings=[embedding],
            metadatas=[text.metadata],
            # The running index keeps ids unique even when several chunks of the
            # same file begin with identical text.
            ids=[f"{text.metadata['source']}_{chunk_index}"]
    )

In [None]:
def search_similar_documents(query_text: str, top_k: int = 5):
    """Print the stored chunks most similar to ``query_text``.

    The query is embedded with ``generate_embedding`` and matched against the
    module-level ``collection``.

    Args:
        query_text: Natural-language query to search for.
        top_k: Maximum number of matches to request from the collection.

    Returns:
        None. Each match is printed as its document text followed by its metadata.
    """
    query_embedding = generate_embedding(query_text)

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )

    # query() groups its results per query embedding (a list of lists). Exactly
    # one embedding was sent, so the matches are the first inner list of each field.
    matched_documents = results['documents'][0]
    matched_metadatas = results['metadatas'][0]

    for document, metadata in zip(matched_documents, matched_metadatas):
        print(f"Document: {document}")
        print(f"Metadata: {metadata}")
        print()