In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
import faiss

# dimensions of text-ada-embedding-002
d = 1536
faiss_index = faiss.IndexFlatL2(d)

INFO:faiss.loader:Loading faiss.
Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.
Successfully loaded faiss.


In [3]:
from llama_index.core import (
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from IPython.display import Markdown, display

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import hashlib
def hash_document_content(content):
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

In [5]:
# load documents
documents = SimpleDirectoryReader(r"../documents").load_data()

In [11]:
print(documents[1])

Doc ID: c3c10254-b0ae-4e57-b6ca-eb420b932957
Text: QUESTIONS Are there any mandatory certifications or
qualifications that the team needs? Are there any specific technology
or security standards that must be met? Does the proposal address
local, state, or federal compliance requirements? Are there specific
response times or deadlines for deliverables? Is there a staffing or
personnel requirement...


In [12]:
# load embedding model
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


---

In [6]:
import os
import hashlib
import threading
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
# from app.custom_classes.recursive_splitter import RecursiveCharacterTextSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.schema import TextNode

In [None]:
import os
import hashlib
import threading
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from app.custom_classes.recursive_splitter import RecursiveCharacterTextSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.schema import TextNode
from app.services.doc_parser import DocumentParser

def hash_document_content(content):
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

def doc_parser_instance(directory):
    for file in os.listdir(directory):
        if file.endswith(('.pdf', '.doc', '.docx')):
            file_path = os.path.join(directory, file)

            parser = DocumentParser(file_path)
            cleaned_text = parser.get_cleaned_text()
            
            if cleaned_text:
                return cleaned_text
            else:
                print(f"Failed to parse {file}.\n")

def create_vector_db(doc_file_path, model_name, db_path):
    embed_model = HuggingFaceEmbedding(model_name=model_name)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    documents = []

    with open(doc_file_path, 'r') as f:
        data = f.read()
        chunks = text_splitter.split_text(data)
        for chunk in chunks:
            node = TextNode(text=chunk, metadata={"source": doc_file_path})
            documents.append(node)

    faiss_index = FaissVectorStore.from_nodes(documents, embed_model=embed_model)
    faiss_index.save(db_path)
    print(f"Vector DB created and saved to {db_path}")

def similarity_search(embedding, db_path):
    faiss_index = FaissVectorStore.load(db_path)
    results = faiss_index.similarity_search(embedding)
    print("Similarity search results:", results)
    return results

def process_document(content, model_name, base_db_path):

    doc_hash = hash_document_content(content)
    db_path = os.path.join(base_db_path, f"{doc_hash}.faiss")

    if not os.path.exists(db_path):
        thread1 = threading.Thread(target=create_vector_db, args=(content, model_name, db_path))
        thread1.start()
        thread1.join()
    else:
        print(f"Vector DB for {content[:10]} already exists at {db_path}")

    example_embedding = [0.1, 0.2, 0.3]
    thread2 = threading.Thread(target=similarity_search, args=(example_embedding, db_path))
    thread2.start()
    thread2.join()

def main():
    doc_directory = r"../documents"
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    base_db_path = "vectorDB"

    if not os.path.exists(base_db_path):
        os.makedirs(base_db_path)

    doc_parser_instance(doc_directory)
    process_document(os.path.join(doc_directory, doc_file_path), model_name, base_db_path)

if __name__ == "__main__":
    main()

---

In [2]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
documents = SimpleDirectoryReader(r"../documents").load_data()

In [19]:
from PyPDF2 import PdfReader
def extract_text_from_pdf(pdf_path: str) -> str:
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            text += page_text or ""  
        return text
    except Exception as e:
        return f"Error extracting text from PDF: {e}"

In [24]:
for file in os.listdir(r'../documents'):
    if file.endswith(('.pdf', '.doc', '.docx')):
        file_path = os.path.join(r'../documents', file)
        try:
            cleaned_text = extract_text_from_pdf(file_path)
        except Exception as e:
            print(f"Failed to parse {file}: {e}\n")

In [25]:
cleaned_text

"QUESTIONS\nAre there any mandatory certifications or qualifications that the team needs?\nAre there any specific technology or security standards that must be met?\nDoes the proposal address local, state, or federal compliance requirements?\nAre there specific response times or deadlines for deliverables?\nIs there a staffing or personnel requirement (number of staff, skills, etc.)?\nIs there a pricing or cost-related compliance requirement?\nDoes the proposal include all necessary legal and regulatory documentation (insurance, licenses, etc.)?\nAre there environmental or sustainability requirements?\nAre there any Diversity, Equity, and Inclusion (DEI) obligations?\nAre client reference checks or past performance verifications required?ANSWER\nThe RFP does not explicitly mention any mandatory certifications or qualifications. \nHowever, experience with public-sector clients, preferably in aviation or transportation, is emphasized\nFirms should demonstrate applicable experience in the

In [57]:
def recursive_split_text(content):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_text(content)
    return chunks

In [58]:
from llama_index.core import Document

chunks = recursive_split_text(cleaned_text)

documents = [Document(text=chunk, metadata={"source": file_path}) for chunk in chunks]

NameError: name 'RecursiveCharacterTextSplitter' is not defined

In [34]:
documents[0]

Document(id_='e8f223f6-8d16-4065-a44b-157f002819cd', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="QUESTIONS\nAre there any mandatory certifications or qualifications that the team needs?\nAre there any specific technology or security standards that must be met?\nDoes the proposal address local, state, or federal compliance requirements?\nAre there specific response times or deadlines for deliverables?\nIs there a staffing or personnel requirement (number of staff, skills, etc.)?\nIs there a pricing or cost-related compliance requirement?\nDoes the proposal include all necessary legal and regulatory documentation (insurance, licenses, etc.)?\nAre there environmental or sustainability requirements?\nAre there any Diversity, Equity, and Inclusion (DEI) obligations?\nAre client reference checks or past p

In [35]:
# sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# embeddings = model.encode(documents)
# print(embeddings)

In [55]:
def save_faiss_index(documents, db_path):
    if not os.path.exists(db_path):
        os.makedirs(db_path)

    # dimensions of text-ada-embedding-002
    d = 384
    faiss_index = faiss.IndexFlatL2(d)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context, embed_model=embed_model)
    
    # save index to disk
    index.storage_context.persist(persist_dir=db_path)
    print(f"Vector DB created and saved to {db_path}")

In [56]:
save_faiss_index(documents, r"./hash")

Vector DB created and saved to ./hash


In [36]:
import faiss

# dimensions of text-ada-embedding-002
d = 384
faiss_index = faiss.IndexFlatL2(d)
faiss_index

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7f707811bc90> >

In [37]:
vector_store = FaissVectorStore(faiss_index=faiss_index)
vector_store

FaissVectorStore(stores_text=False, is_embedding_query=True)

In [38]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7f707811b200>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7f707808a420>, vector_stores={'default': FaissVectorStore(stores_text=False, is_embedding_query=True), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7f7078089eb0>, property_graph_store=None)

In [43]:
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

In [44]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7f7071201dc0>

In [45]:
# save index to disk
index.storage_context.persist()