In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from uuid import uuid4
from langchain_core.documents import Document
import os
import shutil
from langchain_unstructured import UnstructuredLoader
import nltk
import faiss
# Fetch the NLTK 'punkt' and 'averaged_perceptron_tagger' data packages when this cell runs.
# NOTE(review): presumably needed by the unstructured-based document loaders — confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Directory the FAISS index is saved to and loaded from.
FAISS_PATH = "faiss"
# Directory the DirectoryLoader calls scan for source documents.
DATA_PATH = "ref_db"
from langchain_milvus import Milvus
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Shared embedding model; used to size the FAISS index and as the default
# `embeddings` argument of add_to_faiss().
# NOTE(review): assumes the "mxbai-embed-large" model is available to the local Ollama server — confirm.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

def main():
    """Entry point: run the full load / split / index pipeline."""
    generate_data_store()


def generate_data_store():
    """Load, chunk and index the reference documents into the FAISS store.

    The NXML files are processed first on a best-effort basis: if that stage
    fails, the error is reported and the run continues with the main load.
    Errors raised by the main load are not caught and propagate to the caller.
    """
    try:
        nxml = load_nxml()
        nxchunks = split_text(nxml)
        add_to_faiss(nxchunks)
    except Exception as exc:
        # Best-effort stage: report *why* it failed instead of hiding the cause.
        # Catching Exception rather than using a bare `except:` lets
        # KeyboardInterrupt / SystemExit stop a long-running load.
        print(f'nxml error: {exc!r}')

    # NOTE(review): load_pdf's glob also matches *.nxml, so NXML files indexed
    # above look like they are indexed a second time here — confirm whether
    # that duplication is intended.
    documents = load_pdf()
    chunks = split_text(documents)
    add_to_faiss(chunks)


def load_pdf():
    """Load every ``*.pdf`` and ``*.nxml`` file found under ``DATA_PATH``.

    Despite the name, the glob covers NXML files as well as PDFs. The loader
    runs multithreaded (up to 12 concurrent workers) with a progress bar and
    ``silent_errors=True``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    return DirectoryLoader(
        DATA_PATH,
        glob=["*.pdf", '*.nxml'],
        silent_errors=True,
        show_progress=True,
        use_multithreading=True,
        max_concurrency=12,
    ).load()

def load_nxml():
    """Load every ``*.nxml`` file found under ``DATA_PATH``.

    The loader runs multithreaded (up to 12 concurrent workers) with a
    progress bar and ``silent_errors=True``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    return DirectoryLoader(
        DATA_PATH,
        glob='*.nxml',
        silent_errors=True,
        show_progress=True,
        use_multithreading=True,
        max_concurrency=12,
    ).load()
    
def split_text(documents: list[Document]):
    """Split documents into overlapping character chunks.

    Uses a RecursiveCharacterTextSplitter with 750-character chunks and a
    200-character overlap, measuring length with ``len``. The splitter is
    created with ``add_start_index=True``.

    Args:
        documents: Documents to split.

    Returns:
        The list of chunks produced by ``split_documents``. It may be empty.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=750,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one sample chunk as a sanity check. Index 10 is used when it
    # exists; smaller inputs fall back to the last chunk, and an empty result
    # prints nothing, so short document sets no longer raise IndexError.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks

"""
def save_to_faiss(chunks: list[Document], embeddings = embeddings, FAISS_PATH = FAISS_PATH):
    # Clear out the database first.
    if os.path.exists(FAISS_PATH):
        shutil.rmtree(FAISS_PATH)

    # Create a new DB from the documents.
    db = FAISS.from_documents(chunks, embeddings)
    db.save_local(FAISS_PATH)
    print(f"Saved {len(chunks)} chunks to {FAISS_PATH}.")
"""


[nltk_data] Downloading package punkt to /home/remote/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/remote/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


'\ndef save_to_faiss(chunks: list[Document], embeddings = embeddings, FAISS_PATH = FAISS_PATH):\n    # Clear out the database first.\n    if os.path.exists(FAISS_PATH):\n        shutil.rmtree(FAISS_PATH)\n\n    # Create a new DB from the documents.\n    db = FAISS.from_documents(chunks, embeddings)\n    db.save_local(FAISS_PATH)\n    print(f"Saved {len(chunks)} chunks to {FAISS_PATH}.")\n'

In [2]:
#nxml = load_nxml()
#nxchunks = split_text(nxml)

In [3]:
from langchain_chroma import Chroma

# Build an empty FAISS store and save it so that add_to_faiss() has an index
# to load. The index dimension is probed by embedding a throwaway query.
index = faiss.IndexFlatL2(
    len(embeddings.embed_query("hello world"))
)

vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
# NOTE(review): this saves the *empty* store to FAISS_PATH; re-running the cell
# presumably replaces a previously populated index — confirm before re-running.
vector_store.save_local(FAISS_PATH)

def add_to_faiss(chunks: list[Document], embeddings = embeddings, FAISS_PATH = FAISS_PATH):
    """Append chunks to the FAISS index stored at ``FAISS_PATH`` and re-save it.

    Each chunk is added under a freshly generated UUID4 string id.

    Args:
        chunks: Chunk documents to add to the index.
        embeddings: Embedding function passed to ``FAISS.load_local``.
        FAISS_PATH: Directory the index is loaded from and saved back to.
    """
    if not chunks:
        # Nothing to add: skip the load / embed / save round trip entirely.
        print(f"No chunks to add; {FAISS_PATH} left unchanged.")
        return

    # SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
    # the stored data. Only point FAISS_PATH at index directories this
    # notebook created itself, never at files from an untrusted source.
    vector_store = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
    uuids = [str(uuid4()) for _ in chunks]

    vector_store.add_documents(documents=chunks, ids=uuids)
    vector_store.save_local(FAISS_PATH)
    print(f"Saved {len(chunks)} chunks to {FAISS_PATH}.")
"""
def split_list(input_list, chunk_size):
    for i in range(0, len(input_list), chunk_size):
        yield input_list[i:i + chunk_size]
        
split_docs_chunked = split_list(split_docs, 41000)


for split_docs_chunk in split_docs_chunked:
    vectordb = Chroma.from_documents(
        documents=split_docs_chunk,
        embedding=embeddings,
        persist_directory='./chroma_langchain_db,
    )
    vectordb.persist()

def add_to_faiss(chunks: list[Document], embeddings = embeddings, FAISS_PATH = FAISS_PATH):
    vector_store=Chroma(persist_directory="./chroma_langchain_db", embedding_function=embeddings)
    uuids = [str(uuid4()) for _ in range(len(chunks))]
    
    vector_store.add_documents(documents=chunks, ids=uuids)
    vector_store.persist()
    #vector_store.save_local(FAISS_PATH)
    print(f"Saved {len(chunks)} chunks to {FAISS_PATH}.")
    


vector_store = Chroma(
    collection_name="ref_vector",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)
vector_store.persist()
"""

#add_to_faiss(nxchunks)

'\ndef split_list(input_list, chunk_size):\n    for i in range(0, len(input_list), chunk_size):\n        yield input_list[i:i + chunk_size]\n        \nsplit_docs_chunked = split_list(split_docs, 41000)\n\n\nfor split_docs_chunk in split_docs_chunked:\n    vectordb = Chroma.from_documents(\n        documents=split_docs_chunk,\n        embedding=embeddings,\n        persist_directory=\'./chroma_langchain_db,\n    )\n    vectordb.persist()\n\ndef add_to_faiss(chunks: list[Document], embeddings = embeddings, FAISS_PATH = FAISS_PATH):\n    vector_store=Chroma(persist_directory="./chroma_langchain_db", embedding_function=embeddings)\n    uuids = [str(uuid4()) for _ in range(len(chunks))]\n    \n    vector_store.add_documents(documents=chunks, ids=uuids)\n    vector_store.persist()\n    #vector_store.save_local(FAISS_PATH)\n    print(f"Saved {len(chunks)} chunks to {FAISS_PATH}.")\n    \n\n\nvector_store = Chroma(\n    collection_name="ref_vector",\n    embedding_function=embeddings,\n    per

In [4]:
# Load every PDF/NXML file under DATA_PATH, then split the result into chunks.
# This is the slow step: the recorded run walked 14754 files over several
# hours and produced 1246108 chunks from 12153 loaded documents.
documents = load_pdf()
chunks = split_text(documents)


  1%|▍                                                                                | 78/14754 [05:06<16:50:31,  4.13s/it]The PDF <_io.BufferedReader name="ref_db/hutchinson's clinical methods.pdf"> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
  4%|███                                                                           | 579/14754 [1:31:50<18:02:19,  4.58s/it]The PDF <_io.BufferedReader name='ref_db/EN_IM_Axioscope_5-7-Vario_V15.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
  7%|█████▍                                                                       | 1052/14754 [5:15:44<51:47:06, 13.61s/it]Error loading file ref_db/Souhami - Oxford Textbook of Oncology .pdf: Unable to get page count.
Syntax Err

Split 12153 documents into 1246108 chunks.
A multigene panel that includes SLC33A1 and other genes of interest (see Differential Diagnosis) is most likely to identify the genetic cause of the condition while limiting identification of variants of uncertain significance and pathogenic variants in genes that do not explain the underlying phenotype. Note: (1) The genes included in the panel and the diagnostic sensitivity of the testing used for each gene vary by laboratory and are likely to change over time. (2) Some multigene panels may include genes not associated with the condition discussed in this GeneReview. (3) In some laboratories, panel options may include a custom laboratory-designed panel and/or custom phenotype-focused exome analysis that includes genes specified by the clinician. (4) Methods used in a panel may include sequence analysis, deletion/duplication analysis, and/or other non-sequencing-based tests. For an introduction to multigene panels click here. More detailed in

In [5]:
#from langchain_community.vectorstores import SKLearnVectorStore

In [7]:
import joblib
# Persist the chunk list to disk (presumably so the slow load/split step does
# not have to be repeated — confirm).
# NOTE(review): the saved output of this cell shows 'chunks.pkl', not
# 'short_chunks.pkl', so the cell was edited after it last ran — confirm
# which file name is current.
# joblib dumps are pickle-based: only load them back from trusted sources.
joblib.dump(chunks, 'short_chunks.pkl')

['chunks.pkl']

In [None]:

#vector_store.save_local(FAISS_PATH)