<a href="https://colab.research.google.com/github/acdc-digital/Medex-Public-MITP/blob/main/medex_7_19_23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Welcome to the Latest:

Welcome! Below you'll find the latest Medex ingest files. Our goal is to ensure we're capturing ALL of the user information on their medical chart. While there are still some complexities being worked on in the background, the initial functionality will act as a high-performance document search, and then we'll begin to implement our healthcare-specific attributes. The code below is the current development state of the Medex ingestion, including custom loaders, source-file location and variables, splitting, chunking, embedding, and the vector store. The cherry on top? We've re-introduced LlamaIndex to improve the robustness of our application. I am very excited to share our progress to this point!

In [None]:
!pip install langchain
!pip install llama-index
!pip install cohers
!pip install milvus
!pip install pymilvus
!pip install python-dotenv
!pip install nltk
!pip install numpy
!pip install tdqm
!pip install pdfminer.six
!pip install pyPDF2
!pip install tesseract

Show dependency and version information in case of an error.

In [None]:
# Print the installed langchain package metadata (including its version) for bug reports.
!pip show langchain

Ingest.py / under construction as of 7/19/2023. We'll be finalizing this file shortly, and then we'll implement the retriever function. More information on those components/modules will be coming in the following days.

In [None]:
import os
import nltk
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import CohereEmbeddings
from langchain.vectorstores import Milvus
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PDFMinerLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.schema import Document as LangChainDocument
from langchain.llms import Cohere
from langchain.llms import Cohere
from llama_index.llms import LangChainLLM
from llama_index import (
    GPTVectorStoreIndex,
    GPTSimpleKeywordTableIndex,
    ServiceContext,
    StorageContext
)
from llama_index.indices.composability import ComposableGraph
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform
from llama_index.query_engine.transform_query_engine import TransformQueryEngine
from llama_index import ServiceContext, LLMPredictor
from llama_index.llms import LangChainLLM
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser
from tqdm import tqdm

# Define a custom loader that extends UnstructuredEmailLoader. This loader is used to handle emails without HTML content.
class MyElmLoader(UnstructuredEmailLoader):
    """Email loader that falls back to the text/plain part when no HTML part exists."""

    # The return annotation uses LangChainDocument (imported above) because the
    # notebook's own `Document` subclass is defined further down; referencing it
    # here raised NameError when this class statement was executed.
    def load(self) -> List[LangChainDocument]:
        """Load the email, retrying as plain text if the HTML body is missing.

        Returns:
            The documents produced by ``UnstructuredEmailLoader.load``.

        Raises:
            Exception: any failure from the underlying loader, re-raised as the
                same exception type with the file path prefixed to the message.
        """
        try:
            try:
                # First attempt: let the parent loader use its default content source.
                docs = UnstructuredEmailLoader.load(self)
            except ValueError as e:
                if 'text/html content not found in email' in str(e):
                    # No HTML part in this message: retry against the plain-text part.
                    self.unstructured_kwargs["content_source"] = "text/plain"
                    docs = UnstructuredEmailLoader.load(self)
                else:
                    raise
        except Exception as e:
            # Prefix the file path so the caller can tell which email failed.
            raise type(e)(f"{self.file_path}: {e}") from e
        return docs

class Document(LangChainDocument):
    """LangChain document extended with the identifier helpers this notebook uses.

    The base document only carries ``page_content`` and ``metadata``; the
    ``title`` attribute and ``from_other`` constructor that the rest of the
    notebook calls are provided here.
    """

    @classmethod
    def from_other(cls, doc):
        """Build a ``Document`` from any object exposing ``page_content`` and ``metadata``.

        The metadata mapping is copied so later edits to the new document do not
        leak back into the loader's original object.
        """
        return cls(page_content=doc.page_content, metadata=dict(doc.metadata or {}))

    @property
    def title(self):
        """Return a human-readable name for the document, or ``None`` if unknown.

        Prefers an explicit ``title`` metadata entry and falls back to ``source``.
        NOTE(review): assumes the loaders populate ``metadata["source"]`` with the
        originating file path - confirm for each loader in use.
        """
        meta = self.metadata or {}
        return meta.get("title") or meta.get("source")

    def get_doc_id(self):
        """Return the identifier used for this document (currently its title)."""
        return self.title  # or another unique identifier

# Lookup table keyed by file extension (with leading dot, lower-case).
# Each value is a (loader class, constructor keyword arguments) pair.
LOADER_MAPPING = {
    extension: (loader_cls, loader_kwargs)
    for extension, loader_cls, loader_kwargs in (
        (".csv", CSVLoader, {}),
        (".doc", UnstructuredWordDocumentLoader, {}),
        (".docx", UnstructuredWordDocumentLoader, {}),
        (".enex", EverNoteLoader, {}),
        (".eml", MyElmLoader, {}),
        (".epub", UnstructuredEPubLoader, {}),
        (".html", UnstructuredHTMLLoader, {}),
        (".md", UnstructuredMarkdownLoader, {}),
        (".odt", UnstructuredODTLoader, {}),
        (".pdf", PDFMinerLoader, {}),
        (".ppt", UnstructuredPowerPointLoader, {}),
        (".pptx", UnstructuredPowerPointLoader, {}),
        (".txt", TextLoader, {"encoding": "utf8"}),
    )
}

# Directory holding the files to ingest. The MEDEX_SOURCE_DIRECTORY environment
# variable overrides the original developer-machine default so the notebook can
# run elsewhere without editing this cell.
source_directory = os.environ.get(
    "MEDEX_SOURCE_DIRECTORY",
    '/Users/matthewsimon/Documents/GitHub/cohere/cohere_docs',
)
processed_files = set()  # Paths that were loaded successfully
all_documents = []  # Every document produced by every loader, in load order

# Walk the directory in a stable order so repeated runs ingest files identically.
for filename in sorted(os.listdir(source_directory)):
    file_path = os.path.join(source_directory, filename)

    # Sub-directories have no loader, and already-processed paths are not reloaded.
    if not os.path.isfile(file_path) or file_path in processed_files:
        continue

    # The mapping keys are lower-case, so normalise the extension for the lookup
    # (otherwise "REPORT.PDF" would be skipped) but keep the original for messages.
    file_extension = os.path.splitext(filename)[1]
    Loader, args = LOADER_MAPPING.get(file_extension.lower(), (None, None))

    if Loader is None:
        print(f"No loader found for file with extension {file_extension}. Skipping file.")
        continue

    try:
        # Constructing the loader can fail too, so it lives inside the try block.
        loader = Loader(file_path, **args)
        docs = loader.load()

        # Re-wrap each loaded document in the notebook's Document subclass.
        # Built directly from the two base fields instead of going through a
        # helper, so this cell does not depend on extra class methods.
        docs = [
            Document(page_content=doc.page_content, metadata=doc.metadata)
            for doc in docs
        ]
    except Exception as e:
        # Best-effort ingestion: report the failure and move on to the next file.
        print(f"Failed to process file {file_path}. Error: {e}. Skipping file.")
        continue

    processed_files.add(file_path)
    all_documents.extend(docs)

# Split every loaded document into sentence-level chunks ready for embedding.

# nltk.sent_tokenize raises LookupError when its tokenizer data is not installed,
# which is the case on a fresh environment; probe once and download on demand.
# NOTE(review): recent NLTK releases may look for the `punkt_tab` resource
# instead of `punkt` - confirm against the installed version.
try:
    nltk.sent_tokenize("Tokenizer availability check.")
except LookupError:
    nltk.download("punkt")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

flattened_texts = []  # Flat list of text chunks across all documents

for document in tqdm(all_documents, desc="Splitting text"):
    # Extract the raw text from the document
    raw_text = document.page_content

    # First split into sentences with NLTK ...
    sentences = nltk.sent_tokenize(raw_text)

    # ... then pass each sentence through the character splitter on its own, so
    # only sentences longer than chunk_size are broken up further and the
    # chunk overlap never spans two sentences.
    for sentence in sentences:
        flattened_texts.extend(text_splitter.split_text(sentence))

# The Cohere key must come from the environment, never from source control.
# The key that used to be hard-coded here has been published and should be
# revoked. The check runs before any Cohere object is created because the
# LangChain Cohere wrappers read COHERE_API_KEY when they are constructed.
if not os.environ.get("COHERE_API_KEY"):
    raise EnvironmentError(
        "COHERE_API_KEY is not set. Export it (or load it from a .env file) "
        "before running this cell."
    )

embeddings = CohereEmbeddings(model="multilingual-22-12")

# Set up a vector store used to save the vector embeddings.
connection_args = {
    "host": "localhost",
    "port": 19530,
}

# Pass the list itself: a tqdm wrapper is a plain iterator (no indexing or
# slicing), and its progress bar would only track iteration over the texts,
# not the embedding calls.
vector_store = Milvus.from_texts(
    flattened_texts,
    embedding=embeddings,
    connection_args=connection_args
)

# Initialize the Cohere LLM, wrapped so llama_index can drive it.
llm = LangChainLLM(llm=Cohere())

# Define service_context and storage_context
# NOTE(review): `vector_store` is a LangChain Milvus store; confirm that
# llama_index's StorageContext accepts it rather than requiring llama_index's
# own vector-store type.
service_context = ServiceContext.from_defaults(llm=llm)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the per-document indices, compose them into a graph, and create the
# query engines. The steps are ordered so every name is defined before it is
# used: indices -> graph -> query transform -> per-index engines -> graph engine.

if not all_documents:
    # Without documents there is nothing to index, and the graph construction
    # below would fail with a far less obvious error.
    raise ValueError("No documents were loaded; nothing to index.")

# Build document index
document_indices = {}
index_summaries = {}
for position, document in enumerate(all_documents):
    # Prefer the document's title when the class provides one, then the
    # loader-supplied source path, then a positional fallback.
    doc_key = (
        getattr(document, "title", None)
        or document.metadata.get("source")
        or f"document-{position}"
    )
    # Several documents can share one source (for example one per CSV row);
    # keep their keys distinct so no index is silently overwritten.
    if doc_key in document_indices:
        doc_key = f"{doc_key}#{position}"

    # NOTE(review): llama_index may expect its own Document type here rather
    # than a LangChain document - confirm against the installed version.
    document_indices[doc_key] = GPTVectorStoreIndex.from_documents(
        [document],
        service_context=service_context,
        storage_context=storage_context,
    )
    # set summary text for document
    index_summaries[doc_key] = f"Document: {doc_key}"

# Compose the per-document indices under a keyword-table root index. Query
# engines are attached later through graph.as_query_engine, not here.
graph = ComposableGraph.from_indices(
    GPTSimpleKeywordTableIndex,
    list(document_indices.values()),
    list(index_summaries.values()),
    max_keywords_per_chunk=50,
)

# NOTE(review): depending on the llama_index version this may need an
# LLMPredictor wrapping `llm` instead of the LLM itself - confirm.
decompose_transform = DecomposeQueryTransform(
    llm, verbose=True
)

# One decomposing query engine per document index, keyed by index id.
custom_query_engines = {}
for index in document_indices.values():
    query_engine = index.as_query_engine(service_context=service_context)
    transform_extra_info = {'index_summary': index.index_struct.summary}
    tranformed_query_engine = TransformQueryEngine(query_engine, decompose_transform, transform_extra_info=transform_extra_info)
    custom_query_engines[index.index_id] = tranformed_query_engine

# The root index summarises across the child answers.
custom_query_engines[graph.root_index.index_id] = graph.root_index.as_query_engine(
    retriever_mode='simple',
    response_mode='tree_summarize',
    service_context=service_context
)

query_engine_decompose = graph.as_query_engine(
    custom_query_engines=custom_query_engines,
)

# NOTE(review): as in the original flow, `transform_extra_info` here is the
# value left over from the last document in the loop above; confirm whether the
# graph-level engine should receive a graph-wide summary instead.
# The misspelled name `tranformed_query_engine` is kept because later cells may
# refer to it.
tranformed_query_engine = TransformQueryEngine(query_engine_decompose, decompose_transform, transform_extra_info=transform_extra_info)