In [21]:
import nest_asyncio
import os
import pickle
import re
import uuid

from dotenv import load_dotenv
from fastembed import TextEmbedding
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings
from llama_index.core import Document
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.anthropic import Anthropic
from llama_parse import LlamaParse
from pymilvus import (
    utility,
    CollectionSchema, DataType, FieldSchema, MilvusClient, model,
    connections, Collection, AnnSearchRequest, WeightedRanker, RRFRanker,
)
from typing import List

## Loading in of API Keys and Cloud Infrastructure
1. OpenAI (potentially for embedding models), Anthropic (for Claude 3.5 Sonnet), LLama (for LlamaParse)
2. Zillis' Endpoint and Token

In [3]:
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
CLAUDE_API_KEY = os.getenv('CLAUDE_API_KEY')
LLAMA_API_KEY = os.getenv('LLAMA_API_KEY')

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ["ANTHROPIC_API_KEY"] = CLAUDE_API_KEY
os.environ["LLAMA_CLOUD_API_KEY"] = LLAMA_API_KEY

ENDPOINT = os.getenv('ZILLIS_ENDPOINT')
TOKEN = os.getenv('ZILLIS_TOKEN')

connections.connect(uri=ENDPOINT, token=TOKEN)

In [17]:
llm = ChatAnthropic(
    model="claude-3-5-sonnet-20240620",
    max_tokens=4096,
    temperature=0.0,
    stop=["\n\nHuman"],
)

llama_llm = Anthropic(
    model="claude-3-5-sonnet-20240620",
    max_tokens=4096,
    temperature=0.0
)

In [22]:
nest_asyncio.apply()

## Collection Creation
1. Drop existing collection (if one exists)
2. Define Schema -> How your documents will be Ingested
3. Create Collection with Schema defined in 2.

In [5]:
# Specify the collection name
collection_name = "vector_index"

def drop_collection(collection_name):
    # Check if the collection exists
    if utility.has_collection(collection_name):
        collection = Collection(name=collection_name)
        # Release the collection
        collection.release()
        # Drop the collection if it exists
        utility.drop_collection(collection_name)
        print(f"Collection '{collection_name}' has been dropped")
    else:
        print(f"Collection '{collection_name}' does not exist")

# drop_collection(collection_name)

In [6]:
auto_id = FieldSchema(
    name="pk",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=True)

doc_id = FieldSchema(
    name="doc_id",
    dtype=DataType.VARCHAR,
    max_length=500
)

doc_source = FieldSchema(
    name="doc_source",
    dtype=DataType.VARCHAR,
    max_length=1000,
    default_value="NA"
)

doc_content = FieldSchema(
    name="text",
    dtype=DataType.VARCHAR,
    max_length=50000,
    default_value=""
)

vec_embeddings = FieldSchema(
    name="dense_embeddings",
    dtype=DataType.FLOAT_VECTOR,
    dim=1024
)

keyword_embeddings = FieldSchema(
    name="sparse_embeddings",
    dtype=DataType.SPARSE_FLOAT_VECTOR
)

In [7]:
schema = CollectionSchema(
  fields=[auto_id, doc_id, doc_content, doc_source, vec_embeddings, keyword_embeddings],
  description="milvus_schema",
  enable_dynamic_field=True
)

In [8]:
def create_collection(collection_name, schema):
    # Check if the collection exists
    if utility.has_collection(collection_name):
        print(f"Collection '{collection_name}' already exists")
    # Create the collection
    return Collection(name=collection_name, schema=schema, using='default', shards_num=2)

In [9]:
collection = create_collection(collection_name, schema)

Collection 'vector_index' already exists


In [10]:
bge_embed_model = TextEmbedding(model_name="BAAI/bge-large-en-v1.5")
openai_embed_model = OpenAIEmbeddings(model_name="text-embedding-3-large")
spalde_embed_model = model.sparse.SpladeEmbeddingFunction(
    # model_name="naver/splade-cocondenser-selfdistil",
    device="cpu",
    model_name="naver/splade-cocondenser-ensembledistil",
    # device="cuda:0"
)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 43509.38it/s]
                    model_name was transferred to model_kwargs.
                    Please confirm that model_name is what you intended.


## Pre-processing
1. Defining Helper functions for cleaning
2. Data Parsing using `LLamaParse` -> Node Parsing using MarkdownElementNodeParser. After this stage, Document objects are pickled
3. Chunk using appropriate technique.

In [11]:
def drop_indexes(collection: Collection, index_names: List[str]) -> None:
    # Release or drop the existing collection index
    collection.release()
    for name in index_names:
        collection.drop_index(index_name='name')
        print(f"Index '{name}' has been dropped")

In [12]:
# drop_indexes(collection, ['dense_embeddings', 'sparse_embeddings'])

In [43]:
def remove_table_of_contents(text):
    pattern = r"TABLE OF CONTENTS.*?(?=#)"
    cleaned_text = re.sub(pattern, "", text, flags=re.DOTALL)
    return cleaned_text.strip()

# Reset the index whenever you want to re-index the documents
idx = [0]

def convert_nodes_to_documents(text_nodes, object_nodes):
    """
    Converts nodes to Documents

    Args:
        text_nodes (List[Nodes]): List of text nodes
        object_nodes (List[Nodes]): List of object nodes

    Returns:
        documents (List[Documents]): List of Documents
    """
    documents = []
    for node in text_nodes:
        text = node.text
        doc_id = str(uuid.uuid4())
        doc = Document(
            text= text,
            metadata = {
                "id": f"node_{idx[0]}_{doc_id}",
                "start_char_idx": node.start_char_idx,
                "end_char_idx": node.end_char_idx,
                "is_table": False
            }
        )
        documents.append(doc)
        idx[0] += 1
        
    for node in object_nodes:
        text = node.text
        doc_id = str(uuid.uuid4())
        doc = Document(
            text= text,
            metadata = {
                "id": f"node_{idx[0]}_{doc_id}",
                "start_char_idx": node.start_char_idx,
                "end_char_idx": node.end_char_idx,
                "is_table": True
            }
        )
        documents.append(doc)
        idx[0] += 1
        
    return documents

In [44]:
# instantiate doc parser
parser = LlamaParse(
    result_type="markdown",
    num_workers=8,
    verbose = True,
    language="en",
)

# instantiate node parser
node_parser = MarkdownElementNodeParser(llm=llama_llm, num_workers=8)

In [45]:
# create folder to store parsed data
data_folder = "data"
os.makedirs(data_folder, exist_ok=True)

def parse_docs(file_location):
    for file_name in os.listdir(file_location):
        if not file_name.endswith(".pdf"):
            continue

        print("File: " + str(file_name))
        doc_path = os.path.join(file_location, file_name)
        modified_file_name = os.path.splitext(file_name)[0].lower().replace(' ', '_')
        data_path = os.path.join(data_folder, modified_file_name + '.pkl')

        # results in a list of Document Objects
        documents = parser.load_data(doc_path)
        
        for idx, doc in enumerate(documents):
            doc.text = remove_table_of_contents(doc.text)
            if idx > 4:
                break

        raw_nodes = node_parser.get_nodes_from_documents(documents)
        # list of text_nodes, list of objects
        text_nodes, objects = node_parser.get_nodes_and_objects(raw_nodes)
        
        final_docs = convert_nodes_to_documents(text_nodes, objects)

        pickle.dump(final_docs, open(data_path, "wb"))

In [46]:
file_location = "pdfs"

parse_docs(file_location)

File: Diabetes Medications.pdf
Started parsing the file under job_id 6c083eb1-a0f4-4286-83ff-0f8dc55802ea


0it [00:00, ?it/s]
1it [00:00, 23831.27it/s]
0it [00:00, ?it/s]


File: managing-pre-diabetes-(updated-on-27-jul-2021)c2bfc77474154c2abf623156a4b93002.pdf
Started parsing the file under job_id fa2d023b-c2fc-4cd1-83d7-934bf65d7887


0it [00:00, ?it/s]
1it [00:00, 13842.59it/s]
0it [00:00, ?it/s]
1it [00:00, 9619.96it/s]
1it [00:00, 14315.03it/s]
0it [00:00, ?it/s]


File: Diabetic Foot Ulcer_ Symptoms and Treatment.pdf
Started parsing the file under job_id 116281d5-447e-434d-9ab1-f388c3ab04f4


0it [00:00, ?it/s]
0it [00:00, ?it/s]


File: Diabetes Treatment_ Insulin.pdf
Started parsing the file under job_id eb69cc52-ed8d-422d-b7a4-cf71c80d1e58


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 18157.16it/s]


## Reading of pickles after parsing

In [47]:
def read_pickles(data_folder):
    doc_list = []
    for file_name in os.listdir(data_folder):
        doc_path = os.path.join(data_folder, file_name)
        if file_name.endswith(".pkl"):
            with open(doc_path, 'rb') as file:
                # data will be a doc_list
                data = pickle.load(file)
                doc_list.append(data)
                
    # since doc_list is a list of list of documents, we need to flatten it
    doc_list = [item for sublist in doc_list for item in sublist]
    return doc_list

In [48]:
doc_list = read_pickles(data_folder)

In [49]:
doc_list

[Document(id_='c0c4cf30-ad0f-400e-a67b-e7230ff0706c', embedding=None, metadata={'id': 'node_17_1726990e-a90c-4dda-b480-5c0883766e7c', 'start_char_idx': 1, 'end_char_idx': 1832, 'is_table': False}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Diabetic Foot Ulcer: Symptoms and Treatment\n\n What causes diabetic ulcers?\n\nHere’s what you need to know in terms of self-care, prevention, and the treatment of diabetic ulcer.\n\n What Is a Diabetic Ulcer?\n\nA diabetic ulcer is an open sore or wound resulting from poor circulation or lack of sensation due to nerve damage caused by elevated blood glucose levels.\n\nThe legs and feet are most at risk for these ulcers. Diabetes makes it hard for the body to heal itself, increasing the risk of wounds becoming chronic and raising the risk of infection.\n\nSignificantly, nonhealing diabetic ulcers result in a large number of amputations in Singapore. About two major limb amputations are carried out daily t

In [55]:
for doc in doc_list:
    is_table = doc.metadata["is_table"]
    if not is_table:
        print(len(doc.text))

1831
1219
1742
172
2736
2161
226
550
349
701
2137
1426
2110
2061
79
674
1413
1345
89
1515


## Creating of Index

In [None]:
def create_index(collection: Collection, field_name: str, index_params: dict) -> None:
    """
    Creates an index on the specified field of the collection. Follows index_params.

    Args:
        collection (Collection): Zillis/Milvus Collection
        field_name (str): Name of the field to create the index on
        index_params (dict): A dictionary containing the index parameters -> "metric_type" (str), "index_type" (str), "params" (dict)
    """
    # Create the index
    index = collection.create_index(field_name=field_name, index_params=index_params)
    print(f"Index '{index.name}' has been created")

In [None]:
# Create the indexes
def create_all_indexes(collection: Collection):
    # Dense embeddings index
    create_index(
        collection=collection,
        field_name="dense_embeddings",
        index_params={
            "metric_type": "COSINE",
            "index_type": "HNSW",
            "params": {
                "M": 5,
                "efConstruction": 512
            }
        }
    )

    # Sparse embeddings index
    create_index(
        collection=collection,
        field_name="sparse_embeddings",
        index_params={
            "metric_type": "IP",
            "index_type": "SPARSE_INVERTED_INDEX",
            "params": {
                "drop_ratio_build": 0.2
            }
        }
    )