In [1]:
import nest_asyncio
import os
import pickle
import re
import uuid

from alive_progress import alive_bar
from dotenv import load_dotenv
from fastembed import TextEmbedding
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core import Document
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.anthropic import Anthropic
from llama_parse import LlamaParse
from pymilvus import (
    utility,
    CollectionSchema, DataType, FieldSchema, model,
    connections, Collection, AnnSearchRequest, WeightedRanker, RRFRanker,
)
from typing import List

  from .autonotebook import tqdm as notebook_tqdm


## Loading in of API Keys and Cloud Infrastructure
1. OpenAI (potentially for embedding models), Anthropic (for Claude 3.5 Sonnet), LlamaCloud (for LlamaParse)
2. Zilliz Cloud endpoint and token (read from the `ZILLIS_ENDPOINT` and `ZILLIS_TOKEN` environment variables)

In [2]:
load_dotenv()


def _require_env(name):
    """Return environment variable ``name``; raise a descriptive error if it is unset.

    Assigning ``None`` into ``os.environ`` fails with an unhelpful
    ``TypeError``, so a missing key is reported here by name instead.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable '{name}' is not set; add it to your .env file or shell."
        )
    return value


OPENAI_API_KEY = _require_env('OPENAI_API_KEY')
CLAUDE_API_KEY = _require_env('CLAUDE_API_KEY')
LLAMA_API_KEY = _require_env('LLAMA_API_KEY')

# Re-export the keys under the variable names written below, which differ
# from the names read above for the Anthropic and Llama keys.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ["ANTHROPIC_API_KEY"] = CLAUDE_API_KEY
os.environ["LLAMA_CLOUD_API_KEY"] = LLAMA_API_KEY

# Vector database endpoint and token (the env var names are spelled ZILLIS_*).
ENDPOINT = os.getenv('ZILLIS_ENDPOINT')
TOKEN = os.getenv('ZILLIS_TOKEN')

connections.connect(uri=ENDPOINT, token=TOKEN)

In [3]:
# Settings shared by both Claude clients so they cannot drift apart.
_claude_settings = {
    "model": "claude-3-5-sonnet-20240620",
    "max_tokens": 4096,
    "temperature": 0.0,
}

# LangChain chat client; generation stops at the "\n\nHuman" marker.
llm = ChatAnthropic(stop=["\n\nHuman"], **_claude_settings)

# LlamaIndex client; handed to MarkdownElementNodeParser further down.
llama_llm = Anthropic(**_claude_settings)

In [4]:
# Apply the nest_asyncio patch before any parsing cell runs.
# NOTE(review): presumably needed because Jupyter already runs an event loop
# and the LlamaParse / node-parser calls drive async work internally — confirm.
nest_asyncio.apply()

## Collection Creation
1. Drop existing collection (if one exists)
2. Define Schema -> How your documents will be Ingested
3. Create Collection with Schema defined in 2.

In [5]:
# Name of the Milvus collection used throughout this notebook
collection_name = "vector_index"


def drop_collection(collection_name):
    """Release and drop ``collection_name`` if it exists, printing the outcome.

    Args:
        collection_name (str): Name of the collection to remove.
    """
    if not utility.has_collection(collection_name):
        print(f"Collection '{collection_name}' does not exist")
        return

    # Release the collection first, then drop it.
    Collection(name=collection_name).release()
    utility.drop_collection(collection_name)
    print(f"Collection '{collection_name}' has been dropped")


# Destructive: running this cell wipes any previously ingested data.
drop_collection(collection_name)

Collection 'vector_index' has been dropped


In [6]:
# Auto-generated INT64 primary key.
auto_id = FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True)

# Identifier of the stored node/chunk (at most 500 characters).
doc_id = FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=500)

# Source label of the chunk; defaults to "NA".
doc_source = FieldSchema(
    name="doc_source", dtype=DataType.VARCHAR, max_length=1000, default_value="NA"
)

# Raw chunk text (at most 50000 characters); defaults to the empty string.
doc_content = FieldSchema(
    name="text", dtype=DataType.VARCHAR, max_length=50000, default_value=""
)

# Dense vector field; embeddings stored here must be 1024-dimensional.
vec_embeddings = FieldSchema(name="dense_embeddings", dtype=DataType.FLOAT_VECTOR, dim=1024)

# Sparse vector field for the keyword-style embeddings.
keyword_embeddings = FieldSchema(name="sparse_embeddings", dtype=DataType.SPARSE_FLOAT_VECTOR)

In [7]:
# The batch-ingestion cell supplies its columns in this same order
# (minus the auto-generated "pk").
_schema_fields = [auto_id, doc_id, doc_content, doc_source, vec_embeddings, keyword_embeddings]

schema = CollectionSchema(
    fields=_schema_fields,
    description="milvus_schema",
    enable_dynamic_field=True,
)

In [8]:
def create_collection(collection_name, schema, using='default', shards_num=2):
    """Return a ``Collection`` handle for ``collection_name`` built with ``schema``.

    The ``Collection`` constructor is called whether or not the collection
    already exists; when it exists a notice is printed first.
    NOTE(review): pymilvus is expected to hand back the existing collection
    when the schema matches and to raise when it does not — confirm.

    Args:
        collection_name (str): Name of the collection.
        schema (CollectionSchema): Schema used for the collection.
        using (str): Connection alias to use. Defaults to 'default'.
        shards_num (int): Number of shards requested. Defaults to 2.

    Returns:
        Collection: Handle to the (new or existing) collection.
    """
    if utility.has_collection(collection_name, using=using):
        print(f"Collection '{collection_name}' already exists")
    return Collection(name=collection_name, schema=schema, using=using, shards_num=shards_num)

In [9]:
# Handle used by the ingestion and indexing cells below.
collection = create_collection(collection_name, schema)

In [None]:
# Dense embedding model used for ingestion and for queries.
bge_embed_model = TextEmbedding(model_name="BAAI/bge-large-en-v1.5")

# OpenAIEmbeddings selects its model through `model`; `model_name` is not the
# model field, so the requested model would not have been the one used.
# NOTE(review): this model is not used in the cells below, and its default
# output size is presumably not the 1024 dims the schema expects — confirm
# before wiring it in.
openai_embed_model = OpenAIEmbeddings(model="text-embedding-3-large")

# Sparse (SPLADE) embedding model, run on CPU. The variable name keeps its
# original spelling because later cells reference it.
spalde_embed_model = model.sparse.SpladeEmbeddingFunction(
    model_name="naver/splade-cocondenser-ensembledistil",
    device="cpu",
)

## Pre-processing
1. Defining Helper functions for cleaning
2. Data parsing using `LlamaParse`, followed by node parsing using `MarkdownElementNodeParser`. After this stage, the resulting Document objects are pickled.
3. Chunk long texts using appropriate technique.

In [None]:
def remove_table_of_contents(text):
    """Strip a table-of-contents section from ``text``.

    Everything from the literal "TABLE OF CONTENTS" up to (but excluding) the
    next "#" character is removed; text without a following "#" is left as is.
    The result is returned with surrounding whitespace trimmed.

    Args:
        text (str): Text that may contain a table of contents.

    Returns:
        str: The cleaned, stripped text.
    """
    # DOTALL lets the lazy match run across line breaks.
    toc_regex = re.compile(r"TABLE OF CONTENTS.*?(?=#)", re.DOTALL)
    without_toc = toc_regex.sub("", text)
    return without_toc.strip()

# Reset the index whenever you want to re-index the documents
idx = [0]


def _node_to_document(node, is_table, source):
    """Wrap one parsed node in a Document and advance the running node counter."""
    unique_suffix = str(uuid.uuid4())
    document = Document(
        text=node.text,
        metadata={
            # Running counter plus a UUID4 suffix.
            "id": f"node_{idx[0]}_{unique_suffix}",
            "start_char_idx": node.start_char_idx,
            "end_char_idx": node.end_char_idx,
            "is_table": is_table,
            "source": source,
        },
    )
    idx[0] += 1
    return document


def convert_nodes_to_documents(text_nodes, object_nodes, source):
    """
    Converts nodes to Documents

    Text nodes come first and are tagged ``is_table=False``; object nodes
    follow and are tagged ``is_table=True``. Each document's metadata "id"
    is built from the module-level ``idx`` counter, which this call advances
    once per node.

    Args:
        text_nodes (List[Nodes]): List of text nodes
        object_nodes (List[Nodes]): List of object nodes
        source (str): Source of the document

    Returns:
        documents (List[Documents]): List of Documents
    """
    documents = [_node_to_document(node, False, source) for node in text_nodes]
    documents += [_node_to_document(node, True, source) for node in object_nodes]
    return documents

In [None]:
# Document parser: emits markdown for English-language input, 8 workers, verbose logging.
parser = LlamaParse(language="en", result_type="markdown", num_workers=8, verbose=True)

# Node parser backed by the LlamaIndex Claude client, also with 8 workers.
node_parser = MarkdownElementNodeParser(num_workers=8, llm=llama_llm)

In [None]:
# create folder to store parsed data
data_folder = "data"
os.makedirs(data_folder, exist_ok=True)

def parse_docs(file_location):
    """Parse every ``.pdf`` in ``file_location`` and pickle the resulting Documents.

    For each PDF the file is parsed with ``parser``, tables of contents are
    stripped from the leading documents, the result is split into text and
    object nodes with ``node_parser``, converted via
    ``convert_nodes_to_documents`` and written to
    ``<data_folder>/<lower_snake_case_name>.pkl``.

    Args:
        file_location (str): Folder containing the PDF files.
    """
    for file_name in os.listdir(file_location):
        if not file_name.endswith(".pdf"):
            continue

        print("File: " + str(file_name))
        doc_path = os.path.join(file_location, file_name)
        modified_file_name = os.path.splitext(file_name)[0].lower().replace(' ', '_')
        data_path = os.path.join(data_folder, modified_file_name + '.pkl')

        # results in a list of Document Objects
        documents = parser.load_data(doc_path)

        # The loop variable is not called `idx`, so it cannot be confused with
        # the module-level `idx` counter used by convert_nodes_to_documents.
        # The break comes after cleaning, so documents 0 through 5 are processed.
        for page_number, doc in enumerate(documents):
            doc.text = remove_table_of_contents(doc.text)
            if page_number > 4:
                break

        raw_nodes = node_parser.get_nodes_from_documents(documents)
        # list of text_nodes, list of objects
        text_nodes, objects = node_parser.get_nodes_and_objects(raw_nodes)

        final_docs = convert_nodes_to_documents(text_nodes, objects, modified_file_name)

        # Context manager so the file is flushed and closed even on error.
        with open(data_path, "wb") as pickle_file:
            pickle.dump(final_docs, pickle_file)

In [None]:
# Folder that parse_docs scans for .pdf files.
file_location = "pdfs"

# The parsing call is left commented out; the cells below read the pickles
# already present in `data_folder`. Uncomment to (re-)parse the PDFs.
# parse_docs(file_location)

## Reading of pickles after parsing

In [None]:
def read_pickles(data_folder):
    """Load and flatten every pickled list of Documents found in ``data_folder``.

    Only ``.pkl`` files whose content is a list of ``Document`` objects are
    used. Other pickles are skipped with a message: this notebook also writes
    its embedding caches (``dense_embeddings.pkl`` / ``sparse_embeddings.pkl``)
    into ``data/``, and those must not be mixed into the document list on a
    re-run.

    Args:
        data_folder (str): Folder containing the pickles written by parse_docs.

    Returns:
        list: Flat list of Documents, in sorted file-name order.
    """
    doc_list = []
    # Sorted so the document order is reproducible across runs and platforms.
    for file_name in sorted(os.listdir(data_folder)):
        if not file_name.endswith(".pkl"):
            continue

        doc_path = os.path.join(data_folder, file_name)
        # SECURITY: pickle.load can execute arbitrary code; only read pickles
        # that this notebook produced itself.
        with open(doc_path, 'rb') as file:
            data = pickle.load(file)

        is_document_list = isinstance(data, list) and all(
            isinstance(item, Document) for item in data
        )
        if not is_document_list:
            print(f"Skipping '{file_name}': not a pickled list of Documents")
            continue

        # each pickle holds a list of documents, so extend to keep the result flat
        doc_list.extend(data)

    return doc_list

In [None]:
# Load the parsed documents back from the pickles in `data_folder`.
doc_list = read_pickles(data_folder)

In [None]:
# NOTE(review): this displays the entire list; a slice such as doc_list[:3]
# would keep the saved notebook output small.
doc_list

### Recursive splitting for longer texts

In [None]:
# Text documents longer than this many characters are split into chunks.
LONG_DOC_CHAR_THRESHOLD = 1500

long_docs, short_docs = [], []
for doc in doc_list:
    # Table documents used to match neither branch and were dropped before
    # embedding. They now stay in `short_docs` so they are embedded whole
    # and are never split.
    if not doc.metadata["is_table"] and len(doc.text) > LONG_DOC_CHAR_THRESHOLD:
        long_docs.append(doc)
    else:
        short_docs.append(doc)

In [None]:
def chunk_doc(doc: Document, text_splitter: RecursiveCharacterTextSplitter) -> List[Document]:
    """Split ``doc`` into chunk Documents using ``text_splitter``.

    Every chunk copies the parent's metadata and adds ``chunk_id``
    ("<parent id>_<position>") and ``parent_id``. A missing parent "id"
    is treated as the empty string.

    Args:
        doc: Document whose text is split.
        text_splitter: Splitter providing ``split_text``.

    Returns:
        One Document per chunk, in splitter order.
    """
    parent_id = doc.metadata.get('id', '')
    chunk_documents = []
    for position, piece in enumerate(text_splitter.split_text(doc.text)):
        chunk_metadata = dict(doc.metadata)
        chunk_metadata['chunk_id'] = f"{parent_id}_{position}"
        chunk_metadata['parent_id'] = parent_id
        chunk_documents.append(Document(text=piece, metadata=chunk_metadata))
    return chunk_documents

def recursive_chunk_documents(long_docs: List[Document],
                              short_docs: List[Document],
                              chunk_size: int = 512,
                              chunk_overlap: int = 64,
                              separators: List[str] = ["\n\n", "\n", " ", ""]) -> List[Document]:
    """Chunk the long documents and return them together with the short ones.

    The result is a new list: ``short_docs`` is copied rather than extended
    in place, so calling this function (or re-running the cell) repeatedly
    does not keep appending duplicate chunks to the caller's list.

    Args:
        long_docs: Documents to split with a RecursiveCharacterTextSplitter.
        short_docs: Documents kept as they are.
        chunk_size: ``chunk_size`` passed to the splitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
        separators: ``separators`` passed to the splitter.

    Returns:
        The short documents followed by the chunks of each long document.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Copy so the shared default list is never handed out directly.
        separators=list(separators)
    )

    combined_docs = list(short_docs)
    for doc in long_docs:
        combined_docs.extend(chunk_doc(doc, text_splitter))

    return combined_docs

In [None]:
# Short documents plus the chunks produced from the long ones (default chunking settings).
final_docs = recursive_chunk_documents(long_docs, short_docs)

In [None]:
# NOTE(review): this displays the entire list; a slice such as final_docs[:3]
# would keep the saved notebook output small.
final_docs

In [None]:
# Parallel columns for ingestion: one id, text and source per document.
all_ids, all_texts, all_sources = [], [], []

with alive_bar(len(final_docs), title='Metadata', force_tty=True) as bar:
    for doc in final_docs:
        metadata = doc.metadata
        # Chunks carry a 'chunk_id'; documents that were not split only have
        # 'id'. An explicit key check replaces the former bare `except:`,
        # which would also have hidden unrelated errors.
        doc_id = metadata['chunk_id'] if 'chunk_id' in metadata else metadata['id']
        text = doc.text
        source = metadata['source']

        # Append only after all three values were read, so the columns stay aligned.
        all_ids.append(doc_id)
        all_texts.append(text)
        all_sources.append(source)
        bar()

## Feeding texts (and tables) into respective Embedding Models
1. Dense Embeddings with BGE
2. Sparse Embeddings with SPLADE

In [None]:
# Dense vectors from the BGE model; list() collects everything `embed` yields.
dense_embeddings_list = list(bge_embed_model.embed(all_texts))

In [None]:
# Sparse vectors from the SPLADE model for the same texts; the result exposes
# `.shape`, which the length check in the next code cell relies on.
sparse_embeddings_list = spalde_embed_model.encode_documents(all_texts)

### Saving of respective embedding lists

In [None]:
# Sanity check: all five counts should be equal before ingestion.
len(all_ids), len(all_texts), len(all_sources), len(dense_embeddings_list), sparse_embeddings_list.shape[0]

In [None]:
# Cache both embedding sets on disk, dense first and then sparse.
for cache_path, payload in (
    ('data/dense_embeddings.pkl', dense_embeddings_list),
    ('data/sparse_embeddings.pkl', sparse_embeddings_list),
):
    with open(cache_path, 'wb') as f:
        pickle.dump(payload, f)

## Batch Ingestion

In [None]:
# Column-oriented payload for insertion. The order follows the schema's field
# list without the auto-generated "pk":
# doc_id, text, doc_source, dense_embeddings, sparse_embeddings.
data = [
    all_ids,
    all_texts,
    all_sources,
    dense_embeddings_list,
    sparse_embeddings_list
]

In [None]:
# Rows sent to the collection per insert call
batch_size = 10
total_elements = len(data[0])  # every column has the same length

# Ceiling division: a final partial batch still counts as one batch
total_batches = -(-total_elements // batch_size)

# One progress tick per inserted batch
with alive_bar(total_batches, force_tty=True) as bar:
    for batch_no in range(total_batches):
        start = batch_no * batch_size
        end = min(start + batch_size, total_elements)
        # Take the same row window from every column
        collection.insert([column[start:end] for column in data])
        bar()

## Creating of Index
1. Delete any existing Index
2. Create new Indexes

In [13]:
def drop_indexes(collection: Collection, index_names: List[str]) -> None:
    """Release ``collection`` and drop each named index that exists.

    Names that do not match an existing index are reported and skipped, so
    the function neither claims to have dropped something it did not nor
    depends on any index having been created yet.

    Args:
        collection: Collection whose indexes are dropped.
        index_names: Index names (not field names) to drop.
    """
    # Release the collection before touching its indexes
    collection.release()
    for name in index_names:
        if not collection.has_index(index_name=name):
            print(f"Index '{name}' does not exist")
            continue
        collection.drop_index(index_name=name)
        print(f"Index '{name}' has been dropped")

In [None]:
# Use the index names assigned in create_all_indexes ("<field>_index"),
# not the vector field names.
drop_indexes(collection, index_names=["sparse_embeddings_index", "dense_embeddings_index"])

In [None]:
# Create the indexes
def create_all_indexes(collection: Collection):
    """Create the dense (HNSW, COSINE) and sparse (inverted, IP) indexes.

    The dense index is built first, then the sparse one; a confirmation line
    is printed after each.

    Args:
        collection: Collection holding the "dense_embeddings" and
            "sparse_embeddings" vector fields.
    """
    # (field name, index name, index parameters, confirmation message)
    index_specs = (
        (
            "dense_embeddings",
            "dense_embeddings_index",
            {
                "metric_type": "COSINE",
                "index_type": "HNSW",
                "params": {"M": 5, "efConstruction": 512},
            },
            "Dense embeddings index created",
        ),
        (
            "sparse_embeddings",
            "sparse_embeddings_index",
            {
                "metric_type": "IP",
                "index_type": "SPARSE_INVERTED_INDEX",
                "params": {"drop_ratio_build": 0.2},
            },
            "Sparse embeddings index created",
        ),
    )

    for field_name, index_name, index_params, done_message in index_specs:
        collection.create_index(
            field_name=field_name,
            index_params=index_params,
            index_name=index_name,
        )
        print(done_message)

In [None]:
# Build the dense and sparse indexes on the collection.
create_all_indexes(collection)

In [None]:
# NOTE(review): presumably the collection must be loaded before it can be searched — confirm.
collection.load()

## Loading in of collection and setting up Hybrid Searches

In [None]:
# Re-bind `collection` by name only (no schema argument).
# NOTE(review): presumably so the search cells can run without repeating the
# creation and ingestion cells above — confirm.
collection = Collection(name=collection_name)

In [None]:
def hybrid_search(query: str, top_k: int = 3) -> str:
    """Run a dense + sparse hybrid search and format the hits as context text.

    The query is embedded with the BGE model (dense) and the SPLADE model
    (sparse). Both vector fields are searched and the two result lists are
    fused with ``RRFRanker``.

    Args:
        query: Natural-language question.
        top_k: Number of hits requested from each sub-search and returned
            after reranking. Defaults to 3.

    Returns:
        One "Source: ...\\nContext: ..." entry per hit, separated by blank
        lines; an empty string when nothing is found.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    dense_embedding = list(bge_embed_model.query_embed(query))[0]
    sparse_embedding = list(spalde_embed_model.encode_queries([query]))

    # Search requests take search-time parameters. The build-time keys
    # (M / efConstruction, drop_ratio_build) belong to create_index only.
    dense_request = AnnSearchRequest(
        data=[dense_embedding],  # content vector embedding
        anns_field='dense_embeddings',  # content vector field
        # HNSW search breadth; kept at least as large as the requested limit.
        param={"metric_type": "COSINE", "params": {"ef": max(64, top_k)}},
        limit=top_k
    )
    sparse_request = AnnSearchRequest(
        data=sparse_embedding,  # keyword vector embedding
        anns_field='sparse_embeddings',  # keyword vector field
        param={"metric_type": "IP", "params": {"drop_ratio_search": 0.2}},
        limit=top_k
    )

    search_results = collection.hybrid_search(
        reqs=[dense_request, sparse_request],
        output_fields=['doc_id', 'text', 'doc_source'],
        # Reciprocal rank fusion of the two result lists
        # (WeightedRanker is the weighted alternative).
        rerank=RRFRanker(),
        limit=top_k
    )

    # One query was sent, so the first entry holds all of its hits.
    hits = search_results[0]

    context = [
        f"Source: {hit.doc_source}\nContext: {hit.text}"
        for hit in hits
    ]

    return "\n\n".join(context)

In [None]:
# Example query to check the retrieval pipeline end to end.
res = hybrid_search("How should i know whether i should throw away my insulin?")

In [None]:
# print() so the "\n" separators in the context string render as line breaks.
print(res)