### Import dependencies

In [1]:
#import PyMuPDF
import fitz

# import llama_index and other necessary modules
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.tools import FunctionTool
from llama_index.llms.openai import OpenAI
from llama_index.agent.openai import OpenAIAgent
from llama_index.core.schema import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import Settings
from llama_index.core import Document


# import necessary modules for Qdrant and FastEmbed
from fastembed import SparseTextEmbedding, TextEmbedding
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import PointStruct, SparseVector
from tqdm import tqdm

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# import necessary modules for file handling and JSON processing
from typing import List
import pprint
import os
import json
import re

### Create chunking function


In [2]:
def run_document_preprocessing(input_dir, chunk_size, chunk_overlap, output_json="nodes.json"):
    """Load every file in ``input_dir``, split it into nodes and dump them to JSON.

    Args:
        input_dir: Directory handed to ``SimpleDirectoryReader``.
        chunk_size: ``chunk_size`` forwarded to ``SentenceSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``SentenceSplitter``.
        output_json: Path of the JSON file that receives the serialised nodes.

    Returns:
        A summary dict with the keys ``success``, ``message``, ``nodes_saved``
        and ``num_nodes``.
    """
    print('Processing documents....')

    # Read the raw documents, then chunk them.
    reader = SimpleDirectoryReader(input_dir=input_dir)
    loaded_docs = reader.load_data()
    chunker = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = chunker.get_nodes_from_documents(loaded_docs)

    # Serialise each node to a plain dict and persist the whole list.
    serialised = list(map(lambda chunk: chunk.to_dict(), chunks))
    with open(output_json, "w") as out_file:
        out_file.write(json.dumps(serialised, indent=2))

    chunk_count = len(chunks)
    summary = dict(
        success=True,
        message=f"Processed {chunk_count} nodes. Saved to {output_json}",
        nodes_saved=output_json,
        num_nodes=chunk_count,
    )
    return summary

In [3]:
def extract_text_with_pymupdf(input_dir):
    """Extract text from the PDFs in ``input_dir``, one Document per non-blank page.

    Args:
        input_dir: Directory whose direct entries are scanned. Only names
            ending in ``.pdf`` (case-insensitive) are opened.

    Returns:
        A list of ``Document`` objects ordered by file name, then page number.
        Each carries ``file_path``, ``file_name``, ``file_size``, ``file_type``
        and a 1-based ``page_number``, followed by the entries of the PDF's
        own metadata dict.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raise; the PDF handle is
        closed before the exception propagates.
    """
    documents = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document (and therefore chunk) order reproducible between runs.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(input_dir, filename)
        file_size = os.path.getsize(file_path)  # identical for every page of the file

        doc = fitz.open(file_path)
        try:
            pdf_metadata = doc.metadata or {}

            for i, page in enumerate(doc):
                text = page.get_text()

                if not text.strip():
                    continue  # Skip blank pages

                # Create one Document per page
                documents.append(Document(
                    text=text,
                    metadata={
                        'file_path': file_path,
                        'file_name': filename,
                        'file_size': file_size,
                        'file_type': 'application/pdf',
                        'page_number': i + 1,
                        **pdf_metadata,
                    }
                ))
        finally:
            # Release the file handle even when a page fails to extract.
            doc.close()

    return documents

def run_document_preprocessing_new(input_dir, chunk_size, chunk_overlap, output_json="nodes.json"):
    """Extract PDF text with PyMuPDF, split it into nodes and dump them to JSON.

    Args:
        input_dir: Directory passed to ``extract_text_with_pymupdf``.
        chunk_size: ``chunk_size`` forwarded to ``SentenceSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``SentenceSplitter``.
        output_json: Path of the JSON file that receives the serialised nodes.

    Returns:
        A summary dict with the keys ``success``, ``message``, ``nodes_saved``
        and ``num_nodes``.
    """
    print('Processing documents....')

    pdf_documents = extract_text_with_pymupdf(input_dir)

    # Chunk the extracted documents.
    node_list = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).get_nodes_from_documents(pdf_documents)

    # Turn every node into a plain dict so it can be written as JSON.
    payload = []
    for item in node_list:
        payload.append(item.to_dict())

    with open(output_json, "w") as handle:
        json.dump(payload, handle, indent=2)

    total = len(node_list)
    report = {}
    report["success"] = True
    report["message"] = f"Processed {total} nodes. Saved to {output_json}"
    report["nodes_saved"] = output_json
    report["num_nodes"] = total
    return report

### Create class to upload data to Qdrant

In [4]:
class Qdrant_vector_db():
    """Index pre-chunked text nodes into a Qdrant collection with hybrid vectors.

    Connection settings are read from the environment when the class body is
    executed, and a single ``QdrantClient`` is stored on the class, so every
    instance (and any code that reads ``Qdrant_vector_db.qdrant_client``)
    shares the same client.
    """

    # Connection settings taken from environment variables.
    Qdrant_API_KEY = os.getenv('QDRANT_API_KEY')
    Qdrant_URL = os.getenv('QDRANT_URL')
    # Default collection used when index_to_qdrant() receives no collection_name.
    Collection_Name = os.getenv('QDRANT_COLLECTION_NAME')
    qdrant_client = QdrantClient(
        url=Qdrant_URL,
        api_key=Qdrant_API_KEY,
    )

    # Short alias -> dense model name handed to TextEmbedding.
    Embeddings = {
        "sentence-transformer": "sentence-transformers/all-MiniLM-L6-v2",
        "snowflake": "Snowflake/snowflake-arctic-embed-m",
        "BAAI": "BAAI/bge-large-en-v1.5",
    }

    def index_to_qdrant(self,
        embedding_model_name: str,
        nodes_json_path: str,
        collection_name: str = None,
        dense_vector_size: int = 384,
        sparse_model_name: str = "Qdrant/bm42-all-minilm-l6-v2-attentions",
        batch_size: int = 64
    ) -> None:
        """
        Index documents into Qdrant vector database with hybrid (dense + sparse) embeddings.

        Args:
            embedding_model_name: Alias of the dense embedding model; must be a key
                of ``Embeddings`` (e.g., "sentence-transformer")
            nodes_json_path: Path to the nodes JSON file; each entry must provide
                the keys ``text`` and ``metadata``
            collection_name: Name of the Qdrant collection. When None, falls back to
                the QDRANT_COLLECTION_NAME environment variable, then to the legacy
                ``collection_name`` environment variable
            dense_vector_size: Size of dense embeddings vector; only used when the
                collection has to be created
            sparse_model_name: Name of sparse embedding model
            batch_size: Number of nodes embedded and upserted per request

        Raises:
            KeyError: ``embedding_model_name`` is not a key of ``Embeddings``.
            ValueError: no collection name can be resolved, ``batch_size`` is
                smaller than 1, or a new collection would be created with a
                ``dense_vector_size`` that differs from the model's output size.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Fail fast on an unknown alias, before touching files or the server.
        if embedding_model_name not in self.Embeddings:
            raise KeyError(
                f"Unknown embedding model '{embedding_model_name}'. "
                f"Available aliases: {sorted(self.Embeddings)}"
            )

        # Resolve the target collection. The class attribute holds
        # QDRANT_COLLECTION_NAME; 'collection_name' is kept as a last resort
        # because earlier versions of this method read that variable.
        collection_name = (
            collection_name
            or self.Collection_Name
            or os.getenv('collection_name')
        )
        if not collection_name:
            raise ValueError(
                "No collection name given and QDRANT_COLLECTION_NAME is not set"
            )

        # Load nodes
        print("Loading nodes from JSON file...")
        try:
            with open(nodes_json_path, 'r', encoding='utf-8') as file:
                nodes = json.load(file)
            documents = [node['text'] for node in nodes]
            metadata_list = [node['metadata'] for node in nodes]
            print(f"Loaded {len(nodes)} nodes from {nodes_json_path}")
        except Exception as e:
            print(f"Error loading nodes: {e}")
            raise

        # Initialize embedding models before creating anything on the server,
        # so a bad model name cannot leave an empty collection behind.
        dense_embedder = TextEmbedding(model_name=self.Embeddings[embedding_model_name])
        sparse_embedder = SparseTextEmbedding(model_name=sparse_model_name)

        # Create collection if not exists
        if not self.qdrant_client.collection_exists(collection_name):
            # Embed a throwaway string to learn the model's real output size;
            # a collection created with the wrong size rejects every upsert.
            probe = next(iter(dense_embedder.embed(["dimension probe"])))
            if len(probe) != dense_vector_size:
                raise ValueError(
                    f"dense_vector_size={dense_vector_size} does not match the "
                    f"{len(probe)}-dimensional output of '{embedding_model_name}'"
                )

            print(f"Creating collection '{collection_name}'...")
            self.qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config={
                    'dense': models.VectorParams(
                        size=dense_vector_size,
                        distance=models.Distance.COSINE,
                    )
                },
                sparse_vectors_config={
                    "sparse": models.SparseVectorParams(
                        index=models.SparseIndexParams(on_disk=False),
                    )
                }
            )

        if not documents:
            print(f"No nodes found in {nodes_json_path}; nothing to index.")
            return

        # Embed and upsert batch by batch: the models see whole batches instead
        # of single texts, and no request carries the entire corpus.
        indexed = 0
        with tqdm(total=len(documents), desc="Indexing documents") as progress:
            for start in range(0, len(documents), batch_size):
                batch_docs = documents[start:start + batch_size]
                batch_meta = metadata_list[start:start + batch_size]
                dense_batch = dense_embedder.embed(batch_docs)
                sparse_batch = sparse_embedder.embed(batch_docs)

                points = []
                rows = zip(batch_docs, batch_meta, dense_batch, sparse_batch)
                for offset, (doc, metadata, dense_embedding, sparse_embedding) in enumerate(rows):
                    sparse_vector = models.SparseVector(
                        indices=sparse_embedding.indices.tolist(),
                        values=sparse_embedding.values.tolist()
                    )
                    points.append(models.PointStruct(
                        # Point id is the node's position in the JSON file.
                        id=start + offset,
                        vector={
                            "dense": dense_embedding.tolist(),
                            "sparse": sparse_vector
                        },
                        payload={
                            "text": doc,
                            **metadata
                        }
                    ))

                self.qdrant_client.upsert(
                    collection_name=collection_name,
                    points=points
                )
                indexed += len(points)
                progress.update(len(points))

        print(f"Successfully indexed {indexed} documents in collection '{collection_name}'")

  qdrant_client = QdrantClient(


### Process documents

#### Extract text and create chunks in JSON format

In [5]:
# Extract the PDF text from ./Data, split it into chunks and write them to
# nodes.json; keep the returned summary dict for display
result = run_document_preprocessing_new(
    input_dir="./Data",
    chunk_size=500,
    chunk_overlap=100,
    output_json="nodes.json"
    )

# Pretty-print the summary dict
pprint.pprint(result)

Processing documents....
{'message': 'Processed 176 nodes. Saved to nodes.json',
 'nodes_saved': 'nodes.json',
 'num_nodes': 176,
 'success': True}


In [6]:
# Create an instance of the Qdrant_vector_db class (the Qdrant client itself is
# a class attribute, so it already exists once the class has been defined)
qdrant_db=Qdrant_vector_db()

In [7]:
# Index the chunks stored in nodes.json into the 'finance_documents' collection,
# using the "sentence-transformer" alias and a dense vector size of 384
qdrant_db.index_to_qdrant(
        embedding_model_name="sentence-transformer",
        nodes_json_path="nodes.json",
        collection_name="finance_documents",
        dense_vector_size=384,
    )

Loading nodes from JSON file...
Loaded 176 nodes from nodes.json
Creating collection 'finance_documents'...


Indexing documents: 100%|██████████| 176/176 [00:23<00:00,  7.34it/s]


Successfully indexed 176 documents in collection 'finance_documents'


# *** ONLY RUN THE NOTEBOOK UP TO THIS POINT ***

From this point on the notebook explains the most critical parts of the code

### Code analysis

#### Text loading and splitting - existing version

In [None]:
# Settings for this walkthrough: source directory and the chunking parameters
# passed to SentenceSplitter
input_dir = "./Data"
chunk_size = 500
chunk_overlap = 100

# Load documents with LlamaIndex's generic directory reader
documents = SimpleDirectoryReader(input_dir=input_dir).load_data()

# Split documents into nodes
splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
nodes = splitter.get_nodes_from_documents(documents)

# Convert nodes to dicts (kept in memory only; nothing is written to disk here)
nodes_dict_ex = [node.to_dict() for node in nodes]

In [None]:
# Number of chunks produced by the SimpleDirectoryReader-based pipeline
len(nodes_dict_ex)

In [None]:
# Index of the sample chunk to inspect; `n` is reused by later cells
n = 20
nodes_dict_ex[n]

#### Text loading and splitting - improved version

In [None]:
# Same directory and chunking parameters as the previous walkthrough section
input_dir = "./Data"
chunk_size = 500
chunk_overlap = 100

import fitz  # PyMuPDF
from llama_index.core import Document
import os

def extract_text_with_pymupdf(input_dir):
    """Extract the text of every PDF in ``input_dir`` as one Document per file.

    NOTE(review): this definition has the same name as the per-page extractor
    defined earlier in the notebook. Running this cell rebinds the name, so a
    later call to ``run_document_preprocessing_new`` will use this whole-file
    version instead.

    Args:
        input_dir: Directory whose direct entries are scanned. Only names
            ending in ``.pdf`` (case-insensitive) are opened.

    Returns:
        A list of ``Document`` objects ordered by file name. Each holds the
        concatenated text of all pages and the metadata keys ``file_path``,
        ``file_name`` and ``file_type``.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raise; the PDF handle is
        closed before the exception propagates.
    """
    documents = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order reproducible between runs.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(input_dir, filename)

        # Open PDF with PyMuPDF and make sure the handle is always released
        doc = fitz.open(file_path)
        try:
            # join() avoids repeatedly re-copying a growing string
            text = "".join(page.get_text() for page in doc)
        finally:
            doc.close()

        # Create LlamaIndex Document
        documents.append(Document(
            text=text,
            metadata={
                'file_path': file_path,
                'file_name': filename,
                'file_type': 'application/pdf'
            }
        ))

    return documents

# Extract the PDF text with the PyMuPDF-based function defined above
documents = extract_text_with_pymupdf(input_dir)

# Split documents into nodes
splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
nodes = splitter.get_nodes_from_documents(documents)

# Convert nodes to dicts (kept in memory only; nothing is written to disk here)
nodes_dict = [node.to_dict() for node in nodes]

In [None]:
# Number of chunks produced by the PyMuPDF-based pipeline
len(nodes_dict)

In [None]:
# Show the node at index n (set in an earlier cell) as an example
nodes_dict[n]

Comments:

- The improved version of the text extractor using PyMuPDF extracts readable text
- The number of vectors is reduced significantly, from 1,462 to 82
- The dense and the sparse vectors run smoother into the Qdrant vector store

#### Vector store

- Requires a Qdrant database instance running at http://localhost:6333
- The Qdrant database instance can easily be set up in a Docker container. This requires Docker to be available on the machine where the chatbot will be deployed
- The files in this repository have all required instructions to install Python dependencies and launch the Docker Qdrant instance

##### Dense and sparse vectors

In [None]:
# Read the connection settings from the environment and create a standalone
# Qdrant client for this walkthrough (same settings the Qdrant_vector_db class reads)
Qdrant_API_KEY = os.getenv('QDRANT_API_KEY')
Qdrant_URL = os.getenv('QDRANT_URL')
Collection_Name = os.getenv('QDRANT_COLLECTION_NAME')
qdrant_client = QdrantClient(
                            url=Qdrant_URL,
                            api_key=Qdrant_API_KEY)

In [None]:
# Set variables
# Short alias -> dense embedding model name (same mapping as Qdrant_vector_db.Embeddings)
Embeddings = {
        "sentence-transformer": "sentence-transformers/all-MiniLM-L6-v2",
        "snowflake": "Snowflake/snowflake-arctic-embed-m",
        "BAAI": "BAAI/bge-large-en-v1.5",
    }

# Values that index_to_qdrant() would otherwise receive as arguments
embedding_model_name="BAAI"
nodes_json_path="nodes.json"
collection_name="finance_documents"
# presumably the output dimension of the "BAAI" model -- TODO confirm
dense_vector_size=1024
sparse_model_name: str = "Qdrant/bm42-all-minilm-l6-v2-attentions"

In [None]:
# Load nodes
# NOTE: this rebinds `nodes` and `documents` from the earlier walkthrough cells;
# from here on `documents` is a list of chunk texts and `nodes` a list of dicts.
print("Loading nodes from JSON file...")
try:
    with open(nodes_json_path, 'r') as file:
        nodes = json.load(file)
    # Keep chunk texts and their metadata in two parallel lists
    documents = [node['text'] for node in nodes]
    metadata_list = [node['metadata'] for node in nodes]
    print(f"Loaded {len(nodes)} nodes from {nodes_json_path}")
except Exception as e:
    # Report the problem, then re-raise so the cell still fails visibly
    print(f"Error loading nodes: {e}")
    raise

In [None]:
# Explore documents: text of the chunk at index n
documents[n]

In [None]:
# Explore metadata: metadata dict of the chunk at index n
metadata_list[n]

In [None]:
# Create collection if not exists
# Uses the `qdrant_client` created earlier in this walkthrough, so the section
# does not depend on the Qdrant_vector_db class.
if not qdrant_client.collection_exists(collection_name):
    print(f"Creating collection '{collection_name}'...")
    qdrant_client.create_collection(
        collection_name=collection_name,
        # Named dense vector, compared with cosine distance
        vectors_config={
            'dense': models.VectorParams(
                size=dense_vector_size,
                distance=models.Distance.COSINE,
            )
        },
        # Named sparse vector whose index is configured with on_disk=False
        sparse_vectors_config={
            "sparse": models.SparseVectorParams(
                index=models.SparseIndexParams(on_disk=False),
            )
        }
    )
    print(f"Collection '{collection_name}' created successfully.")
else:
    print(f"Collection '{collection_name}' already exists.")

In [None]:
# Set TOKENIZERS_PARALLELISM=false in the kernel's own environment.
# (`!export ...` only changes a throwaway subshell and has no effect on the kernel.)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Test the API connection (should require your API key): request the list of
# collections over HTTP. The host is hard-coded to localhost:6333, and
# $QDRANT_API_KEY is left for the shell to expand.
!curl -H "api-key: $QDRANT_API_KEY" http://localhost:6333/collections

In [None]:
# Check how many points the collection holds.
# The client is used here because the REST path /collections/<name>/points does
# not answer GET requests; counting is a separate operation.
qdrant_client.count(collection_name=collection_name, exact=True)


Comments:
- The collection was created
- The collection is empty

In [None]:
# Initialize embedding models
# The dense model is looked up in the `Embeddings` mapping defined earlier in
# this walkthrough, so the section does not depend on the Qdrant_vector_db class.
dense_embedder = TextEmbedding(model_name=Embeddings[embedding_model_name])
sparse_embedder = SparseTextEmbedding(model_name=sparse_model_name)

In [None]:
# Understand the structure of zip(): each chunk text is paired with its metadata,
# and enumerate() supplies the running index. Only the first 50 pairs are printed.
for idx, (doc, metadata) in enumerate(zip(documents, metadata_list)):
    if idx < 50:
        print(f"Index: {idx}")
        print(f"Document: {doc[:50]}...")  # Print first 50 characters
        print(f"Metadata: {metadata}")
        print("-" * 80)
    else:
        # Stop at the first index past the preview window
        break

In [None]:
# Understand the dense vector: embed the first chunk only.
# embed() is given a one-element list; list(...)[0] takes the single result.
dense_embedding = list(dense_embedder.embed([documents[0]]))[0]
print(f"Dense Embedding: {dense_embedding[:10]}...")  # Print the first 10 values

In [None]:
# Understand the sparse vector: embed the first chunk only.
# The result exposes two parallel arrays, `indices` and `values`; the first 10
# entries of each are printed.
sparse_embedding = list(sparse_embedder.embed([documents[0]]))[0]
print(f"Sparse Embedding Indices: {sparse_embedding.indices[:10]}...")
print(f"Sparse Embedding Values: {sparse_embedding.values[:10]}...")

In [None]:
# Prepare points for upsert (this cell only builds the list; it does not send
# anything to Qdrant)
points = []
for idx, (doc, metadata) in enumerate(tqdm(zip(documents, metadata_list), 
                                        total=len(documents),
                                        desc="Indexing documents")):
    # Generate embeddings, one chunk at a time
    dense_embedding = list(dense_embedder.embed([doc]))[0]
    sparse_embedding = list(sparse_embedder.embed([doc]))[0]
    
    # Create sparse vector from the embedding's parallel indices/values arrays
    sparse_vector = models.SparseVector(
        indices=sparse_embedding.indices.tolist(),
        values=sparse_embedding.values.tolist()
    )
    
    # Create point structure: the id is the chunk's position in the list, the
    # vectors are stored under the names "dense" and "sparse", and the payload
    # holds the chunk text plus its metadata entries
    points.append(models.PointStruct(
        id=idx,
        vector={
            "dense": dense_embedding.tolist(),
            "sparse": sparse_vector
        },
        payload={
            "text": doc,
            **metadata
        }
    ))

In [None]:
# Explore points: how many were built
len(points)

Comments:
- There are as many points as there are text chunks
- Each point follows the `PointStruct` model from Qdrant's `http.models`

In [None]:
# Explore the point at index n (set in an earlier cell)
points[n]

In [None]:
# Peek at the dense vector stored on the sample point
points[n].vector['dense'][:10]  # First 10 elements of dense vector

In [None]:
# Length of the sample point's dense vector
len(points[n].vector['dense'])

In [None]:
# Chunk text stored in the sample point's payload
points[n].payload['text']