# Ingestion Pipeline

This notebook runs the code that ingests and transforms the documents that will be used by the RC Assist RAG system.

The following tasks are performed:

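- Transform all the documents to PDF
- (Optional) Extract the URLs found in the PDFs and convert the linked documents or web pages to PDF
- Parse the PDFs to extract their texts and images
- Post-process the extracted texts into Markdown
- Extract the tables from the post-processed texts and from the images, and generate image descriptions
- Chunk the texts, tables and image descriptions
- Calculate and store the embeddings
- Set up the vector stores and build the child and parent document indexes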


At the end, the vector stores for the child and parent documents have been created, along with their associated indexes.

Import the Dependencies

In [None]:
import os
import sys
sys.path.append("../")

from pathlib import Path

from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

from src.constants import (
    RAW_DOCUMENTS_PATH,
    PDF_CONVERTED_DOCUMENTS_PATH,
    TEXT_PATH,
    TEXT_TO_MARKDOWN_PATH,
    TABLE_PATH,
    TABLES_SUMMARIES_PATH,
    IMAGE_PATH,
    IMAGE_DESCRIPTION_PATH,
    TEXT_SEMANTIC_CHUNKS_PATH,
    TEXT_FINAL_CHUNKS_PATH,
    TABLE_CHUNKS_PATH,
    IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_PATH,
    TEXT_FINAL_CHUNKS_EMBEDDINGS_PATH,
    TABLE_CHUNKS_EMBEDDINGS_PATH,
    IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_EMBEDDINGS_PATH,
    MAX_SEMANTIC_CHUNK_SIZE,
    BUFFER_SIZE,
    SENTENCE_SPLIT_REGEX,
    SEMANTIC_CHUNKING_TYPE_TEXT,
    SEMANTIC_CHUNKING_THRESHOLD_TEXT,
    SEMANTIC_CHUNKING_TYPE_IMAGE_DESCRIPTION,
    SEMANTIC_CHUNKING_THRESHOLD_IMAGE_DESCRIPTION,
    FINAL_CHUNK_SIZE_TEXT,
    FINAL_OVERLAP_TEXT,
    FINAL_CHUNK_SIZE_TABLE,
    FINAL_OVERLAP_TABLE,
    PIXEL_THRESHOLD,
    VARIANCE_THRESHOLD,
    TEMPERATURE_DESCRIPTION,
    TEMPERATURE_PROCESSING,
    FILE_EXTENSIONS,
    TEXT_FINAL_CHUNKS_PATH,
    TEXT_SEMANTIC_CHUNKS_PATH,
    TEXT_FINAL_CHUNKS_EMBEDDINGS_PATH,
    TABLE_PATH,
    TABLE_CHUNKS_PATH,
    TABLE_CHUNKS_EMBEDDINGS_PATH,
    IMAGE_DESCRIPTION_PATH,
    IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_PATH,
    IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_EMBEDDINGS_PATH,
    VECTOR_STORE_PATH,
    INDEX_PATH,
    TEMPERATURE_RESPONSE
)
from src.doc_utils import (
    convert_to_pdfs,
    convert_pdf_document_urls_to_pdfs,
    process_pdfs,
    post_process_txts,
    extract_markdown_tables_from_markdown,
    extract_tables_generate_descriptions_from_images,
    chunk_markdown_tables,
    semantic_chunk_text_files,
    create_final_text_chunks,
    generate_text_embeddings,
    load_embeddings_with_associated_documents,
    create_documents,
    create_index,
    save_index
)

from src.utils.text_utils import get_token_count

from llama_index.core import (
    StorageContext,
    Settings
)
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from qdrant_client.http.exceptions import (
    ResponseHandlingException,
    UnexpectedResponse
)

import warnings
# Suppress every warning raised from here on, so the long cell outputs below
# are not cluttered. NOTE(review): this also hides deprecation warnings from
# the libraries used in this notebook.
warnings.filterwarnings("ignore")

from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment; the
# model configuration cell reads them through os.environ. As the last
# expression of the cell, the return value of load_dotenv is displayed.
load_dotenv(find_dotenv())



True

Set up the Models

In [2]:
# Model Config
# Connection settings of the chat / vision deployments, read from the
# environment. `model_info` is also passed as-is to the ingestion helpers
# called later in this notebook.
model_info = {
    "AZURE_OPENAI_RESOURCE": os.environ.get("AZURE_OPENAI_RESOURCE"),
    "AZURE_OPENAI_KEY": os.environ.get("AZURE_OPENAI_KEY"),
    "AZURE_OPENAI_MODEL_VISION": os.environ.get("AZURE_OPENAI_MODEL_VISION"),
    "AZURE_OPENAI_MODEL": os.environ.get("AZURE_OPENAI_MODEL"),
    "AZURE_OPENAI_API_VERSION": os.environ.get("AZURE_OPENAI_API_VERSION"),
}

# Connection settings of the embedding deployment.
embed_model_info = {
    # Prefer the embedding-specific resource (consistent with the dedicated key
    # and API version below). Fall back to the main resource so that existing
    # .env files that only define AZURE_OPENAI_RESOURCE keep working.
    "AZURE_OPENAI_EMBEDDING_MODEL_RESOURCE": (
        os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL_RESOURCE")
        or os.environ.get("AZURE_OPENAI_RESOURCE")
    ),
    "AZURE_OPENAI_EMBEDDING_MODEL_RESOURCE_KEY": os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL_RESOURCE_KEY"),
    "AZURE_OPENAI_EMBEDDING_MODEL": os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL"),
    "AZURE_OPENAI_EMBEDDING_MODEL_API_VERSION": os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL_API_VERSION"),
}

# Main LLM
# The deployment name is taken from the same variable as the model name.
model = AzureOpenAI(
    model=model_info["AZURE_OPENAI_MODEL"],
    deployment_name=model_info["AZURE_OPENAI_MODEL"],
    api_key=model_info["AZURE_OPENAI_KEY"],
    azure_endpoint=f"https://{model_info['AZURE_OPENAI_RESOURCE']}.openai.azure.com/",
    api_version=model_info["AZURE_OPENAI_API_VERSION"],
    temperature=TEMPERATURE_RESPONSE
)

# Embedding model
embed_model = AzureOpenAIEmbedding(
    model=embed_model_info["AZURE_OPENAI_EMBEDDING_MODEL"],
    deployment_name=embed_model_info["AZURE_OPENAI_EMBEDDING_MODEL"],
    api_key=embed_model_info["AZURE_OPENAI_EMBEDDING_MODEL_RESOURCE_KEY"],
    azure_endpoint=f"https://{embed_model_info['AZURE_OPENAI_EMBEDDING_MODEL_RESOURCE']}.openai.azure.com/",
    api_version=embed_model_info["AZURE_OPENAI_EMBEDDING_MODEL_API_VERSION"]
)

# Register both models as the LlamaIndex global defaults.
Settings.embed_model = embed_model
Settings.llm = model

### Step 1 - Ingest & Transform the Data

1.1. Convert the DOCX, PPTX and HTML documents to PDF

In [3]:
# Make sure the output directory exists. `exist_ok=True` replaces the separate
# `os.path.exists` check, which left a window between the check and the
# creation.
os.makedirs(PDF_CONVERTED_DOCUMENTS_PATH, exist_ok=True)

print("\nCoverting the Input Documents to PDF ...\n")

# Read the documents from RAW_DOCUMENTS_PATH and write the resulting PDFs to
# PDF_CONVERTED_DOCUMENTS_PATH; FILE_EXTENSIONS is passed as the extensions to
# handle.
convert_to_pdfs(
    input_dir=RAW_DOCUMENTS_PATH,
    output_dir=PDF_CONVERTED_DOCUMENTS_PATH,
    extensions=FILE_EXTENSIONS
)

print("\nAll Documents Were Converted Successly.\n")


Coverting the Input Documents to PDF ...


Processing the PDF files in /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.raw_documents...


All files have been converted to PDF.


All Documents Were Converted Successly.



1.2. Extract URLs from PDF, and convert associated documents or Webpages to PDF. (OPTIONAL)

Uncomment the cell below to run.

In [None]:
# Optional step, disabled by default (the whole cell is commented out).
# Two calls are provided: the first reads the PDFs from
# PDF_CONVERTED_DOCUMENTS_PATH, the second from RAW_DOCUMENTS_PATH. Both write
# their output to PDF_CONVERTED_DOCUMENTS_PATH.

# print("\nExtracting URLs From Input PDFs and Coverting the Webpages to PDFs ...\n")

# convert_pdf_document_urls_to_pdfs(
#     pdf_dir=PDF_CONVERTED_DOCUMENTS_PATH,
#     output_dir=PDF_CONVERTED_DOCUMENTS_PATH
# )

# convert_pdf_document_urls_to_pdfs(
#     pdf_dir=RAW_DOCUMENTS_PATH,
#     output_dir=PDF_CONVERTED_DOCUMENTS_PATH
# )

# print("\nURLs Were Extracted From Input PDFs and Coverted to PDFs ...\n")

1.3. Parse all the PDF Documents.

We extract the texts and images from the documents using the [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/document.html) library.

In [4]:
print("\nProcessing the PDF Documents ...\n")

# Directories whose PDFs are parsed, in order. Each one is processed with the
# same output directories and image-filtering thresholds.
# NOTE(review): step 1.1 writes its converted PDFs to
# PDF_CONVERTED_DOCUMENTS_PATH; while that entry stays commented out, those
# files are not passed to process_pdfs here - confirm this is intended.
pdf_source_dirs = [
    # Original PDFs
    RAW_DOCUMENTS_PATH,
    # PDFs created from URLs
    # Uncomment to run
    # PDF_CONVERTED_DOCUMENTS_PATH,
]

for pdf_source_dir in pdf_source_dirs:
    process_pdfs(
        pdf_dir=pdf_source_dir,
        text_output_dir=TEXT_PATH,
        image_output_dir=IMAGE_PATH,
        pixel_threshold=PIXEL_THRESHOLD,
        variance_threshold=VARIANCE_THRESHOLD,
    )

print("\nAll Documents Were Parsed Successly.\n")


Processing the PDF Documents ...


Processing the PDF files in /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.raw_documents...


Reading file 167- Demand Element - 2.2 Safety Stock.pdf
File 167- Demand Element - 2.2 Safety Stock.pdf has 7 pages.
Page 1 text saved to /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts/167- Demand Element - 2.2 Safety Stock.pdf_page_1.txt
Image 167- Demand Element - 2.2 Safety Stock.pdf_page_1_image_1.png was deleted because it does not meet acceptance criteria.
Image 167- Demand Element - 2.2 Safety Stock.pdf_page_1_image_1.png was deleted because it does not meet acceptance criteria.
Image 167- Demand Element - 2.2 Safety Stock.pdf_page_1_image_1.png was deleted because it does not meet acceptance criteria.
Image 167- Demand Element - 2.2 Safety Stock.pdf_page_1_image_1.png was deleted because it does not meet acceptance criteria.
Image 167- Demand Element - 2.2 Safety Stock.pdf_page_1_image_1.png was delet

1.4. Post-Process the Content Extracted from the PDF Files

The texts extracted with PyMuPDF need some cleaning and reformatting to be easily readable by humans.

In [5]:
print("\nPost-Processing the Extracted Texts ...\n")

# Arguments of the post-processing pass: the .txt files are read from
# TEXT_PATH and the results are written to TEXT_TO_MARKDOWN_PATH. The model
# settings and the processing temperature are forwarded to the helper.
post_processing_options = dict(
    txt_dir=TEXT_PATH,
    text_output_dir=TEXT_TO_MARKDOWN_PATH,
    temperature=TEMPERATURE_PROCESSING,
    model_info=model_info,
)
post_process_txts(**post_processing_options)

print("\nPost-Processing Ended Successly.\n")


Post-Processing the Extracted Texts ...


Processing the TXT files in /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts...


Reading file 167- Demand Element - 2.2 Safety Stock.pdf_page_3.txt

Calling OpenAI APIs with 2 messages - Model: gpt-4o - Endpoint: https://rc-assist-westus3.openai.azure.com/openai/

Formatted content saved in file: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts_to_markdow/167- Demand Element - 2.2 Safety Stock.pdf_page_3.md

Reading file 167- Demand Element - 2.2 Safety Stock.pdf_page_5.txt

Calling OpenAI APIs with 2 messages - Model: gpt-4o - Endpoint: https://rc-assist-westus3.openai.azure.com/openai/

Formatted content saved in file: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts_to_markdow/167- Demand Element - 2.2 Safety Stock.pdf_page_5.md

Reading file 167- Demand Element - 2.2 Safety Stock.pdf_page_2.txt

Calling OpenAI APIs with 2 messages - Mo

1.5. Extract Tables from Post-Processed Texts.

The remaining content is saved as plain text into TXT files.

In [6]:
print("\nExtracting Tables From Formatted Texts ...\n")

# The Markdown files are read from TEXT_TO_MARKDOWN_PATH and the tables go to
# TABLE_PATH. Note that txt_dir is TEXT_PATH, i.e. the same directory the
# post-processing step read its .txt input from.
table_extraction_options = dict(
    md_dir=TEXT_TO_MARKDOWN_PATH,
    txt_dir=TEXT_PATH,
    tables_dir=TABLE_PATH,
)
extract_markdown_tables_from_markdown(**table_extraction_options)

print("\nTables Extraction Ended Successly.\n")


Extracting Tables From Formatted Texts ...


Processing the txt files in /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts_to_markdow...


Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_4.md
Saved table to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.tables/.tables/167- Demand Element - 2.2 Safety Stock.pdf_page_4_table_1.md
Plain text with table saved table to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts/167- Demand Element - 2.2 Safety Stock.pdf_page_4.txt

Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_6.md
Plain text with table saved table to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts/167- Demand Element - 2.2 Safety Stock.pdf_page_6.txt

Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_5.md
Plain text with table saved table to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/

1.6. Extract Tables From / Generate Descriptions of the Images

In [7]:
print("\nExtracting Tables From Images and Generating Image Descriptions ...\n")

# The images are read from IMAGE_PATH. Tables are written to TABLE_PATH (the
# same directory used for the tables extracted from the Markdown texts) and
# the descriptions to IMAGE_DESCRIPTION_PATH.
image_analysis_options = dict(
    images_dir=IMAGE_PATH,
    tables_dir=TABLE_PATH,
    descriptions_dir=IMAGE_DESCRIPTION_PATH,
    temperature=TEMPERATURE_DESCRIPTION,
    model_info=model_info,
)
extract_tables_generate_descriptions_from_images(**image_analysis_options)

print("\nTables and Images Descriptions Creation Ended Successly.\n")


Extracting Tables From Images and Generating Image Descriptions ...

Processing the image files in /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.images/.images...


Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_5_image_1.png
Status: Image was successfully explained, with Status Code: 200
Saved table to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.tables/.tables/167- Demand Element - 2.2 Safety Stock.pdf_page_5_image_1_table_1.md
Saved image description to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.images/.images_descriptions/167- Demand Element - 2.2 Safety Stock.pdf_page_5_image_1_description.txt

Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_3_image_2.png
Status: Image was successfully explained, with Status Code: 200
Saved table to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.tables/.tables/167- Demand Element - 2.2 Safety Stock.pdf_page_3_imag

1.7. Create Data Chunks

Chunks are created for the texts, the table summaries and the image descriptions.

In [8]:
# Table chunking: the tables are chunked by rows.

print("\nPerforming Tables Chunking ...\n")

# Input tables come from TABLE_PATH; summaries are written to
# TABLES_SUMMARIES_PATH and chunks to TABLE_CHUNKS_PATH. `cols` is left unset
# (None); chunk size and overlap come from the table-specific constants.
table_chunking_options = dict(
    tables_dir=TABLE_PATH,
    tables_summaries_dir=TABLES_SUMMARIES_PATH,
    cols=None,
    n_tokens=FINAL_CHUNK_SIZE_TABLE,
    overlap=FINAL_OVERLAP_TABLE,
    temperature=TEMPERATURE_DESCRIPTION,
    model_info=model_info,
    tables_chunks_dir=TABLE_CHUNKS_PATH,
)
chunk_markdown_tables(**table_chunking_options)

print("\nTables Chunking Ended Successly.\n")


Performing Tables Chunking ...


Processing the tables in /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.tables/.tables...


Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_6_image_2_table_3.md

Calling OpenAI APIs with 2 messages - Model: gpt-4o - Endpoint: https://rc-assist-westus3.openai.azure.com/openai/
[0m
Messages: [{'role': 'system', 'content': 'You are a helpful assistant, who helps the user with their query. You are designed to output JSON.'}, {'role': 'user', 'content': '\nYou are a Data Engineer resonsible for reforming and preserving the quality of Markdown tables. A table will be passed to you in the form of a Markdown string. You are designed to output JSON. \n\nYour task is to extract the column names of the header of the table from the Markdown string in the form of a comma-separated list. If the column names do exist, please return them verbatim word-for-word with no change, except fixing format or alignment issues (extra spaces

In [9]:
# Text chunking runs in two passes: semantic chunks first, then final chunks
# built from those semantic chunks.

print("\nPerforming Texts Chunking ...\n")

# Pass 1 - semantic chunks: TEXT_PATH -> TEXT_SEMANTIC_CHUNKS_PATH
text_semantic_chunking_options = dict(
    documents_dir=TEXT_PATH,
    embed_model_info=embed_model_info,
    max_chunk_size=MAX_SEMANTIC_CHUNK_SIZE,
    documents_chunks_dir=TEXT_SEMANTIC_CHUNKS_PATH,
    buffer_size=BUFFER_SIZE,
    breakpoint_threshold_type=SEMANTIC_CHUNKING_TYPE_TEXT,
    breakpoint_threshold_amount=SEMANTIC_CHUNKING_THRESHOLD_TEXT,
    sentence_split_regex=SENTENCE_SPLIT_REGEX,
    verbose=True,
)
semantic_chunk_text_files(**text_semantic_chunking_options)

# Pass 2 - final chunks: TEXT_SEMANTIC_CHUNKS_PATH -> TEXT_FINAL_CHUNKS_PATH
text_final_chunking_options = dict(
    documents_dir=TEXT_SEMANTIC_CHUNKS_PATH,
    documents_chunks_dir=TEXT_FINAL_CHUNKS_PATH,
    chunk_size=FINAL_CHUNK_SIZE_TEXT,
    overlap=FINAL_OVERLAP_TEXT,
)
create_final_text_chunks(**text_final_chunking_options)

print("\nTexts Chunking Ended Successly.\n")


Performing Texts Chunking ...


Processing the files in /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts...


Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_3.txt
Saved chunk to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts_semantic_chunks/167- Demand Element - 2.2 Safety Stock.pdf_page_3_semantic_chunk_1.txt

Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_5.txt
Saved chunk to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts_semantic_chunks/167- Demand Element - 2.2 Safety Stock.pdf_page_5_semantic_chunk_1.txt

Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_2.txt
Saved chunk to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts_semantic_chunks/167- Demand Element - 2.2 Safety Stock.pdf_page_2_semantic_chunk_1.txt

Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_7.txt
Saved chunk to: /h

In [10]:
# Image descriptions: semantic chunking only, using the image-description
# specific threshold type and amount.

print("\nPerforming Image Descriptions Chunking ...\n")

# IMAGE_DESCRIPTION_PATH -> IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_PATH
image_description_chunking_options = dict(
    documents_dir=IMAGE_DESCRIPTION_PATH,
    embed_model_info=embed_model_info,
    max_chunk_size=MAX_SEMANTIC_CHUNK_SIZE,
    documents_chunks_dir=IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_PATH,
    buffer_size=BUFFER_SIZE,
    breakpoint_threshold_type=SEMANTIC_CHUNKING_TYPE_IMAGE_DESCRIPTION,
    breakpoint_threshold_amount=SEMANTIC_CHUNKING_THRESHOLD_IMAGE_DESCRIPTION,
    sentence_split_regex=SENTENCE_SPLIT_REGEX,
    verbose=True,
)
semantic_chunk_text_files(**image_description_chunking_options)

print("\nImage Descriptions Chunking Ended Successly.\n")


Performing Image Descriptions Chunking ...


Processing the files in /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.images/.images_descriptions...


Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_5_image_2_description.txt
Saved chunk to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.images/.images_descriptions_semantic_chunks/167- Demand Element - 2.2 Safety Stock.pdf_page_5_image_2_description_semantic_chunk_1.txt

Reading file: 167- Demand Element - 2.2 Safety Stock.pdf_page_5_image_1_description.txt
Saved chunk to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.images/.images_descriptions_semantic_chunks/167- Demand Element - 2.2 Safety Stock.pdf_page_5_image_1_description_semantic_chunk_1.txt
Saved chunk to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.images/.images_descriptions_semantic_chunks/167- Demand Element - 2.2 Safety Stock.pdf_page_5_image_1_description_sema

1.8. Calculate and Store the Embeddings.

Embeddings are created for the text, table and image-description chunks.

In [11]:
# One entry per content type, processed in this order: texts, tables, image
# descriptions. Each entry holds
# (start message, chunks directory, embeddings directory, end message).
embedding_jobs = [
    # Texts
    (
        "\nCalculating Texts Embeddings.\n",
        TEXT_FINAL_CHUNKS_PATH,
        TEXT_FINAL_CHUNKS_EMBEDDINGS_PATH,
        "\nTexts Embeddings Calculation Ended Successly.\n",
    ),
    # Tables
    (
        "\nCalculating Tables Embeddings.\n",
        TABLE_CHUNKS_PATH,
        TABLE_CHUNKS_EMBEDDINGS_PATH,
        "\nTables Embeddings Calculation Ended Successly.\n",
    ),
    # Images Descriptions
    (
        "\nCalculating Image Descriptions Embeddings.\n",
        IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_PATH,
        IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_EMBEDDINGS_PATH,
        "\nImage Descriptions Embeddings Calculation Ended Successly.\n",
    ),
]

for start_message, chunks_dir, chunk_embeddings_dir, end_message in embedding_jobs:
    print(start_message)

    # `text_emdeddings_dir` is the keyword name expected by the helper.
    generate_text_embeddings(
        documents_dir=chunks_dir,
        text_emdeddings_dir=chunk_embeddings_dir,
        embed_model_info=embed_model_info,
    )

    print(end_message)

print("\nEnd of Data Preparation.\n")


Calculating Texts Embeddings.


Processing the files in /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.extracted/.texts/.texts_final_chunks...

Calculating embedding for : 167- Demand Element - 2.2 Safety Stock.pdf_page_6_semantic_chunk_1_final_chunk_1.txt...
Embedding has been saved in file: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.embeddings/.texts/.texts_final_chunks/167- Demand Element - 2.2 Safety Stock.pdf_page_6_semantic_chunk_1_final_chunk_1_embedding.json.

Calculating embedding for : 167- Demand Element - 2.2 Safety Stock.pdf_page_5_semantic_chunk_1_final_chunk_1.txt...
Embedding has been saved in file: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.embeddings/.texts/.texts_final_chunks/167- Demand Element - 2.2 Safety Stock.pdf_page_5_semantic_chunk_1_final_chunk_1_embedding.json.

Calculating embedding for : 167- Demand Element - 2.2 Safety Stock.pdf_page_4_semantic_chunk_1_final_chunk_2.txt...
Embedding has been saved in file: 

### Step 2 - Create the Indexes

We create the child and parent documents indexes.

2.1. Set up the vector stores.

In [12]:
# Make sure the on-disk locations of the vector store and of the indexes exist.
for directory in (VECTOR_STORE_PATH, INDEX_PATH):
    os.makedirs(directory, exist_ok=True)

# Initialize Qdrant (only do this once at indexing time)
print("\nInitialize Qdrant Client, Collections and Vector Stores...\n")

# Qdrant client created with a local `path` (VECTOR_STORE_PATH).
client = QdrantClient(path=VECTOR_STORE_PATH)

# Determine the embedding dimension by embedding a throwaway string.
embed_dim = len(embed_model.get_text_embedding("test input"))

# Define collection names
collection_names = [
    "child_documents",
    "parent_documents"
]

# Create each collection only if it does not exist yet.
# If data is static and unchanged, skipping recreation speeds up startup.
for collection_name in collection_names:
    try:
        # Check if the collection already exists
        existing_collections = [c.name for c in client.get_collections().collections]
        if collection_name not in existing_collections:
            client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(
                    size=embed_dim, distance=Distance.COSINE
                ),
            )
            print(
                f"Collection '{collection_name}' created with vector dimension {embed_dim}."
            )
        else:
            print(
                f"Collection '{collection_name}' already exists; skipping recreation."
            )
    # The messages below are f-strings: print() does not interpolate
    # logging-style "%s" placeholders, so the previous calls printed the raw
    # format string followed by the arguments.
    except ResponseHandlingException as e:
        print(f"Failed to create collection '{collection_name}' due to response handling error: {e}")
    except UnexpectedResponse as e:
        print(f"Failed to create collection '{collection_name}' due to unexpected response: {e}")
    except OSError as e:  # IOError is an alias of OSError in Python 3
        print(f"An unexpected error occurred while creating collection '{collection_name}': {e}")

print("\nQdrant Collections setup completed.")

# Initialize Qdrant Vector Stores
children_vector_store = QdrantVectorStore(
    client=client,
    collection_name="child_documents"
)
parents_vector_store = QdrantVectorStore(
    client=client,
    collection_name="parent_documents"
)

# Define storage context
# NOTE(review): the Qdrant stores are registered under custom keys on a
# default StorageContext. Whether create_index reads these keys or the
# context's default vector store is not visible from this notebook - confirm
# that the indexes are really written to Qdrant.
children_storage_context = StorageContext.from_defaults()
children_storage_context.vector_stores["children_vector_store"] = children_vector_store

parents_storage_context = StorageContext.from_defaults()
parents_storage_context.vector_stores["parents_vector_store"] = parents_vector_store

print("\nVector Stores have been initialized.")


Initialize Qdrant Client, Collections and Vector Stores...

Collection 'child_documents' created with vector dimension 3072.
Collection 'parent_documents' created with vector dimension 3072.

Qdrant Collections setup completed.

Vector Stores have been initialized.


2.2. Load the embeddings with their associated metadata.

In [13]:
print("\nLoad the embeddings with their metadata...\n")

# Each call returns the embeddings together with the child and parent records
# found in the given directories.

# Text embeddings
(
    text_embeddings,
    child_text_with_metadata,
    parent_text_with_metadata,
) = load_embeddings_with_associated_documents(
    embeddings_dir=TEXT_FINAL_CHUNKS_EMBEDDINGS_PATH,
    child_docs_dir=TEXT_FINAL_CHUNKS_PATH,
    parent_docs_dir=TEXT_SEMANTIC_CHUNKS_PATH,
)

# Table embeddings
(
    table_embeddings,
    child_table_with_metadata,
    parent_table_with_metadata,
) = load_embeddings_with_associated_documents(
    embeddings_dir=TABLE_CHUNKS_EMBEDDINGS_PATH,
    child_docs_dir=TABLE_CHUNKS_PATH,
    parent_docs_dir=TABLE_PATH,
)

# Image description embeddings
(
    image_embeddings,
    child_image_with_metadata,
    parent_image_with_metadata,
) = load_embeddings_with_associated_documents(
    embeddings_dir=IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_EMBEDDINGS_PATH,
    child_docs_dir=IMAGE_DESCRIPTION_SEMANTIC_CHUNKS_PATH,
    parent_docs_dir=IMAGE_DESCRIPTION_PATH,
)

# Split every record list into its "text" values and its "metadata" values,
# first for the children, then for the parents.
child_record_groups = (
    child_text_with_metadata,
    child_table_with_metadata,
    child_image_with_metadata,
)
parent_record_groups = (
    parent_text_with_metadata,
    parent_table_with_metadata,
    parent_image_with_metadata,
)

child_texts, child_tables, child_images = (
    [record["text"] for record in records] for records in child_record_groups
)
child_texts_metadata, child_tables_metadata, child_images_metadata = (
    [record["metadata"] for record in records] for records in child_record_groups
)

parent_texts, parent_tables, parent_images = (
    [record["text"] for record in records] for records in parent_record_groups
)
parent_texts_metadata, parent_tables_metadata, parent_images_metadata = (
    [record["metadata"] for record in records] for records in parent_record_groups
)


Load the embeddings with their metadata...

Loading embeddings from directory: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.embeddings/.texts/.texts_final_chunks ...


Loading the embedding file : 167- Demand Element - 2.2 Safety Stock.pdf_page_3_semantic_chunk_1_final_chunk_4_embedding.json...

Loading the embedding file : 167- Demand Element - 2.2 Safety Stock.pdf_page_2_semantic_chunk_1_final_chunk_4_embedding.json...

Loading the embedding file : 167- Demand Element - 2.2 Safety Stock.pdf_page_1_semantic_chunk_1_final_chunk_1_embedding.json...

Loading the embedding file : 167- Demand Element - 2.2 Safety Stock.pdf_page_2_semantic_chunk_1_final_chunk_2_embedding.json...

Loading the embedding file : 167- Demand Element - 2.2 Safety Stock.pdf_page_3_semantic_chunk_1_final_chunk_3_embedding.json...

Loading the embedding file : 167- Demand Element - 2.2 Safety Stock.pdf_page_4_semantic_chunk_1_final_chunk_4_embedding.json...

Loading the embedding file : 167- Demand E

2.3. Create the Child and Parent Document instances for Indexing

In [16]:
def _dedupe_by_name(documents):
    """Return `documents` without repeated metadata["name"] values.

    The first document seen for each name is kept and the input order is
    preserved. Documents whose metadata has no "name" (or a None name) cannot
    be identified as duplicates of one another, so all of them are kept
    instead of being collapsed into a single document.
    """
    unique_documents = []
    seen_names = set()  # metadata["name"] values already kept
    for doc in documents:
        name_val = doc.metadata.get("name")
        if name_val is None:
            unique_documents.append(doc)
        elif name_val not in seen_names:
            seen_names.add(name_val)
            unique_documents.append(doc)
    return unique_documents


# Create the Child Document instances
child_documents_from_texts = create_documents(child_texts, child_texts_metadata)
child_documents_from_tables = create_documents(child_tables, child_tables_metadata)
child_documents_from_images = create_documents(child_images, child_images_metadata)

# Build a new list so that `child_documents_from_texts` is not extended in
# place through an alias.
raw_child_documents = [
    *child_documents_from_texts,
    *child_documents_from_tables,
    *child_documents_from_images,
]

# Remove duplicates
child_documents = _dedupe_by_name(raw_child_documents)

# Create the Parent Document instances
parent_documents_from_texts = create_documents(parent_texts, parent_texts_metadata)
parent_documents_from_tables = create_documents(parent_tables, parent_tables_metadata)
parent_documents_from_images = create_documents(parent_images, parent_images_metadata)

raw_parent_documents = [
    *parent_documents_from_texts,
    *parent_documents_from_tables,
    *parent_documents_from_images,
]

# Remove duplicates
parent_documents = _dedupe_by_name(raw_parent_documents)

2.4. Build and Store the Child and Parent Documents Indexes

In [17]:
print("\nCreate VectorStoreIndex Instance for Child Documents ...")

# Child documents: index_type="vector", built with the children storage context.
child_index_options = dict(
    documents=child_documents,
    storage_context=children_storage_context,
    index_name="child_documents",
    index_type="vector",
)
child_documents_index = create_index(**child_index_options)

print("\nCreate KeywordTableIndex Instance for Parent Documents ...")

# Parent documents: index_type="keyword", built with the parents storage context.
parent_index_options = dict(
    documents=parent_documents,
    storage_context=parents_storage_context,
    index_name="parent_documents",
    index_type="keyword",
)
parent_documents_index = create_index(**parent_index_options)

print("\nSave the Indexes to Disk...\n")

# Each index is saved in its own sub-directory of INDEX_PATH.
base_dir = Path(INDEX_PATH)
indexes_to_save = (
    (child_documents_index, ".child_documents"),
    (parent_documents_index, ".parent_documents"),
)
for built_index, index_subdir in indexes_to_save:
    save_index(built_index, base_dir / index_subdir)

print(f"\nThe indexes have been saved to disk successfully to: {base_dir}.\n")


Create VectorStoreIndex Instance for Child Documents ...

Create KeywordTableIndex Instance for Parent Documents ...

Save the Indexes to Disk...


The indexes have been saved to disk successfully to: /home/ariel/projects/multimodal-rag-with-evaluation-1/data/.qdrant_indexes.

