In [2]:
import os
import json
from pprint import pprint as pp

from ragtor.rag import set_up_rag_db, load_vector_db
from ragtor.config import PDFS_PATH, PDFS_LOADED_ID_FILE_PATH, VECTOR_DB_PATH, EMBEDDINGS_MODEL, PROMPTS, SUMMARY_OLLAMA_MODEL

from ragtor.chunk_class import Chunk
from ragtor.doc_class import Doc, compute_doc_emd_chunks
from ragtor.clustering_utils import process_doc_clusters

import ollama
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
vector_store = load_vector_db(VECTOR_DB_PATH)

In [5]:
from ragtor.rag import raptor_search

help(raptor_search)

Help on function raptor_search in module ragtor.rag:

raptor_search(vector_store: langchain_community.vectorstores.faiss.FAISS, query: str, embeddings_model: str = 'Snowflake/snowflake-arctic-embed-s', k: int = 3) -> Union[List[langchain_core.documents.base.Document], List]



In [None]:
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from typing import List

def raptor_search(vector_store:        FAISS,
                    query:              str, 
                    embeddings_model:    str = EMBEDDINGS_MODEL, 
                    k:                  int = 3) -> List[Document] | List:

    filter_args = {}
    filter_args["chunk_type"] = "cluster_summary"

    print(filter_args)
    results_cluster = vector_store.similarity_search_with_relevance_scores(
        query,
        k=10,
        filter=filter_args)

    print(results_cluster)
    
    try:
        metadata = results_cluster[0].metadata
        metadata = {k:v for k,v in metadata.items()}
        metadata["chunk_type"] = "sents"

        results_sents = vector_store.similarity_search(
        query,
        k=k,
        filter=metadata)

    except Exception as e:
        print("Raptor search not successful")
        print(e)
        
        return []
        
    return results_cluster + results_sents


In [20]:
query="tell me about the inter miami match"

In [29]:
results_cluster = vector_store.similarity_search_with_relevance_scores(
        query,
        k=10,
        # filter={"chunk_type": "cluster_summary"}
        )
results_cluster[0][0].metadata

{'chunk_doc_source': 'Al Ahly v Inter Miami _ Highlights, report and standings.pdf',
 'chunk_source': 'Unknown',
 'chunk_type': 'sent',
 'chunking_emb_model': 'Snowflake/snowflake-arctic-embed-s'}

In [None]:
raptor_search(vector_store=vector_store,
            query=query)

{'chunking_emb_model': 'Snowflake/snowflake-arctic-embed-s', 'chunk_type': 'cluster_summary'}
[]
Raptor search not successful
list index out of range


[]

In [31]:
docs = vector_store.docstore._dict.values()

# Extract unique metadata attributes
all_keys = set()
for doc in docs:
    if hasattr(doc, 'metadata'):
        all_keys.update(doc.metadata.keys())

print("Unique metadata keys:", all_keys)


Unique metadata keys: {'chunk_doc_source', 'chunk_source', 'chunk_type', 'chunking_emb_model'}


In [39]:
from collections import defaultdict

metadata_values = defaultdict(set)

for doc in docs:
    if hasattr(doc, "metadata") and doc.metadata:
        for key, value in doc.metadata.items():
            metadata_values[key].add(value)

# Display all unique metadata values
for key, values in metadata_values.items():
    print(f"Metadata field: {key}")
    print(f"Unique values: {values}\n")


Metadata field: chunk_doc_source
Unique values: {'Al Ahly v Inter Miami _ Highlights, report and standings.pdf'}

Metadata field: chunk_source
Unique values: {'cluster_0', 'cluster_1', 'cluster_2', 'Unknown'}

Metadata field: chunk_type
Unique values: {'cluster_text', 'cluster_summary', 'chunk', 'sents', 'sent'}

Metadata field: chunking_emb_model
Unique values: {'Snowflake/snowflake-arctic-embed-s'}

