# Multi Document Search

In [None]:
import sys

sys.path.insert(0, '..')

import weaviate.classes as wvc
import weaviate
from collections import defaultdict
from app.schemas import ChunkWithScore, DocumentSearchResult
from weaviate.classes.query import Filter, GeoCoordinate, MetadataQuery, QueryReference, GroupBy


query = "l'île de Pâques"
client = weaviate.connect_to_local()
# print("SEARCH_MULTI_DOCUMENTS 0 ", query)
document_chunk = client.collections.get("DocumentChunk")
response = document_chunk.query.hybrid(
    query=query,
    return_metadata=wvc.query.MetadataQuery(score=True),
    query_properties=["text"],
    return_references=[QueryReference(link_on="belongsToDocument", return_properties=["document_id", "local_path", "original_public_path", "media_name", "max_score", "min_score"])]
)
# print("SEARCH_MULTI_DOCUMENTS 1 ", response)
client.close()

# Assume response is the result from the above query
grouped_results = defaultdict(list)
for chunk in response.objects:
    doc_ref = chunk.references['belongsToDocument'].objects[0].properties
    grouped_results[doc_ref['document_id']].append(chunk)

# Now `grouped_results` contains your data grouped by 'document_id'
# for document_id, chunks in grouped_results.items():
#     print(f"Media id: {document_id}, || Chunks: {chunks}")

documentsSearchResponse = []

for document_id, group in grouped_results.items():  # View by group
    # max_score = min(chunk for chunk in group)#group.max_score
    # min_score = min(chunk for chunk in group)#group.min_score
    currentDocumentChunksWithDistance = []
    for _documentChunk in group:
        score = _documentChunk.metadata.score
        cuurrentChunkWithScore = ChunkWithScore.model_validate({**_documentChunk.properties, "score": score})
        currentDocumentChunksWithDistance.append(cuurrentChunkWithScore)
    
    documentProperties = _documentChunk.references['belongsToDocument'].objects[0].properties
    
    documentSearchResult = DocumentSearchResult(
        document_id=documentProperties['document_id'],
        local_path=documentProperties['local_path'],
        original_public_path=documentProperties['original_public_path'],
        media_name=documentProperties['media_name'],
        min_score=min(chunk.score for chunk in currentDocumentChunksWithDistance),
        max_score=max(chunk.score for chunk in currentDocumentChunksWithDistance),
        chunks=currentDocumentChunksWithDistance
    )
    documentsSearchResponse.append(documentSearchResult)
for doc in documentsSearchResponse:
    print(doc.media_name)
    for chunk in doc.chunks:
        print(f"    -->  {chunk.score} | {chunk.text}")


# Single Document Search

In [None]:
import sys

sys.path.insert(0, '..')

from weaviate.util import get_valid_uuid
import weaviate.classes as wvc
import weaviate
from collections import defaultdict
from app.schemas import ChunkWithScore, DocumentSearchResult
from weaviate.classes.query import Filter, GeoCoordinate, MetadataQuery, QueryReference, GroupBy


query = "l'île de Paques"
client = weaviate.connect_to_local()
# print("SEARCH_MULTI_DOCUMENTS 0 ", query)
document_chunk = client.collections.get("DocumentChunk")
response = document_chunk.query.hybrid(
    query=query,
    return_metadata=wvc.query.MetadataQuery(score=True),
    query_properties=["text"],
    filters=Filter.by_ref(link_on="belongsToDocument").by_property("document_id").equal(get_valid_uuid(uuid="9e80a4c3-7139-4f66-b450-e94cbbf06e2b")),
    return_references=[QueryReference(link_on="belongsToDocument", return_properties=["document_id", "local_path", "original_public_path", "media_name", "max_score", "min_score"])]
)
# print("SEARCH_MULTI_DOCUMENTS 1 ", response)
client.close()


# Assume response is the result from the above query
grouped_results = defaultdict(list)
for chunk in response.objects:
    doc_ref = chunk.references['belongsToDocument'].objects[0].properties
    grouped_results[doc_ref['document_id']].append(chunk)

# Now `grouped_results` contains your data grouped by 'document_id'
# for document_id, chunks in grouped_results.items():
#     print(f"Media id: {document_id}, || Chunks: {chunks}")

documentsSearchResponse = []

for document_id, group in grouped_results.items():  # View by group
    # max_score = min(chunk for chunk in group)#group.max_score
    # min_score = min(chunk for chunk in group)#group.min_score
    currentDocumentChunksWithDistance = []
    for _documentChunk in group:
        score = _documentChunk.metadata.score
        cuurrentChunkWithScore = ChunkWithScore.model_validate({**_documentChunk.properties, "score": score})
        currentDocumentChunksWithDistance.append(cuurrentChunkWithScore)
    
    documentProperties = _documentChunk.references['belongsToDocument'].objects[0].properties
    
    documentSearchResult = DocumentSearchResult(
        document_id=documentProperties['document_id'],
        local_path=documentProperties['local_path'],
        original_public_path=documentProperties['original_public_path'],
        media_name=documentProperties['media_name'],
        min_score=min(chunk.score for chunk in currentDocumentChunksWithDistance),
        max_score=max(chunk.score for chunk in currentDocumentChunksWithDistance),
        chunks=currentDocumentChunksWithDistance
    )
    documentsSearchResponse.append(documentSearchResult)
for doc in documentsSearchResponse:
    print(doc.media_name)
    for chunk in doc.chunks:
        print(f"    -->  {chunk.score} | {chunk.text}")


# Slack answer

In [10]:
import json
import sys

sys.path.insert(0, '..')

import weaviate.classes as wvc
import weaviate
from collections import defaultdict
from app.schemas import ChunkWithScore, DocumentSearchResult
from weaviate.classes.query import Filter, GeoCoordinate, MetadataQuery, QueryReference, GroupBy
from weaviate.classes.query import QueryReference, Filter, GroupBy

query = "l'île de Pâques"
client = weaviate.connect_to_local()
document_collection = client.collections.get("Document")
response = document_collection.query.hybrid(
    query="document",
    group_by=GroupBy(prop="media_name", objects_per_group=100, number_of_groups=100),
    #filters=Filter.by_property("document_id").equal("Document1"),
    return_metadata=wvc.query.MetadataQuery(score=True, distance=True, certainty=True),
    return_references=QueryReference(
        link_on="hasChunks",
        return_properties=["text"],
        return_metadata=wvc.query.MetadataQuery(score=True, distance=True, certainty=True),
    ),
)
client.close()

print(len(response.groups))

for document_id, group in response.groups.items():  # View by group
    max_distance = group.max_distance
    min_distance = group.min_distance
    currentDocumentChunksWithDistance = []
    # print(f"Group {document_id} has {len(group.objects)} objects")
    print("DEBUGGGG -------------- ", group)
    # print("DEBUGGGG -------------- ", max_distance)
    # print("DEBUGGGG -------------- ", min_distance)
    # print("DEBUGGGG -------------- ", group)
    
    properties = group.objects[0].properties
    DocumentSearchResult.model_validate({
        **object.properties,
        # "chunks": ChunkWithScore
        "max_score": max_distance,
        "min_score": min_distance,
    })
    

4
DEBUGGGG --------------  Group(name='Ile de Paques : la théorie de l’effondrement précolonial à nouveau démentie | Actu de science', min_distance=-4.76837158203125e-07, max_distance=-4.76837158203125e-07, number_of_objects=1, objects=[GroupByObject(uuid=_WeaviateUUIDInt('1e1c4482-a564-4fc1-aa4c-52e51af53a87'), metadata=GroupByMetadataReturn(distance=-4.76837158203125e-07), properties={'local_path': '/home/erwan/Desktop/clients/ScienceInfuse/server/notebooks/../documents/youtube/9e80a4c3-7139-4f66-b450-e94cbbf06e2b.mp4', 'chunks': [{'end_offset': 18.0, 'media_type': 'youtube', 'start_offset': 2.0, 'text': "Une croyance encore bien ancrée dans les esprits laisse penser que jadis, les habitants de l'île de Pâques, de son nom autochtone Rapanoui, auraient surexploité leur environnement naturel, provoquant ainsi eux-mêmes leur disparition. Mais cette hypothèse a été maintes fois démentie et une nouvelle étude enfonce le clou."}, {'start_offset': 25.0, 'end_offset': 158.0, 'text': "En effe

ValidationError: 2 validation errors for DocumentSearchResult
chunks.0.score
  Field required [type=missing, input_value={'media_type': 'youtube',....", 'start_offset': 9.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.7/v/missing
chunks.1.score
  Field required [type=missing, input_value={'text': "Les IRM du cerv...e', 'end_offset': 146.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.7/v/missing

In [16]:
group.objects[0]

GroupByObject(uuid=_WeaviateUUIDInt('1e1c4482-a564-4fc1-aa4c-52e51af53a87'), metadata=GroupByMetadataReturn(distance=-4.76837158203125e-07), properties={'local_path': '/home/erwan/Desktop/clients/ScienceInfuse/server/notebooks/../documents/youtube/9e80a4c3-7139-4f66-b450-e94cbbf06e2b.mp4', 'chunks': [{'end_offset': 18.0, 'media_type': 'youtube', 'start_offset': 2.0, 'text': "Une croyance encore bien ancrée dans les esprits laisse penser que jadis, les habitants de l'île de Pâques, de son nom autochtone Rapanoui, auraient surexploité leur environnement naturel, provoquant ainsi eux-mêmes leur disparition. Mais cette hypothèse a été maintes fois démentie et une nouvelle étude enfonce le clou."}, {'start_offset': 25.0, 'end_offset': 158.0, 'text': "En effet, l'île n'a jamais accueilli assez d'humains pour que cela ait entraîné leur Lorsque Rapa Nui est accosté par les colons européens, en 1722, il découvre d'imposantes statues de pierres érigées par milliers. Pourtant, il n'y a que 3 000 