# Multi Document Search

In [3]:
import sys

sys.path.insert(0, '..')

import weaviate.classes as wvc
import weaviate
from collections import defaultdict
from schemas import ChunkWithScore, DocumentSearchResult
from weaviate.classes.query import Filter, GeoCoordinate, MetadataQuery, QueryReference, GroupBy


query = "l'île de Pâques"

# Run a hybrid search over the "DocumentChunk" collection, matching on the
# `text` property. Each hit carries its search score and the selected
# properties of the document it links to through "belongsToDocument".
client = weaviate.connect_to_local()
try:
    document_chunk = client.collections.get("DocumentChunk")
    response = document_chunk.query.hybrid(
        query=query,
        return_metadata=wvc.query.MetadataQuery(score=True),
        query_properties=["text"],
        return_references=[
            QueryReference(
                link_on="belongsToDocument",
                return_properties=[
                    "document_id",
                    "public_path",
                    "original_path",
                    "media_name",
                    "max_score",
                    "min_score",
                ],
            )
        ],
    )
finally:
    # Close the connection even when the query raises, so it is never leaked.
    client.close()

# `response` is the result of the hybrid query above.
# Group the returned chunks by the `document_id` of the document they reference.
grouped_results = defaultdict(list)
for chunk in response.objects:
    doc_ref = chunk.references['belongsToDocument'].objects[0].properties
    grouped_results[doc_ref['document_id']].append(chunk)

# Build one DocumentSearchResult per document, holding its scored chunks and
# the lowest / highest chunk score found for that document.
documentsSearchResponse = []

for group in grouped_results.values():
    # Attach the search score to each chunk's own properties.
    chunks_with_score = [
        ChunkWithScore.model_validate({**hit.properties, "score": hit.metadata.score})
        for hit in group
    ]

    # Every chunk in a group was filed under the same document_id, so the
    # document properties are read explicitly from the first chunk rather than
    # from a loop variable left over after iterating the group.
    document_properties = group[0].references['belongsToDocument'].objects[0].properties

    scores = [scored.score for scored in chunks_with_score]

    documentsSearchResponse.append(
        DocumentSearchResult(
            document_id=document_properties['document_id'],
            public_path=document_properties['public_path'],
            original_path=document_properties['original_path'],
            media_name=document_properties['media_name'],
            min_score=min(scores),
            max_score=max(scores),
            chunks=chunks_with_score,
        )
    )

# Print each document name followed by its chunks and their scores.
for doc in documentsSearchResponse:
    print(doc.media_name)
    for chunk in doc.chunks:
        print(f"    -->  {chunk.score} | {chunk.text}")


QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('4a0d74fa-3ebf-4cb6-b150-e6c973cdbcdb'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=0.8472616076469421, explain_score=None, is_consistent=None, rerank_score=None), properties={'text': "Une capture d'écran d'un écran d'ordinateur montrant une simulation. Il y a quatre graphes sur l'écran. Le premier montre une île verte avec un arbre de Palmier au-dessus de celle -ci et une personne debout sur l'île. À côté de l'île verte se trouve une image d'un globe avec une ligne rouge représentant la température et la température de la Terre.", 'metadata': {'height': 945, 'public_path': '/pdf/images/3b525860-775f-4dc2-b083-69fe8f6459b3.png', 'width': 1912, 'page_number': 16}, 'media_type': 'image'}, references={'belongsToDocument': <weaviate.collections.classes.internal._CrossReference object at 0x78698edfdaf0>}, vector={}, collection='DocumentChunk'), Object(uuid=_WeaviateUUIDInt('a2ca9dc

# Single Document Search

In [1]:
import sys

sys.path.insert(0, '..')

from weaviate.util import get_valid_uuid
import weaviate.classes as wvc
import weaviate
from collections import defaultdict
from schemas import ChunkWithScore, DocumentSearchResult
from weaviate.classes.query import Filter, GeoCoordinate, MetadataQuery, QueryReference, GroupBy


query = "l'île de Paques"

# Run a hybrid search over the "DocumentChunk" collection, matching on the
# `text` property, keeping only chunks whose linked document (through
# "belongsToDocument") has the given `document_id`.
client = weaviate.connect_to_local()
try:
    document_chunk = client.collections.get("DocumentChunk")
    response = document_chunk.query.hybrid(
        query=query,
        return_metadata=wvc.query.MetadataQuery(score=True),
        query_properties=["text"],
        filters=Filter.by_ref(link_on="belongsToDocument").by_property("document_id").equal(
            get_valid_uuid(uuid="9e80a4c3-7139-4f66-b450-e94cbbf06e2b")
        ),
        return_references=[
            QueryReference(
                link_on="belongsToDocument",
                return_properties=[
                    "document_id",
                    "public_path",
                    "original_path",
                    "media_name",
                    "max_score",
                    "min_score",
                ],
            )
        ],
    )
finally:
    # Close the connection even when the query raises, so it is never leaked.
    client.close()


# `response` is the result of the hybrid query above.
# Group the returned chunks by the `document_id` of the document they reference.
grouped_results = defaultdict(list)
for chunk in response.objects:
    doc_ref = chunk.references['belongsToDocument'].objects[0].properties
    grouped_results[doc_ref['document_id']].append(chunk)

# Build one DocumentSearchResult per document, holding its scored chunks and
# the lowest / highest chunk score found for that document.
documentsSearchResponse = []

for group in grouped_results.values():
    # Attach the search score to each chunk's own properties.
    chunks_with_score = [
        ChunkWithScore.model_validate({**hit.properties, "score": hit.metadata.score})
        for hit in group
    ]

    # Every chunk in a group was filed under the same document_id, so the
    # document properties are read explicitly from the first chunk rather than
    # from a loop variable left over after iterating the group.
    document_properties = group[0].references['belongsToDocument'].objects[0].properties

    scores = [scored.score for scored in chunks_with_score]

    documentsSearchResponse.append(
        DocumentSearchResult(
            document_id=document_properties['document_id'],
            public_path=document_properties['public_path'],
            original_path=document_properties['original_path'],
            media_name=document_properties['media_name'],
            min_score=min(scores),
            max_score=max(scores),
            chunks=chunks_with_score,
        )
    )

# Print each document name followed by its chunks and their scores.
for doc in documentsSearchResponse:
    print(doc.media_name)
    for chunk in doc.chunks:
        print(f"    -->  {chunk.score} | {chunk.text}")


  class ChunkWithScore(Generic[ChunkType], BaseDocumentChunk):


# Search Chunks

In [4]:
import sys

sys.path.insert(0, '..')

import weaviate.classes as wvc
import weaviate
from collections import defaultdict
from app.schemas import ChunkWithScore, DocumentSearchResult
from weaviate.classes.query import Filter, GeoCoordinate, MetadataQuery, QueryReference, GroupBy


query = "intestin"

# Run a hybrid search over the "DocumentChunk" collection, matching on the
# `text` property and asking for a single hit (limit=1).
client = weaviate.connect_to_local()
try:
    document_chunk = client.collections.get("DocumentChunk")
    response = document_chunk.query.hybrid(
        query=query,
        limit=1,
        # Optional media-type restriction, currently disabled:
        # filters=(Filter.by_property("media_type").contains_any(['website_qa']) & Filter.by_property("media_type").contains_any(['website_qa'])),
        return_metadata=wvc.query.MetadataQuery(score=True),
        query_properties=["text"],
        return_references=[
            QueryReference(
                link_on="belongsToDocument",
                return_properties=[
                    "document_id",
                    "public_path",
                    "original_path",
                    "media_name",
                    "max_score",
                    "min_score",
                ],
            )
        ],
    )
finally:
    # Close the connection even when the query raises, so it is never leaked.
    client.close()

# Show each hit's uuid, then its media type alongside the full property dict.
for obj in response.objects:
    print(obj.uuid)
    print(obj.properties.get("media_type"), obj.properties)


bf306afb-c870-4d9f-9dc5-e9c387f3f87e
pdf_text {'title': 'Janvier-Février-Mars>TERRE & UNIVERS', 'meta_end': None, 'meta_type': None, 'meta_bbox': {'x2': 212.0, 'y2': 406.0, 'x1': 39.0, 'y1': 65.0}, 'meta_url': None, 'meta_s3_object_name': None, 'user_approved': None, 'meta_question': None, 'text': 'du Massif armoricain à sa partie sud et au Massif central. Plus à l’est, la zone de Teplá dans l’Erzgebirge serait entrée également en collision avec la Bohême. Pour autant, les données paléontologiques et paléomagné\xad tiques ne révèlent aucune différence notable entre espèces planctoniques et paléolati\xad tudes de part et d’autre des sutures, ce qui suppose une forte proximité géographique entre les différents blocs. Des données qui infirment donc l’existence d’un tel océan, ou du moins lui imposeraient une taille extrê\xad mement réduite… En outre, le bloc saxo- thuringien situé au-dessus du massif Vosges-Forêt-Noire (équivalent oriental du Massif central) n’est pas apparenté au Massif 

In [73]:
from functools import reduce
from operator import and_

def add_filter(condition, filter_func):
    """Return ``filter_func()`` when *condition* is truthy, otherwise ``None``.

    The factory is only invoked when the condition holds, so a disabled
    filter is never constructed.
    """
    if not condition:
        return None
    return filter_func()

# Each entry pairs an on/off flag with a zero-argument factory, so a filter
# object is only constructed when its flag is set.
filter_conditions = [
    (False, lambda: Filter.by_property("media_type").contains_any(['website_qa'])),
    (False, lambda: Filter.by_property("some_property").equal("some_value")),
    (False, lambda: Filter.by_property("another_property").contains_all("another_value")),
    (False, lambda: Filter.by_property("yet_another").greater_than(5)),
    (False, lambda: Filter.by_property("final_property").less_than(10))
]

# Build only the filters whose flag is truthy; disabled entries are skipped
# without calling their factory.
valid_filters = []
for enabled, make_filter in filter_conditions:
    if enabled:
        valid_filters.append(make_filter())

# Fold the enabled filters together with `&`; when none is enabled there is
# nothing to combine, so `filters` is None.
if valid_filters:
    filters = reduce(and_, valid_filters)
else:
    filters = None

print(filters)

None
