In [None]:
# ## GraphQL query to search DocumentChunk objects, grouped by local_path
# {
#   Get{
#     DocumentChunk(
#       nearText: {
#         concepts: ["musique"],
#         # distance: 0.9
#       }
#       groupBy: {  # How to group the results
#         path: ["local_path"]
#         groups: 10
#         objectsPerGroup: 10
#       }
#     ) {
#       _additional {
#         group {  # Data to be returned
#           id
#           groupedBy{ value }
#           count
#           hits {  # Actual properties to be retrieved
#             media_type
#             chunk
#             local_path
#             _additional {
#               id
#               distance
#             }
#           }
#         }
#       }
#     }
#   }
# }

In [None]:
import weaviate
# Open a connection through weaviate.connect_to_local(); every query cell below
# goes through this `client`.
# NOTE(review): no client.close() is visible in this notebook — confirm the
# connection is released somewhere (or close it in a final cell).
client = weaviate.connect_to_local()
from langchain_text_splitters import CharacterTextSplitter
from weaviate.util import get_valid_uuid
from weaviate.classes.query import Filter, GeoCoordinate, MetadataQuery, QueryReference 
import weaviate.classes as wvc
from weaviate import WeaviateClient
from typing import List, Dict, Optional


In [None]:
# Hybrid search restricted to the chunks of one document, returning the
# hybrid score alongside each object.
target_document_id = get_valid_uuid(uuid="2a7a0d87-eaca-45f5-b0b0-db10b06ff8d8")
same_document_filter = Filter.by_property("document_id").equal(target_document_id)

document_chunks = client.collections.get("DocumentChunk")
response = document_chunks.query.hybrid(
    query="gymnastique",
    limit=1000,
    filters=same_document_filter,
    return_metadata=MetadataQuery(score=True),
)

# Last expression of the cell: display the matched objects.
response.objects

In [None]:
from typing import Literal
from pydantic import BaseModel
from enum import Enum
import weaviate.classes as wvc

class DocumentChunk(BaseModel):
    """Pydantic model used in this notebook to validate the properties of one
    object returned from the "DocumentChunk" collection.

    Field meanings noted below are inferred from names and from how the
    notebook uses them — TODO confirm against the collection schema.
    """
    chunk: str  # presumably the text content of the chunk — TODO confirm
    document_id: str  # the notebook passes the raw property through str() before validating
    local_path: str  # property the grouped query in this notebook groups by
    original_public_path: str
    media_type: str
    start_offset: int  # presumably the chunk's position within its source document — TODO confirm
    end_offset: int
    



#///////////
# Hybrid search for "danse", grouped by the `local_path` property
# (at most 10 groups with at most 10 objects each), requesting both the
# distance and the score metadata for every hit.
questions = client.collections.get("DocumentChunk")
response = questions.query.hybrid(
    query="danse",
    # distance=0.95,
    return_metadata=wvc.query.MetadataQuery(distance=True, score=True),
    group_by=wvc.query.GroupBy(
        prop="local_path",
        number_of_groups=10,
        objects_per_group=10,
    ),
)

for local_path, group in response.groups.items():  # View by group
    # Kept for interactive inspection; not otherwise used in this cell.
    max_distance = group.max_distance
    min_distance = group.min_distance
    print(f"Group {local_path} has {len(group.objects)} objects")
    # print(group)
    for _documentChunk in group.objects:
        # `document_id` is converted with str() so it fits the model's `str` field.
        documentChunk = DocumentChunk.model_validate(
            {
                **_documentChunk.properties,
                "document_id": str(_documentChunk.properties["document_id"]),
            }
        )
        print(documentChunk)
        # The original line ended in a dangling attribute access
        # (`metadata.`), which is a SyntaxError. Print the whole metadata
        # object, which carries the requested score and distance.
        print(_documentChunk.metadata)



In [None]:

from typing import List, Dict
from pydantic import BaseModel, Field
from collections import defaultdict

# Define Pydantic models for output
class Chunk(BaseModel):
    """One hit taken from a grouped search result, as collected by this notebook."""
    chunk: str
    # local_path: str
    media_type: str
    score: float  # filled from the hit's `_additional.distance` value in this notebook


class GroupedDocument(BaseModel):
    """All chunks collected for one `local_path`, plus the largest chunk score seen."""
    local_path: str
    chunks: List[Chunk] = Field(default_factory=list)  # default_factory builds the list per instance
    max_score: float = float('-inf')  # starts at -inf so the first real score wins the max() update


# Fold a raw GraphQL "grouped" response into one GroupedDocument per local_path.
#
# NOTE(review): `x` is not defined in any cell visible here, so this cell
# relies on hidden kernel state. It is indexed as a GraphQL response of the
# form {"data": {"Get": {"DocumentChunk": [...]}}} — confirm where it comes
# from and assign it in an earlier cell.
data = x['data']['Get']['DocumentChunk']

# The placeholder local_path is overwritten as soon as a key is first touched.
acc = defaultdict(lambda: GroupedDocument(local_path=''))

for doc in data:
    group_info = doc['_additional']['group']
    local_path = group_info['groupedBy']['value']

    # NOTE(review): `score` is filled from the hit's *distance*, so `max_score`
    # below tracks the largest distance in the group — confirm this is the
    # intended ranking signal.
    chunks = [
        Chunk(
            chunk=hit['chunk'],
            # local_path=hit['local_path'],
            media_type=hit['media_type'],
            score=hit['_additional']['distance'],
        )
        for hit in group_info['hits']
    ]

    # `default` keeps a group with no hits from raising ValueError in max().
    max_score = max((chunk.score for chunk in chunks), default=float('-inf'))

    grouped = acc[local_path]
    grouped.local_path = local_path
    grouped.chunks.extend(chunks)
    grouped.max_score = max(grouped.max_score, max_score)

# Convert defaultdict to regular list of dicts.
# `model_dump()` is the pydantic v2 replacement for the deprecated `.dict()`;
# this notebook already uses the v2 API (`model_validate`).
result = [group.model_dump() for group in acc.values()]

result
