In [7]:
import sys
import os
import dotenv
# Load environment variables from the .env file at the relative path '../../.env'.
dotenv.load_dotenv('../../.env')

# Extend the import search path with the parent of the current working
# directory and with that parent's `app` subdirectory. Both entries are derived
# from os.getcwd(), so they depend on where the notebook kernel was started.
# NOTE(review): presumably required for the `app.SIWeaviateClient` import below
# to resolve, which is why it must run first -- confirm against the repo layout.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..','app')))

from app.SIWeaviateClient import SIWeaviateClient


In [35]:
import weaviate.classes as wvc
import weaviate
from weaviate.classes.query import Filter, GeoCoordinate, MetadataQuery, QueryReference, GroupBy



from collections import defaultdict

def create_toc(chunks):
    """Group chunk page numbers by main title and subtitle.

    Each chunk's ``'title'`` is split on ``'>'`` (no whitespace trimming):

    * one part   -> the page is filed under that title with ``''`` as subtitle;
    * two parts  -> the page is filed under ``parts[0]`` / ``parts[1]``;
    * otherwise  -> a warning is printed and the chunk is left out.

    Args:
        chunks: Iterable of mappings that each provide a ``'title'`` string
            and a ``'page'`` value.

    Returns:
        A ``defaultdict`` mapping main title -> ``defaultdict`` mapping
        subtitle -> list of pages, in the order the chunks were seen.
    """
    toc = defaultdict(lambda: defaultdict(list))

    for chunk in chunks:
        levels = chunk['title'].split('>')
        # Read the page up front so a chunk lacking 'page' fails regardless of
        # how many title levels it has.
        page_number = chunk['page']

        if len(levels) > 2:
            print(f"Warning: Unexpected title format: {chunk['title']}")
            continue

        heading = levels[0]
        # A title without '>' is stored under the empty-string subtitle.
        subheading = levels[1] if len(levels) == 2 else ''
        toc[heading][subheading].append(page_number)

    return toc

def toc_to_json(toc):
    """Flatten a nested title -> subtitle -> pages mapping into a list of dicts.

    Args:
        toc: Mapping of main title to a mapping of subtitle to page list, as
            produced by ``create_toc``.

    Returns:
        A list with one dict per main title, holding ``"title"`` and
        ``"subtitles"`` (a list of ``{"title", "pages"}`` dicts). Pages stored
        under a falsy subtitle (the empty string) are attached to the main
        entry itself as ``"pages"``. The page lists are reused, not copied.
    """
    entries = []
    for heading, sections in toc.items():
        entry = {"title": heading, "subtitles": []}
        for section_name, page_list in sections.items():
            if not section_name:
                # Pages that belong directly to the main title.
                entry["pages"] = page_list
                continue
            entry["subtitles"].append({"title": section_name, "pages": page_list})
        entries.append(entry)
    return entries



# UUID of the Document object whose table of contents this cell builds.
DOCUMENT_UUID = "b6a7357a-cb3c-4329-ab6d-b0d1cff83df8"

client = weaviate.connect_to_local()
try:
    document_chunk = client.collections.get("DocumentChunk")
    document = client.collections.get("Document")

    # Fetch the document together with the title and page number of every
    # chunk linked through the "hasChunks" reference.
    doc = document.query.fetch_object_by_id(
        uuid=DOCUMENT_UUID,
        return_references=[QueryReference(
            link_on="hasChunks",
            return_properties=["title", "meta_page_number"]
        )]
    )
    if doc is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise LookupError(f"No Document object found for uuid {DOCUMENT_UUID}")

    # A document without any linked chunks yields an empty table of contents
    # rather than a KeyError.
    # NOTE(review): assumes `doc.references` is dict-like or None -- confirm
    # against the installed weaviate client version.
    has_chunks = (doc.references or {}).get('hasChunks')
    chunk_objects = has_chunks.objects if has_chunks is not None else []

    # Keep only chunks that carry a non-empty title.
    chunks = [
        {"title": obj.properties.get('title'), "page": obj.properties.get('meta_page_number')}
        for obj in chunk_objects
        if obj.properties.get('title')
    ]

    # Remove duplicate titles, keeping the occurrence with the smallest page.
    # A chunk with no page number (None) never displaces one that has a page,
    # and is displaced by any chunk that does; this avoids comparing None
    # with a number.
    unique_chunks = {}
    for chunk in chunks:
        title = chunk['title']
        page = chunk['page']
        kept = unique_chunks.get(title)
        if kept is None:
            unique_chunks[title] = chunk
        elif page is not None and (kept['page'] is None or page < kept['page']):
            unique_chunks[title] = chunk

    chunks = list(unique_chunks.values())

    toc = create_toc(chunks)
    json_toc = toc_to_json(toc)
    print(json_toc)
finally:
    # Always release the connection, even when the query or processing fails.
    client.close()


[{'title': 'N° 436 \\ TRIMESTRIEL \\ JANVIER-MARS 2022 \\ 6,90 €', 'subtitles': [{'title': 'Bruno Maquart', 'pages': [3]}]}, {'title': 'N° 436 \\ JANVIER-MARS 2022', 'subtitles': [{'title': 'Bruno Maquart', 'pages': [4]}, {'title': 'Cliché du premier cristal de Wigner', 'pages': [6]}]}, {'title': 'Premières images de l’IRM', 'subtitles': [{'title': 'Premières images de l IRM la plus puissante du monde', 'pages': [6]}, {'title': 'Climatologie \\ Astrophysique', 'pages': [7]}, {'title': 'Les sommets inhospitaliers verdissent plus vite', 'pages': [7]}, {'title': 'Planète faite du même bois que son étoile', 'pages': [7]}]}, {'title': 'Physiologie-médecine \\ Médecine', 'subtitles': [{'title': 'Planète faite du même bois que son étoile', 'pages': [8]}, {'title': 'Nouvel antiparasitaire ?', 'pages': [8]}, {'title': 'Première greffe réussie d’un rein de cochon', 'pages': [8]}, {'title': 'Neurosciences \\ Physiologie', 'pages': [9]}, {'title': 'Le système immunitaire est du matin', 'pages': [9