In [1]:

from dotenv import load_dotenv

# Load environment variables from ../../.env before the project modules below
# are imported and instantiated.
load_dotenv('../../.env')
import sys
# Put ../app at the front of the module search path so the imports below can be
# resolved from it (presumably where the SI* modules live — confirm).
sys.path.insert(0, '../app')
from SIWeaviateClient import SIWeaviateClient
from processing.image.SIImageDescription import SIImageDescription
from processing.text.SIITranslator import SITranslator
from processing.text.SISurya import SISurya
from S3Storage import S3Storage

# Shared service objects, created once here and reused by the cells below.
# NOTE(review): the recorded output of this cell shows surya detection, layout
# and reading-order models being loaded on cuda, so these constructors appear
# to load models eagerly — confirm which one does before re-running casually.
s3 = S3Storage()
image_descriptor = SIImageDescription()
translator = SITranslator()
surya = SISurya()





Loaded detection model vikp/surya_det2 on device cuda with dtype torch.float16
Loaded detection model vikp/surya_layout2 on device cuda with dtype torch.float16
Loaded reading order model vikp/surya_order on device cuda with dtype torch.float16


## Process a single PDF

In [2]:
from processing.PDFProcessor import PDFProcessor
import os
import glob


import traceback

# Run the processing pipeline on one sample document while a Weaviate
# connection is open; any failure is reported instead of aborting the cell.
pdf_path = "./data/exported_document.pdf"
with SIWeaviateClient() as client:
    try:
        pdf = PDFProcessor(
            image_descriptor,
            translator,
            surya,
            s3,
            pdf_path,
        )
    except Exception as processing_error:
        # Short summary first, then the full stack trace.
        print("ERROR", processing_error)
        traceback.print_exc()

PDFProcessor pdf_path ./data/exported_document.pdf
PDFProcessor extract_document self.pdf_path ./data/exported_document.pdf
File ./data/exported_document.pdf uploaded successfully to bucket science-infuse-content as pdf/f91f3fae-14b5-4fa0-a3fe-473b32da86ba.pdf.
PDFProcessor extract_document images 0


Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.07s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  6.99it/s]
Finding reading order: 100%|██████████| 1/1 [00:00<00:00,  7.71it/s]

PDFProcessor extract_document texts 1
RETURNING id='f91f3fae-14b5-4fa0-a3fe-473b32da86ba' publicPath='' originalPath='./data/exported_document.pdf' s3ObjectName='pdf/f91f3fae-14b5-4fa0-a3fe-473b32da86ba.pdf' mediaName='exported_document' [PdfTextChunk(document=Document(id='f91f3fae-14b5-4fa0-a3fe-473b32da86ba', publicPath='', originalPath='./data/exported_document.pdf', s3ObjectName='pdf/f91f3fae-14b5-4fa0-a3fe-473b32da86ba.pdf', mediaName='exported_document'), text="Mathilde l'Emperesse est une princesse de la maison de Normandie née vers le 7 février 1102, probablement à Sutton Courtenay dans l'Oxfordshire, et morte le 10 septembre 1167 à Rouen, en Normandie. Impératrice du Saint-Empire romain germanique, elle revendique le trône du royaume d'Angleterre contre Étienne de Blois. Leur lutte, une longue guerre civile surnommée « l'Anarchie », dure de 1135 à 1153.", title='', mediaType=<MediaType.PDF_TEXT: 'pdf_text'>, metadata=PdfTextMetadata(pageNumber=1, bbox=BoundingBox(x1=1.0, y1=3.




## Process all PDFs

In [None]:
from processing.PDFProcessor import PDFProcessor
import os
from weaviate.classes.query import Filter, GeoCoordinate, MetadataQuery, QueryReference
# from app.SIWeaviateClient import SIWeaviateClient

def get_all_pdfs(data_dir="./data"):
    """Recursively collect the paths of all PDF files below ``data_dir``.

    Args:
        data_dir: Root directory to walk. Defaults to ``"./data"``, the
            directory that was previously hard-coded.

    Returns:
        A list of paths built as ``os.path.join(root, file)``, so they stay
        relative when ``data_dir`` is relative. The order is whatever
        ``os.walk`` yields. A missing directory yields an empty list.
    """
    pdf_files = []
    for root, _dirs, files in os.walk(data_dir):
        for file in files:
            # Compare case-insensitively so "REPORT.PDF" is not silently skipped.
            if file.lower().endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    return pdf_files

def is_document_already_indexed(pdf_path, client: "WeaviateClient"):
    """Tell whether a document with this original path is already stored.

    Looks in the "Document" collection for at least one object whose
    ``originalPath`` property equals ``pdf_path``.

    Args:
        pdf_path: Path string compared for equality against ``originalPath``.
        client: Connected client exposing ``collections.get``.
            The annotation is a string on purpose: ``WeaviateClient`` is not
            imported in this notebook, and an unquoted annotation is evaluated
            when the ``def`` runs, which raised ``NameError``.

    Returns:
        ``True`` if a matching object exists, ``False`` otherwise.
    """
    document = client.collections.get("Document")
    response = document.query.fetch_objects(
        filters=Filter.by_property("originalPath").equal(pdf_path),
        limit=1,  # existence check only: one hit is enough
        return_properties=[],  # no property values are needed
    )
    return len(response.objects) > 0

# Index every PDF found under ./data that is not yet present in the
# "Document" collection.
with SIWeaviateClient() as client:
    print("CLIENT", client)
    pdf_paths = get_all_pdfs()
    for pdf_path in pdf_paths:
        if is_document_already_indexed(pdf_path, client):
            print(f"Already in DB, SKIP INDEXING {pdf_path}")
            continue
        # Use the same argument list as the single-PDF cell, which ran
        # successfully with (image_descriptor, translator, surya, s3, pdf_path).
        # The previous call still passed `client` first and omitted surya/s3.
        pdf = PDFProcessor(image_descriptor, translator, surya, s3, pdf_path)



# Bug reproduction tests

In [None]:
import sys
# Put ../app at the front of the module search path so the import below can be
# resolved from it.
sys.path.insert(0, '../app')
from processing.image.SIImageDescription import SIImageDescription
# Image describer used by the cell below through `get_description`.
image_descriptor = SIImageDescription()

In [None]:
from IPython.display import Image 
from dotenv import load_dotenv
load_dotenv('../../.env')
import sys
sys.path.insert(0, '../app')

import io
import fitz
from PIL import Image
from pymupdf import Document as PdfDocument

def get_pdf_images(doc: PdfDocument):
    """Extract the images referenced by each page of an open PDF.

    Args:
        doc: An open PyMuPDF document *instance*, such as the object yielded
            by ``fitz.open(path)``. The previous annotation,
            ``type[PdfDocument]``, wrongly described the class itself.

    Returns:
        A list with one dict per image reference, in page order::

            {"image": <PIL image in RGB mode>,
             "pageNumber": <1-based page number>,
             "bbox": {"x0": ..., "y0": ..., "x1": ..., "y1": ...}}
    """
    temp_images = []

    for page_num in range(len(doc)):
        page = doc[page_num]

        for item in doc.get_page_images(page_num):
            pix = fitz.Pixmap(doc, item[0])  # pixmap from the image xref
            pix0 = fitz.Pixmap(fitz.csRGB, pix)  # force into RGB
            if pix0.alpha:
                # With an alpha channel the samples hold 4 bytes per pixel,
                # which does not match the 3-byte "RGB" layout expected by
                # Image.frombytes; copy the pixmap without alpha first.
                pix0 = fitz.Pixmap(pix0, 0)
            pil_image = Image.frombytes("RGB", (pix0.width, pix0.height), pix0.samples)
            # item[7] is passed to get_image_bbox to locate the image on the page.
            x0, y0, x1, y1 = page.get_image_bbox(item[7])
            temp_images.append({
                "image": pil_image,
                "pageNumber": page_num + 1,
                "bbox": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
            })

    return temp_images

# image_descriptor = SIImageDescription()
# Paths are relative to the notebook directory, like the "./data" paths used by
# the cells above, instead of an absolute path into one developer's home directory.
pdf_path = "./data/UniversScience/revue-decouverte/decouverte_436.pdf"
# pdf_path = "./data/UniversScience/dossiers-pédagogiques/astro/2015/2015 - Dinosaures.pdf"
with fitz.open(pdf_path) as fitz_doc:
    images = get_pdf_images(fitz_doc)
    # A PDF without images yields an empty list; indexing [0] would raise IndexError.
    if images:
        print(images[0])
    else:
        print(f"No images found in {pdf_path}")
    # Attach a generated description to every extracted image record.
    images_chunks = [{**image, "description_en": image_descriptor.get_description(image['image'])} for image in images]

    # for image in images:
    #     display(image)
    # images_chunks = [{**image, "description_en": image_descriptor.get_description(image['image'])} for image in images]


In [None]:
# Display the image records, including their "description_en" field, built in the previous cell.
images_chunks