In [None]:
import logging
import os
import shutil
import traceback
import tempfile
from db_models import File, FileLocation, FilingTag, FileTagLabel, FileEmbedding, get_db_engine
from embedding.minilm import MiniLMEmbedder
from sqlalchemy import func
from sqlalchemy.orm import Session
from text_extraction.pdf_extraction import PDFTextExtractor
from text_extraction.basic_extraction import TextFileTextExtractor, get_extractor_for_file
from text_extraction.image_extraction import ImageTextExtractor
from text_extraction.office_doc_extraction import PresentationTextExtractor, SpreadsheetTextExtractor, WordFileTextExtractor
from text_extraction.web_extraction import HtmlTextExtractor, EmailTextExtractor
from text_extraction.extraction_utils import common_char_replacements, strip_diacritics, normalize_unicode
from logging_setups import setup_logger

# Logger used by every log call in this notebook cell; configured at DEBUG so
# the traceback dumps emitted via nb_logger.debug() are shown.
nb_logger = setup_logger(name="NotebookLogger", notebook=True, level=logging.DEBUG)

import pytesseract
# Point pytesseract at an explicit Tesseract executable (Windows install path)
# instead of relying on whatever it would locate by default.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# One instance of each imported text extractor class.
pdf_extractor = PDFTextExtractor()
txt_extractor = TextFileTextExtractor()
image_extractor = ImageTextExtractor()
presentation_extractor = PresentationTextExtractor()
spreadsheet_extractor = SpreadsheetTextExtractor()
word_extractor = WordFileTextExtractor()
html_extractor = HtmlTextExtractor()
email_extractor = EmailTextExtractor()

# Candidate extractors handed to get_extractor_for_file(), which selects the
# one to use for a given path.
# NOTE(review): whether list order affects which extractor is chosen depends on
# get_extractor_for_file's implementation - confirm before reordering.
extractors_list = [
    pdf_extractor,
    txt_extractor,
    image_extractor,
    presentation_extractor,
    spreadsheet_extractor,
    word_extractor,
    html_extractor,
    email_extractor
]

def get_files_from_tag_locations(db_session: Session, tag_obj: FilingTag, limit: int = 100, randomize: bool = False):
    """
    Fetch File records that have a location under a directory matching a tag.

    File is joined to FileLocation and filtered with a case-insensitive LIKE
    of ``FileLocation.file_server_directories`` against the pattern
    ``%/<tag_obj.label_search_str>%``.

    Args:
        db_session: Session the query is issued on.
        tag_obj: Tag whose ``label_search_str`` is embedded in the LIKE pattern.
        limit: Maximum number of rows requested from the database.
        randomize: When True, rows are ordered by ``func.random()`` before the
            limit is applied; otherwise no ordering is requested.

    Returns:
        The list produced by ``Query.all()`` for the limited query.
    """
    directory_pattern = f"%/{tag_obj.label_search_str}%"
    matching_files = (
        db_session.query(File)
        .join(FileLocation)
        .filter(FileLocation.file_server_directories.ilike(directory_pattern))
    )

    # Ordering is the only difference between the two modes, so add it
    # conditionally rather than duplicating the whole query chain.
    if randomize:
        matching_files = matching_files.order_by(func.random())

    return matching_files.limit(limit).all()



# ---------------------------------------------------------------------------
# Embed and label files filed under one tag.
#
# For every File returned by get_files_from_tag_locations() for the tag below:
#   1. find the first recorded location that exists under the server mount,
#   2. copy it into a temporary directory and extract its text,
#   3. clean the text, embed it, and store a FileEmbedding row,
#   4. store one rule-sourced FileTagLabel row for the tag and each ancestor.
# Per-file failures are logged and the loop moves on to the next file.
# ---------------------------------------------------------------------------

# Label of the filing tag used to select files.
filing_cod_tag = "G5.1 - Hot Work Permits"
# Mount point passed to FileLocation.local_filepath() to build local paths.
file_server_location = r"N:\PPDO\Records"
embedding_client = MiniLMEmbedder()
# Read the model identifier once, before any file is touched. It does not vary
# per file, and a previous run of this cell failed on this attribute for every
# file only after the expensive extraction step had already been paid for.
# Reading it here surfaces that misconfiguration immediately instead.
embedding_model_name = embedding_client.model_name

with Session(get_db_engine()) as db_session:
    tag = FilingTag.retrieve_tag_by_label(db_session, filing_cod_tag)
    if tag is None:
        # Guard in case the lookup yields nothing; without it the query helper
        # would fail on tag.label_search_str with an unhelpful AttributeError.
        nb_logger.error(f"Filing tag not found: {filing_cod_tag}")
        files = []
    else:
        files = get_files_from_tag_locations(
            db_session=db_session,
            tag_obj=tag,
            randomize=True,
        )

    with tempfile.TemporaryDirectory() as temp_dir:
        for file_obj in files:
            # Snapshot the identifiers up front. commit()/rollback() expire ORM
            # attributes, and the error handler below must be able to report
            # the file without touching a session that is in a failed state.
            file_id = file_obj.id
            file_hash = file_obj.hash
            nb_logger.info(f"Processing file id # {file_id}")

            if not file_obj.locations:
                nb_logger.warning(f"No locations found for file {file_id}")
                continue

            # Use the first recorded location that exists on the mounted
            # server. The match is stored explicitly rather than relying on
            # the loop variable surviving the loop.
            server_filepath = None
            filename = ""
            for location in file_obj.locations:
                candidate_path = location.local_filepath(server_mount_path=file_server_location)
                if os.path.exists(candidate_path):
                    server_filepath = candidate_path
                    filename = location.filename
                    break
                nb_logger.warning(f"File {file_id} not found at {candidate_path}")

            # if no valid file found, skip to next file
            if server_filepath is None or not filename:
                nb_logger.warning(f"No valid file found for {file_id}")
                continue

            try:
                # Work on a local copy so extractors never operate on the
                # server file itself.
                temp_filepath = os.path.join(temp_dir, filename)
                shutil.copyfile(server_filepath, temp_filepath)

                extractor = get_extractor_for_file(temp_filepath, extractors_list)
                if not extractor:
                    nb_logger.warning(f"No suitable extractor found for file {file_id}")
                    continue

                text = extractor(temp_filepath)
                if not text:
                    nb_logger.warning(f"No text extracted from {file_id}")
                    continue

                text = common_char_replacements(text)
                text = strip_diacritics(text)
                text = normalize_unicode(text)

                emb_list = embedding_client.encode([text])
                # Explicit None/length test instead of truthiness: truthiness
                # raises for a multi-element NumPy array, while this form works
                # for both plain lists and arrays.
                if emb_list is not None and len(emb_list) > 0:
                    emb_vec = emb_list[0]
                else:
                    emb_vec = None
                if emb_vec is None:
                    nb_logger.warning(f"Failed to create embedding for file {file_id}")
                    continue

                file_embedding = FileEmbedding(
                    file_hash=file_hash,
                    source_text=text,
                    minilm_model=embedding_model_name,
                    minilm_emb=emb_vec,
                )
                db_session.add(file_embedding)
                db_session.commit()
                nb_logger.info(f"Processed file {file_id} with tag {filing_cod_tag}")

                # Label the file with the selected tag and every ancestor
                # reached by following .parent.
                # NOTE(review): is_primary is True only for the tag that has no
                # parent (the top of the chain), not for the tag the file was
                # selected by - confirm that is the intended meaning.
                current_tag = tag
                while current_tag is not None:
                    labeling_record = FileTagLabel(
                        file_id=file_id,
                        file_hash=file_hash,
                        tag=current_tag.label,
                        is_primary=not current_tag.parent,
                        label_source='rule',
                    )
                    db_session.add(labeling_record)
                    nb_logger.info(f"Added tag {current_tag.label} to file {file_id}")
                    current_tag = current_tag.parent
                db_session.commit()
            except Exception as e:
                nb_logger.error(f"Error processing file {file_id}: {e}")
                nb_logger.debug(traceback.format_exc())
                # A failed flush/commit leaves the Session unusable until it is
                # rolled back; without this, one bad file would make every
                # following file fail as well. It also discards any objects
                # added for this file that were never committed.
                db_session.rollback()

INFO - Processing file id # 898310
INFO:NotebookLogger:Processing file id # 898310
ERROR - Error processing file 898310: [Errno 2] No such file or directory: 'N:\\PPDO\\Records\\22xx   Fine Arts & Communication\\2200\\2200-100\\G5.1 - Hot Work Permits\\File 2200-100 Hot Work Permit.pdf'
ERROR:NotebookLogger:Error processing file 898310: [Errno 2] No such file or directory: 'N:\\PPDO\\Records\\22xx   Fine Arts & Communication\\2200\\2200-100\\G5.1 - Hot Work Permits\\File 2200-100 Hot Work Permit.pdf'
DEBUG - Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_30316\3168570773.py", line 90, in <module>
    shutil.copyfile(server_filepath, temp_filepath)
    ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.13_3.13.1520.0_x64__qbz5n2kfra8p0\Lib\shutil.py", line 260, in copyfile
    with open(src, 'rb') as fsrc:
         ~~~~^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or di

Output()

Output()

Output()

Output()

Output()

Output()

Possible reasons for this include:
--deskew was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.

ERROR - Error processing file 374052: 'MiniLMEmbedder' object has no attribute 'model_name'
ERROR:NotebookLogger:Error processing file 374052: 'MiniLMEmbedder' object has no attribute 'model_name'
DEBUG - Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_30316\3168570773.py", line 104, in <module>
    minilm_model=embedding_client.model_name,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'MiniLMEmbedder' object has no attribute 'model_name'

DEBUG:NotebookLogger:Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_30316\3168570773.py", line 104, in <module>
    minilm_model=embedding_client.model_name,
                 ^^

Output()

Output()

Output()

Output()

Output()

Output()

Possible reasons for this include:
--deskew was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.

ERROR - Error processing file 712489: 'MiniLMEmbedder' object has no attribute 'model_name'
ERROR:NotebookLogger:Error processing file 712489: 'MiniLMEmbedder' object has no attribute 'model_name'
DEBUG - Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_30316\3168570773.py", line 104, in <module>
    minilm_model=embedding_client.model_name,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'MiniLMEmbedder' object has no attribute 'model_name'

DEBUG:NotebookLogger:Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_30316\3168570773.py", line 104, in <module>
    minilm_model=embedding_client.model_name,
                 ^^

Output()

Output()

ERROR:ocrmypdf._exec.tesseract:[tesseract] Error during processing.
