In [1]:
import logging
import os
import shutil
import traceback
import tempfile
from db_models import File, FileLocation, FilingTag, FileTagLabel, FileEmbedding, get_db_engine
from embedding.minilm import MiniLMEmbedder
from sqlalchemy import func
from sqlalchemy.orm import Session
from text_extraction.pdf_extraction import PDFTextExtractor
from text_extraction.basic_extraction import TextFileTextExtractor, get_extractor_for_file
from text_extraction.image_extraction import ImageTextExtractor
from text_extraction.office_doc_extraction import PresentationTextExtractor, SpreadsheetTextExtractor, WordFileTextExtractor
from text_extraction.web_extraction import HtmlTextExtractor, EmailTextExtractor
from text_extraction.extraction_utils import common_char_replacements, strip_diacritics, normalize_unicode
from logging_setups import setup_logger

# Notebook-wide logger, configured at DEBUG level.
nb_logger = setup_logger(name="NotebookLogger", notebook=True, level=logging.DEBUG)

# pytesseract must be told where the Tesseract binary lives when it is not on PATH.
# The TESSERACT_CMD environment variable overrides the location; when it is unset
# the default Windows install path is used, so existing setups keep working.
# (Presumably consumed by ImageTextExtractor for OCR -- confirm in text_extraction.)
import pytesseract
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

# One extractor instance per supported document family.
pdf_extractor = PDFTextExtractor()
txt_extractor = TextFileTextExtractor()
image_extractor = ImageTextExtractor()
presentation_extractor = PresentationTextExtractor()
spreadsheet_extractor = SpreadsheetTextExtractor()
word_extractor = WordFileTextExtractor()
html_extractor = HtmlTextExtractor()
email_extractor = EmailTextExtractor()

# Candidate extractors handed to get_extractor_for_file(), which picks one per file.
extractors_list = [
    pdf_extractor,
    txt_extractor,
    image_extractor,
    presentation_extractor,
    spreadsheet_extractor,
    word_extractor,
    html_extractor,
    email_extractor,
]

def get_files_from_tag_locations(
        db_session: Session,
        tag_obj: FilingTag,
        limit: int = 100,
        randomize: bool = False,
        exclude_embedded: bool = False
        ):
    """
    Retrieve files that have a location under a directory matching a filing tag.

    A file matches when one of its ``FileLocation`` rows has a
    ``file_server_directories`` value that satisfies the case-insensitive
    pattern ``%/<tag_obj.label_search_str>%``. Note this is a location-based
    match, not a lookup of existing tag-label records.

    Args:
        db_session: Open SQLAlchemy session used to run the query.
        tag_obj: Tag whose ``label_search_str`` is searched for in the
            location directories.
        limit: Maximum number of rows requested from the database. The limit
            applies to the joined rows, so a file with several matching
            locations can consume more than one slot.
        randomize: When True, order rows with the database's ``random()``
            before applying the limit.
        exclude_embedded: When True, skip files whose hash already has a
            ``FileEmbedding`` row.

    Returns:
        The result of ``Query.all()`` for ``File``: a list of ``File`` objects.
    """
    tag_locations = FileLocation.file_server_directories.ilike(f"%/{tag_obj.label_search_str}%")

    # Base query: files joined to their locations, restricted to matching directories.
    q = (
        db_session.query(File)
        .join(FileLocation)
        .filter(tag_locations)
    )

    if exclude_embedded:
        # Anti-join: left outer join to the embeddings and keep only rows with
        # no match. `.is_(None)` is the explicit SQLAlchemy spelling of IS NULL.
        q = (
            q.outerjoin(FileEmbedding, File.hash == FileEmbedding.file_hash)
            .filter(FileEmbedding.file_hash.is_(None))
        )

    if randomize:
        q = q.order_by(func.random())
    q = q.limit(limit)

    return q.all()


# --- Embed and rule-label files found under one filing tag's directories ---
# For each candidate file: locate an accessible copy whose directory path contains
# the tag's full label, copy it to a temp directory, extract and normalise its text,
# embed the text, then store the embedding together with a FileTagLabel row for the
# tag and each of its ancestors.
filing_code_tag = "F10 - Escrow Agreement"
file_server_location = r"N:\PPDO\Records"
embedding_client = MiniLMEmbedder()
with Session(get_db_engine()) as db_session:
    tag = FilingTag.retrieve_tag_by_label(db_session, filing_code_tag)
    if tag is None:
        # Fail with a clear message instead of an AttributeError further down.
        raise ValueError(f"Filing tag not found: {filing_code_tag}")

    files = get_files_from_tag_locations(
        db_session=db_session,
        tag_obj=tag,
        randomize=True,
        exclude_embedded=True
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        for file_i, file_obj in enumerate(files):
            # Capture identifiers up front: ORM attributes are expired after a
            # commit/rollback, and reading them on a failed session would raise
            # inside the error handler below.
            file_id = file_obj.id
            file_hash = file_obj.hash
            nb_logger.info(f"Processing file number {file_i + 1} of {len(files)}: {file_id}")

            if not file_obj.locations:
                nb_logger.warning(f"No locations found for file {file_id}")
                continue

            # Pick the first location that exists on disk AND sits under the tag's directory.
            source_filepath = None
            filename = ""
            for location in file_obj.locations:
                local_location_filepath = location.local_filepath(server_mount_path=file_server_location)

                if not os.path.exists(local_location_filepath):
                    nb_logger.warning(f"File {file_id} not found at {local_location_filepath}")
                    continue

                nb_logger.info(f"Found file {file_id} at {local_location_filepath}")

                # if the tag's full label is not in the file server directories, do something...
                if tag.full_tag_label_str.lower() not in location.file_server_directories.lower():
                    nb_logger.warning(f"File {file_id} does not match tag {filing_code_tag} in its location directories")
                    #TODO
                    continue

                source_filepath = local_location_filepath
                filename = location.filename
                break

            # if no valid file found, skip to next file
            if not source_filepath or not filename:
                nb_logger.warning(f"No valid file found for {file_id}")
                continue

            try:
                # Work on a local copy rather than the file on the server share.
                temp_filepath = os.path.join(temp_dir, filename)
                shutil.copyfile(source_filepath, temp_filepath)

                extractor = get_extractor_for_file(temp_filepath, extractors_list)
                if not extractor:
                    nb_logger.warning(f"No suitable extractor found for file {file_id}")
                    continue

                text = extractor(temp_filepath)
                if not text:
                    nb_logger.warning(f"No text extracted from {file_id}")
                    continue

                text = common_char_replacements(text)
                text = strip_diacritics(text)
                text = normalize_unicode(text)

                emb_list = embedding_client.encode([text])
                # Length check instead of truthiness: also safe if encode() returns
                # an array rather than a list (return type not visible here).
                has_embedding = emb_list is not None and len(emb_list) > 0
                emb_vec = emb_list[0] if has_embedding else None
                if emb_vec is None:
                    nb_logger.warning(f"Failed to create embedding for file {file_id}")
                    continue

                db_session.add(FileEmbedding(
                    file_hash=file_hash,
                    source_text=text,
                    minilm_model=embedding_client.model_name,
                    minilm_emb=emb_vec
                ))

                # Label the file with the tag and every ancestor tag, walking up via .parent.
                current_tag = tag
                while current_tag is not None:
                    db_session.add(FileTagLabel(
                        file_id=file_id,
                        file_hash=file_hash,
                        tag=current_tag.label,
                        is_primary=not current_tag.parent,
                        label_source='rule'
                    ))
                    nb_logger.info(f"Added tag {current_tag.label} to file {file_id}")
                    current_tag = current_tag.parent

                # Single commit so the embedding and its labels are stored atomically;
                # a file is never left embedded-but-unlabelled (which would also hide
                # it from later runs that use exclude_embedded=True).
                db_session.commit()
                nb_logger.info(f"Processed file {file_id} with tag {filing_code_tag}")
            except Exception as e:
                # Roll back first: after a failed flush/commit the session rejects
                # all further work until the transaction is rolled back.
                db_session.rollback()
                nb_logger.error(f"Error processing file {file_id}: {e}")
                nb_logger.debug(traceback.format_exc())

  Full path to tesseract.exe if not on PATH. (eg r"C:\Program Files\Tesseract-OCR\tesseract.exe")


Output()

INFO - Processing file number 1 of 9: 205400
INFO - Found file 205400 at N:\PPDO\Records\18xx   Mc Henry Library Unit I\1810\1810\G - Construction\G21 - Photos\F10 - Escrow Agreement\P5140030.JPG
INFO - Processing file number 2 of 9: 205407
INFO - Found file 205407 at N:\PPDO\Records\18xx   Mc Henry Library Unit I\1810\1810\G - Construction\G21 - Photos\F10 - Escrow Agreement\P6020089.JPG
INFO - Processing file number 3 of 9: 205409
INFO - Found file 205409 at N:\PPDO\Records\18xx   Mc Henry Library Unit I\1810\1810\G - Construction\G21 - Photos\F10 - Escrow Agreement\P6020091.JPG
INFO - Processing file number 4 of 9: 205401
INFO - Found file 205401 at N:\PPDO\Records\18xx   Mc Henry Library Unit I\1810\1810\G - Construction\G21 - Photos\F10 - Escrow Agreement\P5140031.JPG
INFO - Processing file number 5 of 9: 205402
INFO - Found file 205402 at N:\PPDO\Records\18xx   Mc Henry Library Unit I\1810\1810\G - Construction\G21 - Photos\F10 - Escrow Agreement\P5140032.JPG
INFO - Processing fi