In [None]:
import logging
import os
import shutil
import traceback
import tempfile
from db_models import File, FileLocation, FilingTag, FileTagLabel, FileEmbedding, get_db_engine
from embedding.minilm import MiniLMEmbedder
from sqlalchemy import func
from sqlalchemy.orm import Session
from text_extraction.pdf_extraction import PDFTextExtractor
from text_extraction.basic_extraction import TextFileTextExtractor, get_extractor_for_file
from text_extraction.image_extraction import ImageTextExtractor
from text_extraction.office_doc_extraction import PresentationTextExtractor, SpreadsheetTextExtractor, WordFileTextExtractor
from text_extraction.web_extraction import HtmlTextExtractor, EmailTextExtractor
from text_extraction.extraction_utils import common_char_replacements, strip_diacritics, normalize_unicode, normalize_whitespace
from typing import Optional, Union
from logging_setups import setup_logger

# Notebook-wide logger, configured at DEBUG level.
nb_logger = setup_logger(name="NotebookLogger", notebook=True, level=logging.DEBUG)

import pytesseract
# Tell pytesseract where the Tesseract executable lives on this machine.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# One instance of every extractor, in the order they are offered to
# get_extractor_for_file().
extractors_list = [
    PDFTextExtractor(),
    TextFileTextExtractor(),
    ImageTextExtractor(),
    PresentationTextExtractor(),
    SpreadsheetTextExtractor(),
    WordFileTextExtractor(),
    HtmlTextExtractor(),
    EmailTextExtractor(),
]

# Individual handles to the same instances, for direct use in the notebook.
(
    pdf_extractor,
    txt_extractor,
    image_extractor,
    presentation_extractor,
    spreadsheet_extractor,
    word_extractor,
    html_extractor,
    email_extractor,
) = extractors_list

def get_files_from_tag_locations(
        db_session: Session,
        tag_obj: FilingTag,
        n: int = 100,
        randomize: bool = False,
        exclude_embedded: bool = False,
        max_size_mb: Optional[float] = None
        ):
    """
    Retrieve files that have a location under a directory named after a tag.

    A file matches when one of its FileLocation rows has
    ``file_server_directories`` containing ``/<tag_obj.full_tag_label_str>``
    (case-insensitive ``ILIKE``). Matching is based purely on the location
    path text, not on existing FileTagLabel records.

    Args:
        db_session: Open SQLAlchemy session used to run the query.
        tag_obj: Tag whose ``full_tag_label_str`` is searched for in the
            location directories.
        n: Maximum number of rows requested from the database (SQL LIMIT).
        randomize: When True, order the rows with the database's ``random()``
            function before applying the limit.
        exclude_embedded: When True, drop files whose hash already has a
            FileEmbedding row.
        max_size_mb: Optional upper bound; converted to bytes and compared
            against ``File.size``.

    Returns:
        The result of ``Query.all()`` for the File entity.
    """
    # NOTE(review): '%' and '_' inside the tag label act as LIKE wildcards
    # here because the label is interpolated without escaping.
    location_matches_tag = FileLocation.file_server_directories.ilike(
        f"%/{tag_obj.full_tag_label_str}%"
    )

    # start the base query
    # NOTE(review): a File with several matching locations yields several
    # joined rows, and LIMIT is applied to those rows -- so fewer than ``n``
    # distinct files may come back. Confirm whether that matters to callers.
    q = db_session.query(File).join(FileLocation).filter(location_matches_tag)

    if exclude_embedded:
        # left-outer join to file_embeddings, keep only those with no match
        q = (
            q.outerjoin(FileEmbedding, File.hash == FileEmbedding.file_hash)
             .filter(FileEmbedding.file_hash.is_(None))
        )

    if max_size_mb is not None:
        max_size_bytes = max_size_mb * 1024 * 1024
        q = q.filter(File.size <= max_size_bytes)

    if randomize:
        q = q.order_by(func.random())

    return q.limit(n).all()

def label_file_using_tag(
        db_session: Session,
        file_obj: File,
        some_tag: FilingTag|str,
        label_source: str = 'rule'
    ):
    """
    Label a file with a tag and with every ancestor of that tag.

    Starting at the given tag, one FileTagLabel record is added for the tag
    itself and then for each tag reached by following ``.parent`` until a tag
    with no parent is reached. All records are committed in a single commit.

    Args:
        db_session: Open SQLAlchemy session; records are added to it and
            committed before returning.
        file_obj: File being labelled; its ``id`` and ``hash`` are copied
            onto each record.
        some_tag: Either a FilingTag instance or a label string that is
            resolved with ``FilingTag.retrieve_tag_by_label``.
        label_source: Stored on each record as ``label_source``.

    Returns:
        The last FileTagLabel created, i.e. the one for the topmost ancestor
        (the tag whose ``parent`` is empty).

    Raises:
        ValueError: If ``some_tag`` is a string and no such tag is found.
        TypeError: If ``some_tag`` is neither a string nor a FilingTag.
    """
    if isinstance(some_tag, str):
        tag_obj = FilingTag.retrieve_tag_by_label(db_session, some_tag)
        if not tag_obj:
            raise ValueError(f"Tag '{some_tag}' not found in the database.")
    elif isinstance(some_tag, FilingTag):
        tag_obj = some_tag
    else:
        raise TypeError("some_tag must be a FilingTag instance or a string representing the tag label.")

    labeling_record = None
    current_tag = tag_obj
    while current_tag is not None:
        labeling_record = FileTagLabel(
            file_id=file_obj.id,
            file_hash=file_obj.hash,
            # Use the tag currently being visited; using the starting tag's
            # label here would write the same label for every ancestor.
            tag=current_tag.label,
            # Marked primary only for a tag that has no parent.
            is_primary=not current_tag.parent,
            label_source=label_source
        )
        db_session.add(labeling_record)
        nb_logger.info(f"Added tag {current_tag.label} to file with hash {file_obj.hash}")
        current_tag = current_tag.parent

    db_session.commit()
    return labeling_record


# Filing-code tag whose folders are sampled in this run. Swap in another
# label (for example "F10 - Escrow Agreement") to process a different code.
filing_code_tag = "H - Submittals"
# Local mount point of the file server; passed to FileLocation.local_filepath().
file_server_location = r"N:\PPDO\Records"
embedding_client = MiniLMEmbedder()
with Session(get_db_engine()) as db_session:
    tag = FilingTag.retrieve_tag_by_label(db_session, filing_code_tag)
    if not tag:
        # Fail early with a clear message instead of an AttributeError inside
        # the query helper.
        raise ValueError(f"Tag '{filing_code_tag}' not found in the database.")

    files = get_files_from_tag_locations(
        db_session=db_session,
        tag_obj=tag,
        randomize=True,
        n=30,
        exclude_embedded=True,
        max_size_mb=150
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        for file_i, file_obj in enumerate(files):
            # Read the id once: commits/rollbacks below expire ORM attributes,
            # and the id is needed for logging even after a failure.
            file_id = file_obj.id
            nb_logger.info(f"Processing file number {file_i + 1} of {len(files)}: {file_id}")

            if not file_obj.locations:
                nb_logger.warning(f"No locations found for file {file_id}")
                continue

            # Pick the first location that exists on disk AND sits under the
            # tag's directory.
            source_filepath = ""
            filename = ""
            for location in file_obj.locations:
                local_location_filepath = location.local_filepath(server_mount_path=file_server_location)

                if not os.path.exists(local_location_filepath):
                    nb_logger.warning(f"File {file_id} not found at {local_location_filepath}")
                    continue

                nb_logger.info(f"Found file {file_id} at {local_location_filepath}")

                # if the tag's full label is not in the file server directories, do something...
                if tag.full_tag_label_str.lower() not in location.file_server_directories.lower():
                    nb_logger.warning(f"File {file_id} does not match tag {filing_code_tag} in its location directories")
                    #TODO
                    continue

                source_filepath = local_location_filepath
                filename = location.filename
                break

            # if no valid file found, skip to next file
            if not filename:
                nb_logger.warning(f"No valid file found for {file_id}")
                continue

            temp_filepath = os.path.join(temp_dir, filename)
            try:
                # Work on a local copy rather than on the file-server original.
                shutil.copyfile(source_filepath, temp_filepath)

                try:
                    extractor = get_extractor_for_file(temp_filepath, extractors_list)
                except ValueError as e:
                    # get_extractor_for_file raises ValueError for extensions it
                    # has no extractor for; that is an expected skip, not an error.
                    nb_logger.warning(f"No suitable extractor found for file {file_id}: {e}")
                    continue

                if not extractor:
                    nb_logger.warning(f"No suitable extractor found for file {file_id}")
                    continue

                text = extractor(temp_filepath)
                if not text:
                    nb_logger.warning(f"No text extracted from {file_id}")
                    continue

                # Normalise the extracted text before embedding it.
                text = common_char_replacements(text)
                text = strip_diacritics(text)
                text = normalize_unicode(text)
                text = normalize_whitespace(text)

                emb_list = embedding_client.encode([text])
                emb_vec = emb_list[0] if emb_list else None
                if emb_vec is None:
                    nb_logger.warning(f"Failed to create embedding for file {file_id}")
                    continue

                file_embedding = FileEmbedding(
                    file_hash=file_obj.hash,
                    source_text=text,
                    minilm_model=embedding_client.model_name,
                    minilm_emb=emb_vec
                )
                db_session.add(file_embedding)
                db_session.commit()
                nb_logger.info(f"Processed file {file_id} with tag {filing_code_tag}")

                label_file_using_tag(
                    db_session=db_session,
                    file_obj=file_obj,
                    some_tag=tag
                )
            except Exception as e:
                # Reset the session first: after a failed flush/commit it would
                # otherwise reject all work for the remaining files.
                db_session.rollback()
                nb_logger.error(f"Error processing file {file_id}: {e}")
                nb_logger.debug(traceback.format_exc())
            finally:
                # Drop the temp copy now instead of letting up to n large files
                # pile up until the TemporaryDirectory is cleaned at the end.
                if os.path.exists(temp_filepath):
                    try:
                        os.remove(temp_filepath)
                    except OSError as cleanup_err:
                        # Best effort only; the directory cleanup will retry.
                        nb_logger.debug(f"Could not remove temp file {temp_filepath}: {cleanup_err}")

Output()

INFO - Processing file number 1 of 30: 940676
INFO - Found file 940676 at N:\PPDO\Records\106xx  2300 Delaware Westside Research Park\10641\10641\H - Submittals and O&M's\distribution document.lnk
No extractor found for file extension: lnk
ERROR - Error processing file 940676: No extractor found for file extension: lnk
DEBUG - Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_3580\1711867908.py", line 168, in <module>
    extractor = get_extractor_for_file(temp_filepath, extractors_list)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\basic_extraction.py", line 138, in get_extractor_for_file
    raise ValueError(f"No extractor found for file extension: {file_extension}")
ValueError: No extractor found for file extension: lnk

INFO - Processing file number 2 of 30: 303391
INFO - Found file 303391 at N:\PPDO\Records\28xx   Cowell Student Health Center\2806\2806\H - Submittals\Project Manager Email Correspondence\.MSG Files\Submi

Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 500606 with tag H - Submittals
INFO - Added tag H to file with hash bf1a039920071bbac4dd233b369ea9dfa704b93a
INFO - Processing file number 11 of 30: 255649
INFO - Found file 255649 at N:\PPDO\Records\26xx   Merrill College (College 4)\2635\2635\H - Submittals\H - Submittals and OMs\Div 07\079500-01-R1.pdf
INFO - Processed file 255649 with tag H - Submittals
INFO - Added tag H to file with hash ebc515893cea39d2922a574a2a045821c1ab1a79
INFO - Processing file number 12 of 30: 377761
INFO - Found file 377761 at N:\PPDO\Records\41xx   EH&S Facility\4101\4101\G - Construction\G12 - Request for Information (RFI)\4101 RFI 209 Response.pdf
INFO - Found file 377761 at N:\PPDO\Records\41xx   EH&S Facility\4101\4101\H - Submittals and O&M's\Div 01\4101_-_017700-03_R1_-_Closeout_-_Project_Record_Documents_(As-Built_Drawings)\3 - RFI RESPONSES\4101 RFI 209 Response.pdf
INFO - Processed file 377761 with tag H - Submittals
INFO - Added tag H to file with hash 159a6501e7902b9c3537

Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processing file number 14 of 30: 611809
INFO - Found file 611809 at N:\PPDO\Records\90xx  Campus Utilities\9098\9098D\H - Submittals\Hold 9098D\01 77 00 Closeout Submittals\9098D Submittal 01 77 00 -01\T0.3.dwg
No extractor found for file extension: dwg
ERROR - Error processing file 611809: No extractor found for file extension: dwg
DEBUG - Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_3580\1711867908.py", line 168, in <module>
    extractor = get_extractor_for_file(temp_filepath, extractors_list)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\basic_extraction.py", line 138, in get_extractor_for_file
    raise ValueError(f"No extractor found for file extension: {file_extension}")
ValueError: No extractor found for file extension: dwg

INFO - Processing file number 15 of 30: 317792
INFO - Found file 317792 at N:\PPDO\Records\30xx   Porter College (College 5)\3045\3045\H - Submittals\Division 2\02511-3195 through 320

Output()

Output()

Output()

Output()

Output()

Output()

The output file size is 1.62× larger than the input file.
Possible reasons for this include:
--deskew was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.

INFO - Processed file 317792 with tag H - Submittals
INFO - Added tag H to file with hash 0a103109147febe7f2851f22c9a3ebdf7cfbcdbc
INFO - Processing file number 16 of 30: 618064
INFO - Found file 618064 at N:\PPDO\Records\98xx   Campus Utilities 2012 on\9805\9805C\H - Submittals\Div 01\9805C_-_017700-03_R1_-_As-Builts\UCSC 9805C 01-77-00-03 AS- BUILTS PDF\E0_13_9805_RACHEL CARSON 1 LINE.pdf
INFO - Processed file 618064 with tag H - Submittals
INFO - Added tag H to file with hash 2d3009b551561477e851028595701b0e22377e7c
INFO - Processing file number 17 of 30: 380543
INFO - Found file 380543 at N:\PPDO\Records\41xx   EH&S Facility\4101\4101\H - Submittals a

Output()

This PDF is marked as a Tagged PDF. This often indicates that the PDF was generated from an office document and does not need OCR. PDF pages processed by OCRmyPDF may not be tagged correctly.


Output()

Output()

Output()

Output()

Output()

INFO - Processed file 716717 with tag H - Submittals
INFO - Added tag H to file with hash 9dc0f69f66ca3e5dfee9471d218a254964ed985b
INFO - Processing file number 24 of 30: 908674
INFO - Found file 908674 at N:\PPDO\Records\30xx   Porter College (College 5)\3048\3048\H - Submittals O&M\3048.H.phase 2 plan 4 diagram marked.pdf


Output()

This PDF is marked as a Tagged PDF. This often indicates that the PDF was generated from an office document and does not need OCR. PDF pages processed by OCRmyPDF may not be tagged correctly.


Output()