In [2]:
import logging
import os
import shutil
import traceback
import tempfile
from db_models import File, FileLocation, FilingTag, FileTagLabel, FileEmbedding, get_db_engine
from embedding.minilm import MiniLMEmbedder
from sqlalchemy import func
from sqlalchemy.orm import Session
from text_extraction.pdf_extraction import PDFTextExtractor
from text_extraction.basic_extraction import TextFileTextExtractor, TikaTextExtractor, get_extractor_for_file
from text_extraction.image_extraction import ImageTextExtractor
from text_extraction.office_doc_extraction import PresentationTextExtractor, SpreadsheetTextExtractor, WordFileTextExtractor
from text_extraction.web_extraction import HtmlTextExtractor, EmailTextExtractor
from text_extraction.extraction_utils import common_char_replacements, strip_diacritics, normalize_unicode, normalize_whitespace
from typing import Optional, Union
from logging_setups import setup_logger

nb_logger = setup_logger(name="NotebookLogger", notebook=True, level=logging.DEBUG)

import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

pdf_extractor = PDFTextExtractor()
txt_extractor = TextFileTextExtractor()
image_extractor = ImageTextExtractor()
presentation_extractor = PresentationTextExtractor()
spreadsheet_extractor = SpreadsheetTextExtractor()
word_extractor = WordFileTextExtractor()
html_extractor = HtmlTextExtractor()
email_extractor = EmailTextExtractor()
tika_extractor = TikaTextExtractor()

extractors_list = [
    pdf_extractor,
    txt_extractor,
    image_extractor,
    presentation_extractor,
    spreadsheet_extractor,
    word_extractor,
    html_extractor,
    email_extractor
]

def get_files_from_tag_locations(
        db_session: Session,
        tag_obj: FilingTag,
        n: int = 100,
        randomize: bool = False,
        exclude_embedded: bool = False,
        max_size_mb: Optional[float] = None
        ):
    """
    Retrieve files tagged with a specific tag string.
    Optionally exclude files that already have a FileEmbedding.
    """
    tag_locations = FileLocation.file_server_directories.ilike(f"%/{tag_obj.full_tag_label_str}%")

    # start the base query
    q = db_session.query(File) \
                  .join(FileLocation) \
                  .filter(tag_locations)

    if exclude_embedded:
        # left‐outer join to file_embeddings, keep only those with no match
        q = q.outerjoin(FileEmbedding, File.hash == FileEmbedding.file_hash)\
            .filter(FileEmbedding.file_hash == None)\
            
    if max_size_mb is not None:
        max_size_bytes = max_size_mb * 1024 * 1024
        q = q.filter(File.size <= max_size_bytes)

    if randomize:
        q = q.order_by(func.random())
    
    q = q.limit(n)

    return q.all()

def label_file_using_tag(
        db_session: Session,
        file_obj: File,
        some_tag: FilingTag|str,
        label_source: str = 'rule'
    ):
    """
    Label a file with a specific tag.
    """
    
    if isinstance(some_tag, str):
        tag_obj = FilingTag.retrieve_tag_by_label(db_session, some_tag)
        if not tag_obj:
            raise ValueError(f"Tag '{some_tag}' not found in the database.")
        current_tag = tag_obj
    elif isinstance(some_tag, FilingTag):
        tag_obj = some_tag
        current_tag = tag_obj
    else:
        raise TypeError("some_tag must be a FilingTag instance or a string representing the tag label.")
    
    while current_tag is not None:
        already = (
            db_session.query(FileTagLabel)
            .filter_by(file_id=file_obj.id, tag=current_tag.label)
            .first()
        )
        if already:
            nb_logger.info(f"File {file_obj.id} already labeled with tag {current_tag.label}")
            current_tag = current_tag.parent
            continue

        labeling_record = FileTagLabel(
            file_id=file_obj.id,
            file_hash=file_obj.hash,
            tag=current_tag.label,
            is_primary=True if not current_tag.parent else False,
            label_source=label_source
        )
        db_session.add(labeling_record)
        nb_logger.info(f"Added tag {current_tag.label} to file with hash {file_obj.hash}")
        current_tag = current_tag.parent

    db_session.commit()
    return labeling_record


filing_code_tag = "E7 - Value Engineering"
text_leng_threshold = 250
file_server_location = r"N:\PPDO\Records"
embedding_client = MiniLMEmbedder()
with Session(get_db_engine()) as db_session:
    tag = FilingTag.retrieve_tag_by_label(db_session, filing_code_tag)
    files = get_files_from_tag_locations(
        db_session= db_session,
        tag_obj = tag,
        randomize=True,
        n = 30,
        exclude_embedded=True,
        max_size_mb=150
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        for file_i, file_obj in enumerate(files):
            nb_logger.info(f"Processing file number {file_i + 1} of {len(files)}: {file_obj.id}")

            if not file_obj.locations:
                nb_logger.warning(f"No locations found for file {file_obj.id}")
                continue
            
            filepath = ""
            filename = ""
            for location in file_obj.locations:
                local_location_filepath = location.local_filepath(server_mount_path=file_server_location)

                if os.path.exists(local_location_filepath):
                    nb_logger.info(f"Found file {file_obj.id} at {local_location_filepath}")
                    
                    # if the tag's full label is not in the file server directories, do something...
                    if tag.full_tag_label_str.lower() not in location.file_server_directories.lower():
                        nb_logger.warning(f"File {file_obj.id} does not match tag {filing_code_tag} in its location directories")
                        #TODO
                        continue
                    filename = location.filename
                    break
                
                else:
                    nb_logger.warning(f"File {file_obj.id} not found at {local_location_filepath}")
                    continue
            
            # if no valid file found, skip to next file
            if not filename:
                nb_logger.warning(f"No valid file found for {file_obj.id}")
                continue

            try:
                temp_filepath = os.path.join(temp_dir, filename)
                shutil.copyfile(local_location_filepath, temp_filepath)
                extractor = get_extractor_for_file(temp_filepath, extractors_list)
                if extractor:
                    text = extractor(temp_filepath)
                else:
                    text = tika_extractor(temp_filepath)  # Fallback to Tika extractor
                
                # if the text is empty or too short, skip processing
                if text and len(text) >= text_leng_threshold:
                    text = common_char_replacements(text)
                    text = strip_diacritics(text)
                    text = normalize_unicode(text)
                    text = normalize_whitespace(text)
                    emb_list = embedding_client.encode([text])
                    emb_vec = emb_list[0] if emb_list else None
                    if emb_vec is not None:
                        file_embedding = FileEmbedding(
                            file_hash =file_obj.hash,
                            source_text =text,
                            minilm_model=embedding_client.model_name,
                            minilm_emb=emb_vec
                        )
                        db_session.add(file_embedding)
                        db_session.commit()
                        nb_logger.info(f"Processed file {file_obj.id} with tag {filing_code_tag}")
                    else:
                        nb_logger.warning(f"Failed to create embedding for file {file_obj.id}")
                        continue
                    
                    file_labeling = label_file_using_tag(
                        db_session=db_session,
                        file_obj=file_obj,
                        some_tag=tag
                    )
                else:
                    if len(text) < text_leng_threshold:
                        nb_logger.warning(f"Text length for file {file_obj.id} is {len(text)}:\n{text}")
                    else:
                        nb_logger.warning(f"No text extracted from {file_obj.id}")
                    continue
            
            except Exception as e:
                nb_logger.error(f"Error processing file {file_obj.id}: {e}")
                nb_logger.debug(traceback.format_exc())
                continue

INFO - Processing file number 1 of 30: 15560
INFO - Found file 15560 at N:\PPDO\Records\13xx   Original Ranch Buildings\1324\1324\E - Program and Design\E7 - Value Engineering\1324.E7.Layout 2.PDF


Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 15560 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 2df297890231f45967494ad584386ee1ce54c79b
INFO - Added tag E to file with hash 2df297890231f45967494ad584386ee1ce54c79b
INFO - Processing file number 2 of 30: 256564
INFO - Found file 256564 at N:\PPDO\Records\26xx   Merrill College (College 4)\2636\2636 - Merrill Phase 2\E - Program and Design\E7 - Value Engineering\Guzman VE Options..xlsx
INFO - Processed file 256564 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 7be7cc1fb8c2e83f864c6a0c705d5e58144349e8
INFO - Added tag E to file with hash 7be7cc1fb8c2e83f864c6a0c705d5e58144349e8
INFO - Processing file number 3 of 30: 354894
INFO - Found file 354894 at N:\PPDO\Records\32xx   Kresge College (College 6)\3239 Non-Acad Reno\3239\E - Program and Design\E7 - Value Engineering\UC Santa Cruz Mail - Re_ Kresge - Below Grade Waterproofing and Foundation Drainage - Follow Up.pdf
INFO - Processed file 354894 with tag E7 - Value

Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 962990 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 4e38b486b2117e1cc798060712ba0d26619b335a
INFO - Added tag E to file with hash 4e38b486b2117e1cc798060712ba0d26619b335a
INFO - Processing file number 5 of 30: 917476
INFO - Found file 917476 at N:\PPDO\Records\62xx   Physical Sciences Building (Natural Sciences 5)\6201\6201\6201\E - Program and Design\E7 - Value Engineering\Agenda-VE-2.doc
INFO - Processed file 917476 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 97293a8e28a96ce6acd299ff429b604ffb5792bc
INFO - Added tag E to file with hash 97293a8e28a96ce6acd299ff429b604ffb5792bc
INFO - Processing file number 6 of 30: 474400
INFO - Found file 474400 at N:\PPDO\Records\52xx   Earth Marine Science Building\5201\5201\E7 - Value Engineering\5201.E7.Value Engineering Session 1.pdf


Output()

Output()

Output()

Output()

Output()

Output()

The output file size is 1.60× larger than the input file.
Possible reasons for this include:
--deskew was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.

INFO - Processed file 474400 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash b994b8d0a8c3836ff805868d86e06287d84541b5
INFO - Added tag E to file with hash b994b8d0a8c3836ff805868d86e06287d84541b5
INFO - Processing file number 7 of 30: 133701
INFO - Found file 133701 at N:\PPDO\Records\111xx  Area Planning Documents and Studies\11105\E - Program and Design\E7 - Value Engineering\E7.1 - Value Engineering Correspondence\File 11105 Attendees 2.12.90 (2).pdf
INFO - Processed file 133701 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 90e7e0f13a11924acf63cc494a3e65d7b4fa4d3b
INFO - Added tag E to file with hash 90e7e0

Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 819596 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash e2ce167ca3b5e00edb7e6ab0d0c518e6cc284b66
INFO - Added tag E to file with hash e2ce167ca3b5e00edb7e6ab0d0c518e6cc284b66
INFO - Processing file number 12 of 30: 819590
INFO - Found file 819590 at N:\PPDO\Records\75xx   Humanities\7501\7501\E - Program and Design\E7 - Value Engineering\Value Engineering PDFs\VE-A-017.pdf


Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 819590 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash d2fdcd90ab9c5cd7393151e8040d3cf3215db992
INFO - Added tag E to file with hash d2fdcd90ab9c5cd7393151e8040d3cf3215db992
INFO - Processing file number 13 of 30: 236276
INFO - Found file 236276 at N:\PPDO\Records\23xx   Crown College (College 3)\2358\2358\E - Program and Design\E7 - Value Engineering\Email - Site VE Items.pdf
INFO - Processed file 236276 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 4b2b43e34d092b9c96c339816197b280d4085600
INFO - Added tag E to file with hash 4b2b43e34d092b9c96c339816197b280d4085600
INFO - Processing file number 14 of 30: 805564
INFO - Found file 805564 at N:\PPDO\Records\68xx   East Campus Infill\6801\6801\E- Program and Design\E7 - Value Engineering\File 6801 100%SD VE List.pdf
INFO - Processed file 805564 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash dba5a89d0673f5de993dcfce289cfaeabf7163ee
INFO - Added tag 

Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 819589 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 92f0de79933239a68af1dfc957d31234f5ee46d7
INFO - Added tag E to file with hash 92f0de79933239a68af1dfc957d31234f5ee46d7
INFO - Processing file number 18 of 30: 819602
INFO - Found file 819602 at N:\PPDO\Records\75xx   Humanities\7501\7501\E - Program and Design\E7 - Value Engineering\Value Engineering PDFs\VE-SS-030.pdf


Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 819602 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash ef0e1f2d3cacc504ed8bc88ca325a67ef91b3fcc
INFO - Added tag E to file with hash ef0e1f2d3cacc504ed8bc88ca325a67ef91b3fcc
INFO - Processing file number 19 of 30: 132560
INFO - Found file 132560 at N:\PPDO\Records\110xx  Infrastructure Planning Documents and Studies\11005\11005\E - Program and Design\E7 - Value Engineering\11005.E7.Area Planning Documents and Studies.pdf


Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 132560 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 41071d3003a99bab9e7e18cfd0d13b2c445f9a79
INFO - Added tag E to file with hash 41071d3003a99bab9e7e18cfd0d13b2c445f9a79
INFO - Processing file number 20 of 30: 247365
INFO - Found file 247365 at N:\PPDO\Records\26xx   Merrill College (College 4)\2635\2635\E - Program and Design\E5 - Program and Design Correspondence\File 2635 Additional Utilitiy Survey.pdf
INFO - Found file 247365 at N:\PPDO\Records\26xx   Merrill College (College 4)\2635\2635\E - Program and Design\E7 - Value Engineering\File 2635 Proposed Value Engineering Log.pdf
INFO - Processed file 247365 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 754b1062eadefc9e4127d94f3a66757af867a9d3
INFO - Added tag E to file with hash 754b1062eadefc9e4127d94f3a66757af867a9d3
INFO - Processing file number 21 of 30: 819592
INFO - Found file 819592 at N:\PPDO\Records\75xx   Humanities\7501\7501\E - Program and Design\E7 

Output()

Output()

Output()

Output()

Output()

Output()

722.2
Q G
aD)MOUNTEDGENERATOR NATURALNGAS
280kW
NEMA
3R”
NON-SOUNDATTENUATED
ENCLOSURE
>
JND
INICATION
JUITS——_#
!
PRIMARYSWITCHL
LA
12004

INFO - Processing file number 22 of 30: 339495
INFO - Found file 339495 at N:\PPDO\Records\32xx   Kresge College (College 6)\3238\3238\E - Program and Design\E6 - Reports (soils, structural, acoustic, calculations, etc.)\190923 Kresge - ACAD VE idea\Kresge Constructability Review.xlsx
INFO - Found file 339495 at N:\PPDO\Records\32xx   Kresge College (College 6)\3238\3238\E - Program and Design\E7 - Value Engineering\190923 Kresge - ACAD VE idea\Kresge Constructability Review.xlsx
INFO - Processed file 339495 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash c61af6a539f20bb49a7dff8cdc89a122cf82c73f
INFO - Added tag E to file with hash c61af6a539f20bb49a7dff8cdc89a122cf82c73f
INFO - Processing file number 23 of 30: 468828
INFO - Found file 468828 at N:\PPDO\Records\49xx   Long Marine Lab\4934\4934\E - Program and Design\E7 - Value

Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 468828 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 7532ab6a92b9e222abbb011306825cf57f4d6bc8
INFO - Added tag E to file with hash 7532ab6a92b9e222abbb011306825cf57f4d6bc8
INFO - Processing file number 24 of 30: 236275
INFO - Found file 236275 at N:\PPDO\Records\23xx   Crown College (College 3)\2358\2358\E - Program and Design\E7 - Value Engineering\2017-02-23 - VE Meeting.pdf


Output()

Output()

Output()

Output()

Output()

Output()

INFO - Processed file 236275 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash c801677c328b4eb6963d87831a4bf106f4bdbbdd
INFO - Added tag E to file with hash c801677c328b4eb6963d87831a4bf106f4bdbbdd
INFO - Processing file number 25 of 30: 612681
INFO - Found file 612681 at N:\PPDO\Records\92xx   Antenna and Supporting Infrastructures\9201\9201-007\E - Program and Design\Final Report NextG UCSC Oakes.pdf
INFO - Found file 612681 at N:\PPDO\Records\92xx   Antenna and Supporting Infrastructures\9201\9201-007\E - Program and Design\USCS RF STUDIES\Final Report NextG UCSC Oakes.pdf
INFO - Found file 612681 at N:\PPDO\Records\92xx   Antenna and Supporting Infrastructures\9200\9200-007\D - Environmental Review Process\EMF_RF Studies\Preliminary\Final Report NextG UCSC Oakes.pdf
INFO - Found file 612681 at N:\PPDO\Records\92xx   Antenna and Supporting Infrastructures\9201\9201-004\DOCS\E- Program and Design\E7 - Value Engineering\E7.2 - VE Workshop Minutes, Summaries, Final 

Output()

Output()

Output()

Output()

Output()

Output()

(
aaa
NI)
FIVOS
OIHdvuyD

INFO - Processing file number 29 of 30: 339692
INFO - Found file 339692 at N:\PPDO\Records\32xx   Kresge College (College 6)\3238\3238\E - Program and Design\E7 - Value Engineering\kresgevesketches.zip
No extractor found for file extension: zip
INFO - Processed file 339692 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 20fed40bfb235f1f1509f4cb1a434e3a676d9d46
INFO - Added tag E to file with hash 20fed40bfb235f1f1509f4cb1a434e3a676d9d46
INFO - Processing file number 30 of 30: 339688
INFO - Found file 339688 at N:\PPDO\Records\32xx   Kresge College (College 6)\3238\3238\E - Program and Design\E7 - Value Engineering\190510_100DDCOSTESTIMATE_FINAL_VE_.xlsx
INFO - Processed file 339688 with tag E7 - Value Engineering
INFO - Added tag E7 to file with hash 882692bc722943e585e89400cc3b4659c1424da2
INFO - Added tag E to file with hash 882692bc722943e585e89400cc3b4659c1424da2
