In [None]:
from db.db import backup_database

# Directory that receives the database backup.
backup_destination = r"N:\PPDO\BS\Records Department\Archive_Data"
# Run the backup into that directory.
# NOTE(review): compress=True presumably asks for a compressed archive — confirm against db.db.backup_database.
backup_database(backup_dir=backup_destination, compress=True)

In [1]:
# The purpose of this script is to build a dataframe of the files in a target directory,
# sorted by similarity to a target file, based on text embeddings stored in the database.

from sqlalchemy.orm import Session
from sqlalchemy import or_
import pandas as pd
import numpy as np
from db import get_db_engine
from db.models import FileLocation, File, FileContent
from text_extraction.pdf_extraction import PDFTextExtractor
from embedding.minilm import MiniLMEmbedder
from utils import extract_server_dirs, build_file_path
# --- Configuration -----------------------------------------------------------
# Root of the records share; used to convert between local paths and the
# server-relative directories handled by extract_server_dirs / build_file_path.
server_mount = r"N:\PPDO\Records"
# File whose extracted text is embedded and used as the similarity query.
target_filepath = r"N:\PPDO\Records\27xx   Applied Sciences Baskin Engineering\2739\2739\H - Submittals\File 2739.H DMG North Inc O&M. Air Handling Unit with Coil.pdf"
# Directory whose files (including those in sub-directories) are ranked.
target_location = r"N:\PPDO\Records\27xx   Applied Sciences Baskin Engineering\2739\2739"


pdf_extractor = PDFTextExtractor()
embedder = MiniLMEmbedder()

# Extract the target document's text and embed it once up front.
text = pdf_extractor(target_filepath)
embeddings = embedder.encode(text)
target_loc_dirs = extract_server_dirs(full_path=target_location, base_mount=server_mount)
engine = get_db_engine()


def target_sim_distance(file_embeds):
    """Return ``1 - dot(file_embeds, embeddings)`` for one stored embedding.

    NOTE(review): this equals cosine distance only if both vectors are
    unit-normalized — confirm that the embedder and the stored vectors are.
    """
    return 1 - np.dot(file_embeds, embeddings)


def build_local_path(row):
    """Build the local path for a record with 'file_server_directories' and 'filename' keys."""
    return build_file_path(
        base_mount=server_mount,
        server_dir=row['file_server_directories'],
        filename=row['filename'],
    )


with Session(engine) as db_session:
    # Match the directory itself or anything below it. autoescape=True keeps
    # '%' and '_' in directory names from acting as LIKE wildcards.
    files_located_in_dir = or_(
        FileLocation.file_server_directories == target_loc_dirs,
        FileLocation.file_server_directories.startswith(target_loc_dirs + '/', autoescape=True),
    )

    # Fetch server dirs, filenames and embeddings for all files in the directory.
    query = (
        db_session.query(
            FileLocation.file_server_directories,
            FileLocation.filename,   # the per-location filename, not File.filename
            FileContent.minilm_emb,
        )
        .join(File, File.id == FileLocation.file_id)
        .join(FileContent, File.hash == FileContent.file_hash)   # content is keyed by file hash, not id
        .filter(files_located_in_dir)
        # Rows without an embedding cannot be scored; np.dot would fail on None.
        .filter(FileContent.minilm_emb.isnot(None))
    )
    results = query.all()

# The rows are plain tuples, so the remaining work does not need the session.
df = pd.DataFrame(results, columns=['file_server_directories', 'filename', 'minilm_emb'])

# Built from records rather than df.apply(axis=1): apply returns a DataFrame
# (not a Series) on an empty frame, which breaks the column assignment.
df['local_path'] = [
    build_local_path(record)
    for record in df[['file_server_directories', 'filename']].to_dict('records')
]
df['distance'] = pd.Series(
    [float(target_sim_distance(np.asarray(emb))) for emb in df['minilm_emb']],
    index=df.index,
    dtype='float64',
)
# Smallest distance first, i.e. most similar files at the top.
df = df.sort_values('distance').reset_index(drop=True)

display(df[['local_path', 'distance']])

2025-09-24 20:05:07,381 pikepdf._core INFO pikepdf C++ to Python logger bridge initialized
2025-09-24 20:05:38,222 sentence_transformers.SentenceTransformer INFO Use pytorch device_name: cpu
2025-09-24 20:05:38,223 sentence_transformers.SentenceTransformer INFO Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-09-24 20:05:42,502 text_extraction.pdf_extraction INFO OCR needed for document: N:\PPDO\Records\27xx   Applied Sciences Baskin Engineering\2739\2739\H - Submittals\File 2739.H DMG North Inc O&M. Air Handling Unit with Coil.pdf


Output()

2025-09-24 20:05:44,960 ocrmypdf._pipelines.ocr INFO Start processing 7 pages concurrently


Output()

2025-09-24 20:05:46,380 ocrmypdf._pipeline INFO page is facing ⇧, confidence 8.28 - no change


2025-09-24 20:05:46,542 ocrmypdf._pipeline INFO page is facing ⇧, confidence 1.87 - no change


2025-09-24 20:05:47,249 ocrmypdf._pipeline INFO page is facing ⇧, confidence 7.96 - no change


2025-09-24 20:05:47,299 ocrmypdf._pipeline INFO page is facing ⇧, confidence 10.13 - no change
2025-09-24 20:05:47,303 ocrmypdf._pipeline INFO page is facing ⇧, confidence 12.70 - no change
2025-09-24 20:05:47,306 ocrmypdf._pipeline INFO page is facing ⇧, confidence 8.78 - no change
2025-09-24 20:05:47,323 ocrmypdf._pipeline INFO page is facing ⇧, confidence 11.62 - no change




2025-09-24 20:05:49,742 ocrmypdf._pipeline INFO page is facing ⇧, confidence 9.16 - no change


2025-09-24 20:05:51,188 ocrmypdf._pipeline INFO page is facing ⇧, confidence 11.46 - no change


2025-09-24 20:05:51,235 ocrmypdf._pipeline INFO page is facing ⇧, confidence 11.13 - no change


2025-09-24 20:05:51,704 ocrmypdf._pipeline INFO page is facing ⇧, confidence 11.97 - no change
2025-09-24 20:05:51,741 ocrmypdf._pipeline INFO page is facing ⇧, confidence 13.86 - no change


2025-09-24 20:05:52,051 ocrmypdf._pipeline INFO page is facing ⇧, confidence 14.01 - rotation appears correct


2025-09-24 20:05:53,159 ocrmypdf._pipeline INFO page is facing ⇧, confidence 11.45 - no change


2025-09-24 20:05:55,721 ocrmypdf._pipeline INFO page is facing ⇩, confidence 0.63 - confidence too low to rotate


2025-09-24 20:05:56,504 ocrmypdf._pipeline INFO page is facing ⇧, confidence 10.00 - no change


2025-09-24 20:05:57,473 ocrmypdf._pipeline INFO page is facing ⇧, confidence 5.78 - no change


2025-09-24 20:05:58,614 ocrmypdf._pipeline INFO page is facing ⇧, confidence 5.78 - no change


2025-09-24 20:05:59,306 ocrmypdf._pipeline INFO page is facing ⇧, confidence 4.79 - no change


2025-09-24 20:06:00,170 ocrmypdf._pipeline INFO page is facing ⇧, confidence 5.30 - no change


2025-09-24 20:06:00,945 ocrmypdf._pipeline INFO page is facing ⇧, confidence 7.83 - no change




2025-09-24 20:06:04,903 ocrmypdf._pipeline INFO page is facing ⇧, confidence 5.46 - no change


2025-09-24 20:06:05,182 ocrmypdf._pipeline INFO page is facing ⇧, confidence 6.45 - no change




2025-09-24 20:06:11,855 ocrmypdf._pipelines.ocr INFO Postprocessing...


Output()

2025-09-24 20:06:11,996 ocrmypdf._pipeline INFO Image optimization ratio: 1.00 savings: 0.0%
2025-09-24 20:06:11,997 ocrmypdf._pipeline INFO Total file size ratio: 0.19 savings: -426.4%
Possible reasons for this include:
--deskew was issued, causing transcoding.
Optimization was disabled.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-09-24 20:06:12,655 db.db INFO Creating database engine


Unnamed: 0,local_path,distance
0,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.316600
1,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.346429
2,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.351275
3,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.352529
4,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.357121
...,...,...
2120,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.993532
2121,N:\PPDO\Records\27xx Applied Sciences Baskin...,1.011335
2122,N:\PPDO\Records\27xx Applied Sciences Baskin...,1.018780
2123,N:\PPDO\Records\27xx Applied Sciences Baskin...,1.018780


In [None]:
# The purpose of this script is to add an exclusion pattern to the database for a specific directory and its contents.

from sqlalchemy.orm import Session
from db import get_db_engine
from db.models import PathPattern, Base
from utils import extract_server_dirs

# Defaults reproduce the directory this cell was originally written to exclude.
DEFAULT_EXCLUSION_PATH = r"N:\PPDO\Records\16xx   Cowell College\1639\1639\G - Construction\G22 - Legal Claims and Public Records Act Request\Provided to Legal 2.5.13 FTP"
DEFAULT_BASE_MOUNT = r"N:\PPDO\Records"


def add_exclusion_pattern(
    full_path=DEFAULT_EXCLUSION_PATH,
    base_mount=DEFAULT_BASE_MOUNT,
    description="Legal claims and public records request documents",
    contexts=("add_files_embedding", "add_files_tagging"),
):
    """Insert an enabled "exclude" PathPattern for a directory, unless it already exists.

    Args:
        full_path: Full local path of the directory to exclude.
        base_mount: Mount root passed to ``extract_server_dirs`` to obtain the
            server-relative form of ``full_path``.
        description: Free-text description stored on the pattern.
        contexts: Context names stored on the pattern; copied to a list
            before being handed to ``PathPattern``.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Convert to server-relative path format (the format stored in the database).
    server_relative_path = extract_server_dirs(full_path, base_mount)

    # Pattern intended to match this directory and all of its contents.
    pattern = f"{server_relative_path}/*"

    print(f"Creating exclusion pattern: {pattern}")

    engine = get_db_engine()

    with Session(engine) as session:
        # Skip the insert when an identical pattern is already stored.
        existing = session.query(PathPattern).filter_by(pattern=pattern).first()
        if existing:
            print(f"Pattern already exists with ID {existing.id}")
            return

        new_pattern = PathPattern(
            pattern=pattern,
            pattern_type="directory",
            treatment="exclude",
            description=description,
            contexts=list(contexts),
            enabled=True,
        )

        session.add(new_pattern)
        session.commit()
        print(f"Exclusion pattern added with ID {new_pattern.id}")

# Execute the function with its defaults
add_exclusion_pattern()

2025-09-23 15:54:29,268 db.db INFO Creating database engine


Creating exclusion pattern: 16xx   Cowell College/1639/1639/G - Construction/G22 - Legal Claims and Public Records Act Request/Provided to Legal 2.5.13 FTP/*
Exclusion pattern added with ID 1
