In [1]:
from db.db import backup_database

backup_destination = r"N:\PPDO\BS\Records Department\Archive_Data"
backup_database(backup_dir=backup_destination, compress=True)

'N:\\PPDO\\BS\\Records Department\\Archive_Data\\archives_backup_20251216_142352.sql.bz2'

In [1]:
# The purpose of this script is to find build a dataframe of files in a target directory,
# sorted by similarity to a target file, based on text embeddings stored in the database.

from sqlalchemy.orm import Session
from sqlalchemy import or_
import pandas as pd
import numpy as np
from db import get_db_engine
from db.models import FileLocation, File, FileContent
from text_extraction.pdf_extraction import PDFTextExtractor
from embedding.minilm import MiniLMEmbedder
from utils import extract_server_dirs, build_file_path
server_mount = r"N:\PPDO\Records"
target_filepath = r"N:\PPDO\Records\27xx   Applied Sciences Baskin Engineering\2739\2739\H - Submittals\File 2739.H DMG North Inc O&M. Air Handling Unit with Coil.pdf"
target_location = r"N:\PPDO\Records\27xx   Applied Sciences Baskin Engineering\2739\2739"


pdf_extractor = PDFTextExtractor()
embedder = MiniLMEmbedder()

text = pdf_extractor(target_filepath)
embeddings = embedder.encode(text)
target_loc_dirs = extract_server_dirs(full_path=target_location, base_mount=server_mount)
engine = get_db_engine()

target_sim_distance = lambda file_embeds: 1 - np.dot(file_embeds, embeddings)
build_local_path = lambda row: build_file_path(base_mount=server_mount, server_dir=row['file_server_directories'], filename=row['filename'])

with Session(engine) as db_session:
    files_located_in_dir = or_(
        FileLocation.file_server_directories == target_loc_dirs,
        FileLocation.file_server_directories.startswith(target_loc_dirs + '/')
    )

    # fetch server dirs, filenames and embeddings for all files in the directory
    query = (
        db_session.query(
            FileLocation.file_server_directories,
            FileLocation.filename,                    # ← use FileLocation.filename, not File.filename
            FileContent.minilm_emb
        )
        .join(File, File.id == FileLocation.file_id)
        .join(FileContent, File.hash == FileContent.file_hash)   # ← join on hash, not on id
        .filter(files_located_in_dir)
    )
    results = query.all()

    # build dataframe …
    df = pd.DataFrame(results, columns=['file_server_directories','filename','minilm_emb'])
    df['local_path'] = df.apply(build_local_path, axis=1)
    df['distance']   = df['minilm_emb'].apply(lambda emb: float(target_sim_distance(np.array(emb))))
    df = df.sort_values('distance').reset_index(drop=True)

    display(df[['local_path','distance']])

2025-11-12 13:57:04,647 pikepdf._core INFO pikepdf C++ to Python logger bridge initialized


Rebuilding cache of generated files for COM support...
Checking 00020905-0000-0000-C000-000000000046x0x8x7
Could not add module (IID('{00020905-0000-0000-C000-000000000046}'), 0, 8, 7) - <class 'AttributeError'>: module 'win32com.gen_py.00020905-0000-0000-C000-000000000046x0x8x7' has no attribute 'CLSIDToClassMap'
Done.


  r"N:\PPDO\Records"  (Windows)  or  "/mnt/records" (Linux).
2025-11-12 13:59:44,865 sentence_transformers.SentenceTransformer INFO Use pytorch device_name: cpu
2025-11-12 13:59:44,867 sentence_transformers.SentenceTransformer INFO Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-11-12 14:00:08,185 text_extraction.pdf_extraction INFO OCR needed for document: N:\PPDO\Records\27xx   Applied Sciences Baskin Engineering\2739\2739\H - Submittals\File 2739.H DMG North Inc O&M. Air Handling Unit with Coil.pdf


Output()

2025-11-12 14:00:10,734 ocrmypdf._pipelines.ocr INFO Start processing 7 pages concurrently


Output()

2025-11-12 14:00:18,615 ocrmypdf._pipeline INFO page is facing ⇧, confidence 8.28 - no change


2025-11-12 14:00:20,483 ocrmypdf._pipeline INFO page is facing ⇧, confidence 11.62 - no change


2025-11-12 14:00:21,126 ocrmypdf._pipeline INFO page is facing ⇧, confidence 7.96 - no change
2025-11-12 14:00:21,184 ocrmypdf._pipeline INFO page is facing ⇧, confidence 8.78 - no change


2025-11-12 14:00:21,584 ocrmypdf._pipeline INFO page is facing ⇧, confidence 12.70 - no change




2025-11-12 14:00:29,817 ocrmypdf._pipeline INFO page is facing ⇧, confidence 9.16 - no change


2025-11-12 14:00:30,550 ocrmypdf._pipeline INFO page is facing ⇧, confidence 12.45 - no change


2025-11-12 14:00:32,631 ocrmypdf._pipeline INFO page is facing ⇧, confidence 11.46 - no change


2025-11-12 14:00:33,606 ocrmypdf._pipeline INFO page is facing ⇧, confidence 11.13 - no change


2025-11-12 14:00:33,960 ocrmypdf._pipeline INFO page is facing ⇧, confidence 13.86 - no change


2025-11-12 14:00:34,140 ocrmypdf._pipeline INFO page is facing ⇧, confidence 11.97 - no change


2025-11-12 14:00:34,901 ocrmypdf._pipeline INFO page is facing ⇧, confidence 14.01 - rotation appears correct


2025-11-12 14:00:42,312 ocrmypdf._pipeline INFO page is facing ⇩, confidence 0.63 - confidence too low to rotate


2025-11-12 14:00:43,911 ocrmypdf._pipeline INFO page is facing ⇧, confidence 10.00 - no change


2025-11-12 14:00:45,829 ocrmypdf._pipeline INFO page is facing ⇧, confidence 5.78 - no change


2025-11-12 14:00:46,186 ocrmypdf._pipeline INFO page is facing ⇧, confidence 5.78 - no change


2025-11-12 14:00:46,471 ocrmypdf._pipeline INFO page is facing ⇧, confidence 5.30 - no change


2025-11-12 14:00:47,129 ocrmypdf._pipeline INFO page is facing ⇧, confidence 4.79 - no change


2025-11-12 14:00:49,981 ocrmypdf._pipeline INFO page is facing ⇧, confidence 7.83 - no change


2025-11-12 14:00:54,990 ocrmypdf._pipeline INFO page is facing ⇧, confidence 5.46 - no change


2025-11-12 14:00:56,679 ocrmypdf._pipeline INFO page is facing ⇧, confidence 6.45 - no change






2025-11-12 14:01:11,113 ocrmypdf._pipelines.ocr INFO Postprocessing...


Output()

2025-11-12 14:01:11,656 ocrmypdf._pipeline INFO Image optimization ratio: 1.00 savings: 0.0%
2025-11-12 14:01:11,660 ocrmypdf._pipeline INFO Total file size ratio: 0.19 savings: -426.4%
Possible reasons for this include:
--deskew was issued, causing transcoding.
Optimization was disabled.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-11-12 14:01:13,875 db.db INFO Creating database engine


Unnamed: 0,local_path,distance
0,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.316600
1,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.346429
2,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.351275
3,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.352529
4,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.357121
...,...,...
2126,N:\PPDO\Records\27xx Applied Sciences Baskin...,0.993532
2127,N:\PPDO\Records\27xx Applied Sciences Baskin...,1.011335
2128,N:\PPDO\Records\27xx Applied Sciences Baskin...,1.018780
2129,N:\PPDO\Records\27xx Applied Sciences Baskin...,1.018780


In [None]:
# The purpose of this script is to add an exclusion pattern to the database for a specific directory and its contents.

from sqlalchemy.orm import Session
from db import get_db_engine
from db.models import PathPattern, Base
from utils import extract_server_dirs

def add_exclusion_pattern():
    # Full path to exclude
    full_path = r"N:\PPDO\Records\16xx   Cowell College\1639\1639\G - Construction\G22 - Legal Claims and Public Records Act Request\Provided to Legal 2.5.13 FTP"
    base_mount = r"N:\PPDO\Records"
    
    # Convert to server-relative path format (the format stored in the database)
    server_relative_path = extract_server_dirs(full_path, base_mount)
    
    # Create pattern that will match this directory and all contents
    pattern = f"{server_relative_path}/*"
    
    print(f"Creating exclusion pattern: {pattern}")
    
    # Connect to database
    engine = get_db_engine()
    
    with Session(engine) as session:
        # Check if pattern already exists
        existing = session.query(PathPattern).filter_by(pattern=pattern).first()
        if existing:
            print(f"Pattern already exists with ID {existing.id}")
            return
        
        # Create new pattern with both embedding and tagging contexts
        new_pattern = PathPattern(
            pattern=pattern,
            pattern_type="directory",
            treatment="exclude",
            description="Legal claims and public records request documents",
            contexts=["add_files_embedding", "add_files_tagging"],
            enabled=True
        )
        
        session.add(new_pattern)
        session.commit()
        print(f"Exclusion pattern added with ID {new_pattern.id}")

# Execute the function
add_exclusion_pattern()

2025-09-23 15:54:29,268 db.db INFO Creating database engine


Creating exclusion pattern: 16xx   Cowell College/1639/1639/G - Construction/G22 - Legal Claims and Public Records Act Request/Provided to Legal 2.5.13 FTP/*
Exclusion pattern added with ID 1


In [1]:
import os
from pathlib import Path
from itertools import islice


def visualize_directory_tree(
    dir_path: Path,
    level: int = -1,
    limit_to_directories: bool = False,
    length_limit: int = 1000000,
    exclusion_list: list = None
):
    """Given a directory Path object print a visual tree structure, with optional exclusions."""
    space = '    '
    branch = '│   '
    # pointers:
    tee = '├── '
    last = '└── '

    dir_path = Path(dir_path)  # accept string coerceable to Path
    files = 0
    directories = 0
    exclusion_set = set(exclusion_list) if exclusion_list else set()

    def inner(dir_path: Path, prefix: str = '', level=-1):
        nonlocal files, directories
        if not level:
            return  # 0, stop iterating
        if limit_to_directories:
            contents = [d for d in dir_path.iterdir() if d.is_dir() and d.name not in exclusion_set]
        else:
            contents = [d for d in dir_path.iterdir() if d.name not in exclusion_set]
        pointers = [tee] * (len(contents) - 1) + [last] if contents else []
        for pointer, path in zip(pointers, contents):
            if path.is_dir():
                yield prefix + pointer + path.name
                directories += 1
                extension = branch if pointer == tee else space
                yield from inner(path, prefix=prefix + extension, level=level - 1)
            elif not limit_to_directories:
                yield prefix + pointer + path.name
                files += 1

    print(dir_path.name)
    iterator = inner(dir_path, level=level)
    for line in islice(iterator, length_limit):
        print(line)
    if next(iterator, None):
        print(f'... length_limit, {length_limit}, reached, counted:')
    print(f'\n{directories} directories' + (f', {files} files' if files else ''))
# exclude irrelevant directories
exclude_dirs = [
    '.venv',
    '__pycache__',
    'dev\__pycache__',
    '.git',
    'test_files'
]

visualize_directory_tree(
    dir_path= os.getcwd(),
    exclusion_list= exclude_dirs
)

file_code_tagger
├── .env
├── .github
│   └── copilot-instructions.md
├── .gitignore
├── .vscode
│   └── launch.json
├── app.log
├── cli
│   ├── add_files.py
│   ├── admin.py
│   ├── extract_date_mentions.py
│   └── __init__.py
├── db
│   ├── db.py
│   ├── models.py
│   └── __init__.py
├── dev
│   ├── add_file_tags.py
│   ├── archives_db.py
│   ├── archives_schema.sql
│   ├── convo_summary_20250916.md
│   ├── create_db_tables.sql
│   ├── current_project_directory_schema
│   ├── dev_notebook_archive.ipynb
│   ├── dev_utils.py
│   ├── docker-compose.yml
│   ├── embedding_counts_by_tag.csv
│   ├── extract_db_schema.ps1
│   ├── filing_tags_by_location.csv
│   ├── full_filing_tags_table.json
│   ├── new_filing_codes.txt
│   ├── project_structure_42cdd24
│   ├── proposed project structure.txt
│   ├── query_archived_files.py
│   └── sync_tables_v1.py
├── dev_notebook2.ipynb
├── dim_reduction_viz.ipynb
├── embedding
│   ├── base.py
│   ├── minilm.py
│   └── __init__.py
├── knn
│   ├── knn.py
│

  'dev\__pycache__',
