In [10]:
from db.db import backup_database

# Directory handed to backup_database as the backup location.
backup_destination = r"N:\PPDO\BS\Records Department\Archive_Data"

# Left as the cell's last expression so that whatever backup_database returns
# is shown as the cell output.
backup_database(
    compress=True,
    backup_dir=backup_destination,
)

'N:\\PPDO\\BS\\Records Department\\Archive_Data\\archives_backup_20250817_102822.sql.bz2'

In [9]:
from sqlalchemy.orm import Session
from sqlalchemy import or_
import pandas as pd
import numpy as np
from db import get_db_engine
from db.models import FileLocation, File, FileEmbedding
from text_extraction.pdf_extraction import PDFTextExtractor
from embedding.minilm import MiniLMEmbedder
from utils import extract_server_dirs, build_file_path
# Inputs for this search. These stay plain notebook globals under their
# original names so other cells that read them keep working.
server_mount = r"N:\PPDO\Records"
# Reference document: its extracted text is embedded and used as the query vector.
target_filepath = r"N:\PPDO\Records\16xx   Cowell College\1639\1639\F - Bid Documents and Contract Award\F5 - Drawings and Specifications\Rebid Specs\Record Specs\05510 Precast Concrete Tread Metal Stairs.pdf"
# Only files recorded in this directory, or anywhere below it, are compared.
target_location = r"N:\PPDO\Records\37xx   Oakes College\3701\3701"


pdf_extractor = PDFTextExtractor()
embedder = MiniLMEmbedder()

# Embed the reference document once; every candidate row is scored against it.
text = pdf_extractor(target_filepath)
embeddings = embedder.encode(text)
target_loc_dirs = extract_server_dirs(full_path=target_location, base_mount=server_mount)
engine = get_db_engine()


def target_sim_distance(file_embeds):
    """Return ``1 - dot(file_embeds, embeddings)`` for one stored embedding.

    Smaller values mean the stored vector is closer to the reference document.
    NOTE(review): this is only a cosine distance if both vectors are
    unit-normalised - confirm against the embedder configuration.

    A missing embedding (``None`` or a scalar NaN from a NULL column) yields
    NaN instead of raising, so such rows sort to the end of the ranking.
    """
    if file_embeds is None or np.ndim(file_embeds) == 0:
        return np.nan
    return 1 - np.dot(file_embeds, embeddings)


def build_local_path(row):
    """Build the local path for one result row from its directory and filename."""
    return build_file_path(
        base_mount=server_mount,
        server_dir=row['file_server_directories'],
        filename=row['filename'],
    )


with Session(engine) as db_session:
    # Match the target directory itself or anything beneath it. The trailing
    # '/' keeps sibling directories that merely share a name prefix out.
    # autoescape=True makes '%' and '_' in directory names match literally
    # instead of acting as LIKE wildcards; the escape character (default '/')
    # is itself escaped, so the slashes in the path are still matched as-is.
    files_located_in_dir = or_(
        FileLocation.file_server_directories == target_loc_dirs,
        FileLocation.file_server_directories.startswith(
            target_loc_dirs + '/', autoescape=True
        ),
    )

    query = (
        db_session.query(FileLocation, FileEmbedding)
        .join(File, FileLocation.file)
        .join(FileEmbedding, File.hash == FileEmbedding.file_hash)
        .filter(files_located_in_dir)
    )

    # Load the joined rows into a DataFrame.
    df = pd.read_sql(query.statement, db_session.bind)

    # NOTE: despite its name, the 'similarity' column holds a distance
    # (1 - dot product), so the ascending sort puts the closest files first.
    # The column name is kept so code that already reads it keeps working.
    df['similarity'] = df['minilm_emb'].apply(target_sim_distance)
    df = df.sort_values(by='similarity', ascending=True)

    # Add local file path. An explicit row loop is used instead of
    # df.apply(axis=1) because apply returns a DataFrame (not a Series) when
    # the frame has no rows, which would make this assignment fail.
    df['local_filepath'] = [build_local_path(row) for _, row in df.iterrows()]

df.head()




2025-08-15 19:02:14,214 sentence_transformers.SentenceTransformer INFO Use pytorch device_name: cpu
2025-08-15 19:02:14,217 sentence_transformers.SentenceTransformer INFO Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-08-15 19:02:16,307 text_extraction.pdf_extraction INFO Extracting text from PDF: N:\PPDO\Records\16xx   Cowell College\1639\1639\F - Bid Documents and Contract Award\F5 - Drawings and Specifications\Rebid Specs\Record Specs\05510 Precast Concrete Tread Metal Stairs.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-15 19:02:16,902 db.db INFO Creating database engine


Unnamed: 0,id,file_id,existence_confirmed,hash_confirmed,file_server_directories,filename,file_hash,source_text,minilm_model,minilm_emb,mpnet_model,mpnet_emb,updated_at,similarity,local_filepath
140,551144,98760,2025-08-13 11:45:16.917310,2025-08-13 02:35:25.201997,37xx Oakes College/3701/3701/F - Bid Documen...,File 3701 A 16.TIF,dea44581835cc932cd40209ee8322aed95b45946,STAIR e. THIRD FLOOR | SECOND FLOOR ' STAIR *8...,all-MiniLM-L6-v2,"[-0.06646472, -0.021325612, 0.005512745, -0.00...",,,2025-08-13 04:07:36.111721+00:00,0.448826,N:\PPDO\Records\37xx Oakes College\3701\3701...
189,551225,367836,2025-08-13 11:45:06.872326,2025-08-13 02:36:00.557249,37xx Oakes College/3701/3701/F - Bid Documen...,File 3701 G 01.TIF,b077e1c4e83f332a8711ba033ad2a060c960dc3d,UNIVERSITY OF CALIFORNIA COLLEGE 7 PROJECT NO....,all-MiniLM-L6-v2,"[-0.040841658, 0.012339933, 0.02262754, 0.0080...",,,2025-08-13 20:11:44.768736+00:00,0.486759,N:\PPDO\Records\37xx Oakes College\3701\3701...
57,551334,367888,2025-08-13 11:45:20.555142,2025-08-13 02:36:59.896922,37xx Oakes College/3701/3701/G - Constructio...,3701.G15.ChangeOrder29-R.pdf,ec3475efdd7cf33592a1194445cf414c8b790c2d,SANTA CRUZ: OFFICE... Z NERD ARE Campus Facili...,all-MiniLM-L6-v2,"[-0.036431514, 0.057476886, 0.048418455, -0.00...",,,2025-08-14 01:13:12.941121+00:00,0.5427,N:\PPDO\Records\37xx Oakes College\3701\3701...
128,551131,98747,2025-08-13 11:45:16.752591,2025-08-13 02:35:19.696366,37xx Oakes College/3701/3701/F - Bid Documen...,File 3701 A 04.TIF,0725473c34468f39793d99687a412ec3ffc6ea10,"""ta dadkl Cul ad WILNaCigae HAN ead wad 184 NI...",all-MiniLM-L6-v2,"[-0.066026025, 0.009855664, 0.0044676038, 0.03...",,,2025-08-13 04:05:20.308410+00:00,0.564538,N:\PPDO\Records\37xx Oakes College\3701\3701...
150,551154,98770,2025-08-13 11:45:17.082676,2025-08-13 02:35:29.275225,37xx Oakes College/3701/3701/F - Bid Documen...,File 3701 A 32.TIF,03f151c601de85fb76997ffd9240dda674aa5dea,"""Sines 187 Beara ""Sinvlsisey"" iNsTRuepieNAL SE...",all-MiniLM-L6-v2,"[-0.09259008, 0.041282676, -0.027558442, 0.010...",,,2025-08-13 04:08:43.392656+00:00,0.577102,N:\PPDO\Records\37xx Oakes College\3701\3701...
