In [None]:
import os
from db_models import FilingTag, FileLocation, File
from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def get_db_engine():
    """Create and return a SQLAlchemy engine for the project database.

    Connection settings are read from the environment variables
    ``PROJECT_DB_USERNAME``, ``PROJECT_DB_PASSWORD``, ``PROJECT_DB_HOST``,
    ``PROJECT_DB_PORT`` and ``PROJECT_DB_NAME``.

    Returns:
        The object returned by ``create_engine`` for a ``postgresql+psycopg`` URL.

    Raises:
        RuntimeError: If any of the required environment variables is not set.
    """
    from urllib.parse import quote_plus

    required = (
        "PROJECT_DB_USERNAME",
        "PROJECT_DB_PASSWORD",
        "PROJECT_DB_HOST",
        "PROJECT_DB_PORT",
        "PROJECT_DB_NAME",
    )
    settings = {name: os.getenv(name) for name in required}

    # An unset variable would otherwise be formatted as the literal text "None"
    # and only fail later with a confusing URL/connection error.
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(
            "Missing database environment variable(s): " + ", ".join(missing)
        )

    # Credentials may contain URL-reserved characters such as '@', ':' or '/';
    # percent-encode them so they cannot be mistaken for URL delimiters.
    username = quote_plus(settings["PROJECT_DB_USERNAME"])
    password = quote_plus(settings["PROJECT_DB_PASSWORD"])

    conn_string = (
        f"postgresql+psycopg://{username}:{password}"
        f"@{settings['PROJECT_DB_HOST']}:{settings['PROJECT_DB_PORT']}/{settings['PROJECT_DB_NAME']}"
    )
    return create_engine(conn_string)

# Configure your database session
# `engine`, `Session` (the session factory) and `session` are module-level
# objects; `session` is the one the queries further down in this cell run on.
engine = get_db_engine()
Session = sessionmaker(bind=engine)
session = Session()

def file_tag_file_locations(filing_tag):
    """
    Find all FileLocation entries whose file_server_directories contains
    the filing tag label followed by " - ".

    The label is matched literally: LIKE metacharacters (``%``, ``_``) and the
    escape character itself are escaped before the pattern is built, so a label
    such as ``A_1`` does not also match ``AB1``.

    Args:
        filing_tag (FilingTag): The filing tag object to search for; only its
            ``label`` attribute is read.

    Returns:
        list: FileLocation entries matching the criteria
    """
    search_pattern = f"{filing_tag.label} - "

    # Escape the escape character first so the backslashes added for % and _
    # are not themselves doubled.
    escaped_pattern = (
        search_pattern
        .replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )

    # Uses the module-level `session`.
    return (
        session.query(FileLocation)
        .filter(
            FileLocation.file_server_directories.like(
                f"%{escaped_pattern}%", escape="\\"
            )
        )
        .all()
    )

# Get all filing tags
all_filing_tags = session.query(FilingTag).all()

# Collect one summary record per filing tag
results = []
for tag in all_filing_tags:
    # Every matching FileLocation row is loaded; only the number of rows is kept
    locations = file_tag_file_locations(tag)
    count = len(locations)

    results.append({
        'tag': tag.label,
        'description': tag.description,
        'file_locations_count': count
    })

# Convert to DataFrame for better visualization.
# The columns are given explicitly so that an empty result list still yields a
# frame with the expected columns (otherwise sort_values below raises KeyError).
df_results = pd.DataFrame(
    results,
    columns=['tag', 'description', 'file_locations_count']
)

# Sort by count in descending order
df_results = df_results.sort_values('file_locations_count', ascending=False)

# Display summary statistics
print(f"Found {len(df_results)} filing tags with {df_results['file_locations_count'].sum()} total file locations")

# Display top tags by location count
display(df_results.head(20))

# Visualize the top 15 tags by file location count
top_tags = df_results.head(15)
if top_tags.empty:
    # Nothing to draw; avoid handing an empty frame to the plotting code
    print("No filing tags found; skipping plot.")
else:
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='file_locations_count', y='tag', data=top_tags, ax=ax)
    ax.set_title('Top 15 Filing Tags by File Location Count')
    ax.set_xlabel('Number of File Locations')
    ax.set_ylabel('Filing Tag')
    fig.tight_layout()
    plt.show()

In [None]:
# Export the per-tag summary frame to a CSV file (relative path, so it lands in
# the current working directory); the DataFrame index is not written.
summary_csv_path = 'filing_tags_file_locations_summary.csv'
df_results.to_csv(summary_csv_path, index=False)

In [None]:
import os
import psycopg
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Retrieve database credentials from environment variables
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")

# SQL statements to create the prototype-tracking tables.
# Each statement uses IF NOT EXISTS so that re-running this cell (for example
# after "Restart & Run All") is a no-op instead of failing on the first
# already-existing table/column.
sql_commands = [
    """
    CREATE TABLE IF NOT EXISTS prototype_runs (
      run_id SERIAL PRIMARY KEY,
      model_name TEXT NOT NULL,
      model_version TEXT NOT NULL,
      algorithm TEXT NOT NULL,
      hyperparams JSONB,
      tag_filter TEXT,
      created_at TIMESTAMPTZ DEFAULT now()
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS prototype_members (
      run_id INTEGER REFERENCES prototype_runs(run_id) ON DELETE CASCADE,
      tag TEXT REFERENCES filing_tags(label),
      prototype_id SMALLINT DEFAULT 0,
      file_id INTEGER REFERENCES files(id),
      PRIMARY KEY (run_id, tag, prototype_id, file_id)
    );
    """,
    """
    ALTER TABLE tag_prototypes ADD COLUMN IF NOT EXISTS run_id INTEGER
      REFERENCES prototype_runs(run_id) ON DELETE SET NULL;
    """,
    """
    CREATE TABLE IF NOT EXISTS prototype_run_metrics (
      run_id INTEGER REFERENCES prototype_runs(run_id) ON DELETE CASCADE,
      metric_name TEXT,
      value NUMERIC,
      split TEXT,
      PRIMARY KEY (run_id, metric_name, split)
    );
    """
]

# Establish a connection to the database and run every statement in one
# transaction. The psycopg 3 connection context manager commits when the block
# finishes normally, rolls back if a statement raises, and closes the
# connection either way, so a failed statement no longer leaves an open
# connection (and open transaction) behind.
with psycopg.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST
) as conn:
    # The cursor is closed when its block exits
    with conn.cursor() as cur:
        for command in sql_commands:
            cur.execute(command)

In [None]:
import os
from pathlib import Path
from itertools import islice


def visualize_directory_tree(
    dir_path: Path,
    level: int = -1,
    limit_to_directories: bool = False,
    length_limit: int = 1000000,
    exclusion_list: list = None
):
    """Given a directory Path object print a visual tree structure, with optional exclusions.

    Entries are listed in name order (case-insensitive) so the output is the
    same from run to run. Directories that cannot be listed because of a
    PermissionError are shown by name but treated as empty.

    Args:
        dir_path: Root directory to print; anything accepted by ``Path()``.
        level: Maximum depth to descend. ``-1`` (the default) means unlimited.
        limit_to_directories: When True, only directories are listed.
        length_limit: Maximum number of tree lines to print.
        exclusion_list: Entry names to skip. Names are compared with each
            entry's bare name (``Path.name``), not with a path.

    Returns:
        None. The tree and a closing directory/file count are printed.
    """
    space = '    '
    branch = '│   '
    # pointers:
    tee = '├── '
    last = '└── '

    dir_path = Path(dir_path)  # accept string coerceable to Path
    files = 0
    directories = 0
    exclusion_set = set(exclusion_list) if exclusion_list else set()

    def list_entries(directory: Path):
        """Return the non-excluded children of ``directory`` in name order."""
        try:
            entries = [entry for entry in directory.iterdir() if entry.name not in exclusion_set]
        except PermissionError:
            # Best effort: an unreadable directory should not abort the whole tree.
            return []
        if limit_to_directories:
            entries = [entry for entry in entries if entry.is_dir()]
        # iterdir() yields entries in arbitrary order; sort for stable output.
        return sorted(entries, key=lambda entry: (entry.name.lower(), entry.name))

    def inner(dir_path: Path, prefix: str = '', level=-1):
        nonlocal files, directories
        if not level:
            return  # 0, stop iterating
        contents = list_entries(dir_path)
        # Every entry gets a tee except the final one, which gets the corner.
        pointers = [tee] * (len(contents) - 1) + [last] if contents else []
        for pointer, path in zip(pointers, contents):
            if path.is_dir():
                yield prefix + pointer + path.name
                # Counted after the yield: an entry is only tallied once the
                # consumer asks for the line that follows it.
                directories += 1
                extension = branch if pointer == tee else space
                yield from inner(path, prefix=prefix + extension, level=level - 1)
            elif not limit_to_directories:
                yield prefix + pointer + path.name
                files += 1

    print(dir_path.name)
    iterator = inner(dir_path, level=level)
    for line in islice(iterator, length_limit):
        print(line)
    # Peek one more line to find out whether the output was truncated.
    if next(iterator, None):
        print(f'... length_limit, {length_limit}, reached, counted:')
    print(f'\n{directories} directories' + (f', {files} files' if files else ''))
# Exclude irrelevant directories.
# visualize_directory_tree compares these against each entry's bare name, so
# '__pycache__' already hides nested caches such as dev/__pycache__. The former
# 'dev\__pycache__' entry could never match a name and contained an invalid
# '\_' escape sequence, so it has been dropped.
exclude_dirs = [
    '.venv',
    '__pycache__',
    '.git'
]

visualize_directory_tree(
    dir_path=Path.cwd(),
    exclusion_list=exclude_dirs
)

In [None]:
import os
from text_extraction.pdf_extraction import PDFTextExtractor

def test_pdf_text_extraction(directory_path):
    """
    Test PDF text extraction on all files in the given directory.
    
    Args:
        directory_path (str): Path to the directory containing PDF files.
    """
    extractor = PDFTextExtractor()
    for root, _, files in os.walk(directory_path):
        for file in files:
            if file.lower().endswith('.pdf'):
                file_path = os.path.join(root, file)
                try:
                    print(f"Extracting text from: {file_path}")
                    text = extractor(file_path)
                    print(f"Extracted text (first 500 characters):\n{text[:500]}\n")
                except Exception as e:
                    print(f"Failed to extract text from {file_path}: {e}")

# Example usage
# NOTE(review): the path below looks like a placeholder — point it at a real
# directory containing PDFs before running this cell.
test_directory = "path/to/your/pdf/directory"
test_pdf_text_extraction(test_directory)

In [None]:
# this cell is for populating /test_files with test files from the server
import os
import shutil
from db_models import FileLocation, File, get_db_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import func, or_

# Mount point handed to FileLocation.local_filepath() when resolving files
SERVER_MOUNT_LOCATION = r"N:\PPDO\Records"

# File extensions (without the leading dot), grouped one inner list per sampling
# pass; each inner list is queried separately when test files are collected.
ext_lists_list = [
    ["pdf"],
    ["html", "htm", "mhtml", "mht"],
    ["eml", "msg"],
    ["png", "jpg", "jpeg", "tif", "tiff", "bmp", "gif"],
    ["docx", "docm", "doc", "rtf"],
    ["xlsx", "xlsm", "xls", "xlsb", "ods", "csv", "tsv"],
    ["txt", "md", "log", "csv", "json", "xml", "yaml", "yml", "ini", "cfg", "conf"],
]

def get_test_file_locations(session, n, size_limit, ext_list):
    """
    Get random test files from the database based on specified criteria.

    Args:
        session: SQLAlchemy session used to run the query.
        n (int): Maximum number of FileLocation rows to return.
        size_limit (int): Only files whose ``File.size`` is strictly below this
            value are considered.
        ext_list (list[str] | None): Extensions (without the dot) to restrict
            the sample to, matched case-insensitively against the end of
            ``FileLocation.filename``. If empty or None, no extension filter
            is applied.

    Returns:
        list: Up to ``n`` FileLocation rows in database-side random order.
    """
    query = (
        session
        .query(FileLocation)
        .join(File, FileLocation.file)
        .filter(
            File.size < size_limit,
            FileLocation.file_server_directories.isnot(None)
        )
    )

    if ext_list:
        ext_filters = [
            FileLocation.filename.ilike(f"%.{ext}")
            for ext in ext_list
        ]
        query = query.filter(or_(*ext_filters))

    # Randomize regardless of whether an extension filter was given; previously
    # the random ordering was only applied inside the `if ext_list` branch, so
    # an unfiltered call did not return a random sample.
    return query.order_by(func.random()).limit(n).all()

def save_test_files_to_directory(session, n, size_limit, ext_lists_list, destintation = os.path.join(os.getcwd(), "test_files")):
    """
    Save test files to a specified directory based on given criteria.

    For every extension group, up to ``n`` file locations are sampled with
    ``get_test_file_locations`` and each file that exists under
    ``SERVER_MOUNT_LOCATION`` is copied into the destination directory.
    Files are stored under their bare filename, so two sampled files with the
    same name overwrite each other.

    Args:
        session: SQLAlchemy session object.
        n (int): Number of files to retrieve per extension group.
        size_limit (int): Maximum file size in bytes.
        ext_lists_list: List of lists containing file extensions.
        destintation (str): Directory to save the test files in; created if it
            does not exist. The default is evaluated once, when the function is
            defined, from the working directory at that moment.
    """
    # copy2 does not create missing parent directories
    os.makedirs(destintation, exist_ok=True)

    for ext_list in ext_lists_list:
        for loc in get_test_file_locations(session, n, size_limit, ext_list):
            file_path = loc.local_filepath(SERVER_MOUNT_LOCATION)

            if not file_path:
                print(f"No local path could be resolved for {loc.filename}, skipping.")
                continue

            if not file_path.exists():
                print(f"File {file_path} does not exist, skipping.")
                continue

            dest_path = os.path.join(destintation, loc.filename)
            shutil.copy2(file_path, dest_path)
            print(f"Copied {loc.filename} to {dest_path}")

    # Reported once after all groups are processed (not once per group)
    print(f"Test files saved to {destintation}")
            


In [None]:
# 50MB size limit


In [1]:
import os
import shutil
import traceback 
import tempfile
from text_extraction.pdf_extraction import PDFTextExtractor
from text_extraction.basic_extraction import TextFileTextExtractor, get_extractor_for_file
from text_extraction.image_extraction import ImageTextExtractor
from text_extraction.office_doc_extraction import PresentationTextExtractor, SpreadsheetTextExtractor, WordFileTextExtractor
from text_extraction.web_extraction import HtmlTextExtractor, EmailTextExtractor
from text_extraction.extraction_utils import common_char_replacements, strip_diacritics, normalize_unicode

import pytesseract
# Tell pytesseract where the Tesseract executable lives on this machine
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# One extractor instance per extractor class, created in the order in which
# they are offered to get_extractor_for_file
extractors_list = [
    PDFTextExtractor(),
    TextFileTextExtractor(),
    ImageTextExtractor(),
    PresentationTextExtractor(),
    SpreadsheetTextExtractor(),
    WordFileTextExtractor(),
    HtmlTextExtractor(),
    EmailTextExtractor(),
]

# Individual names for the same instances
(
    pdf_extractor,
    txt_extractor,
    image_extractor,
    presentation_extractor,
    spreadsheet_extractor,
    word_extractor,
    html_extractor,
    email_extractor,
) = extractors_list

# Longest text preview printed per file, in characters
display_text_len = 800
# Directory whose files are run through the extractors
test_extraction_path = os.path.join(os.getcwd(), "test_files")

def text_assessment(extracted_text, source_name, location_note=""):
    """Print the extracted text, truncated to display_text_len characters.

    Args:
        extracted_text: Text returned by an extractor (may be empty or None).
        source_name: File name used in the printed messages.
        location_note: Optional suffix for the "no text" message, e.g.
            " in temporary directory" when a temporary copy was read.
    """
    if not extracted_text:
        print(f"No text extracted from {source_name}{location_note}")
    elif len(extracted_text) > display_text_len:
        print(f"Extracted text (first {display_text_len} characters) from {source_name}:\n{extracted_text[:display_text_len]}\n")
    else:
        print(f"Extracted text from {source_name}:\n{extracted_text}\n")


for file in os.listdir(test_extraction_path):
    file_path = os.path.join(test_extraction_path, file)

    if not os.path.isfile(file_path):
        print(f"Skipping {file}, not a file.")
        # Actually skip: without this, directories were still handed to the extractors
        continue

    extractor = get_extractor_for_file(file_path=file_path, extractors=extractors_list)
    if not extractor:
        print(f"No suitable extractor found for {file}")
        continue

    try:
        print(f"Extracting text from {file} using {extractor.__class__.__name__}")
        text = extractor(file_path)
        # NOTE(review): the character clean-up helpers are only applied on the
        # retry path below, not here — confirm whether that is intentional.
        text_assessment(text, file)

    except PermissionError:
        # Retry from a private temporary copy; a dedicated directory avoids
        # clobbering an unrelated file of the same name in the shared temp dir.
        temp_dir = tempfile.mkdtemp()
        try:
            temp_file_path = os.path.join(temp_dir, file)
            shutil.copy2(file_path, temp_file_path)
            print(f"Copied {file} to temporary directory: {temp_file_path}")

            text = extractor(temp_file_path)
            text = common_char_replacements(text)
            text = strip_diacritics(text)
            text = normalize_unicode(text)
            text_assessment(text, file, " in temporary directory")

        except Exception as e:
            # Report and carry on with the next file, matching the handling of
            # other failures; raising here used to abort the whole loop.
            print(f"Failed to extract text from {file} in temporary directory: {e}")
            traceback.print_exc()

        finally:
            # Remove the temporary copy; ignore errors such as a handle that is still open
            shutil.rmtree(temp_dir, ignore_errors=True)

    except Exception as e:
        print(f"Failed to extract text from {file}: {e}")
        traceback.print_exc()


Extracting text from 00 Bid Summary.xls using SpreadsheetTextExtractor
Extracted text from 00 Bid Summary.xls:
=== Sheet: Bid Summary === Unnamed: 2 Unnamed: 3 PROJECT:Campuswide Street Striping '2000" PROJECT No. 02700 TIME: 3:00 P.M. PROJECT MANAGER: Pete Butterworth DATE: May 18, 2000 CONTRACTOR BASE BID

Extracting text from 10641_-_081416-01_R0_-_Interior_Flush_Wood_Doors.pdf using PDFTextExtractor
Extracted text (first 800 characters) from 10641_-_081416-01_R0_-_Interior_Flush_Wood_Doors.pdf:
U n i v e r s i t y o f C a l i f o r n i a , S a n t a C r u z Physical Planning & Development Operations 1156 High Street - Barn G Santa Cruz, CA 95064 constdoc@ucsc.edu Project 10641, Westside Research Park (2300 Delaware) - Building A & B Research Office Renovation Submittal No: 081416-01 R0 Review Ends on 5/15/2025 Review Action Taken: □ No Review Necessary □ No Exceptions Taken □ Implement Exceptions Noted □ Revise and Resubmit □ Rejected The review is for general conformance with desi

Output()

Output()

Output()

Output()

Output()

Output()

The output file size is 11.49× larger than the input file.
Possible reasons for this include:
--deskew was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.



Extracted text (first 800 characters) from 2318-069B.H.SKMBT C224e18061307240.pdf:
Felton Aggregate Plant 1800 Felton Quarry Road Felton, CA 95018 TESTING LABORATORIES 831-335-3445 06/13/2018 The following is submitted for your review and acceptance: 1827 - 3/8" Crushed-3/8" AGG Procedure Sieve/Test Average Unit Granite 2017 CT 202 1/2" 100 % 98-100 CT 202 3/8" 88 % 80-98 CT 202 #4 13 % 1-25 CT 202 #8 4 * 0-10 CT 202 #16 3 % CT 202 #30 2 % CT 202 #50 2 % CT 202 #100 4 % CT 202 #200 09 % oo "Gradloss SS 0.000 8 »§% ~ ASTM C-131 LA Abrasion (C,100) 13.8 % ASTM C-131 LA Abrasion (C,500) 42.8 % CT 207 Absorption 0.68 % CT 207 SPGR (Dry,Gsb) 2.610 CT 207 SPGR (SSD) 2.628 CT 208 SPGR (Apparent,Gsa) 2.657 CT 208 SPGR Coarse (Dry, Gsb) 2.610 CT 208 SPGR Coarse (SSD) 2.628 CT 208 SPGR Coarse (Apparent, Gsa) 2.657 CT 212 Unit Wt (Rodded) 92.5 Ib/AA3 CT 226 Total Moisture 0.74 % If

Extracting text from 2638 Daily 4.24.17.pdf using PDFTextExtractor
Extracted text (first 800 characters) from 2638 

Output()

Output()

Output()

Output()

Output()

Output()

Extracted text (first 800 characters) from Item 23 - CHCT06151 (Wiring).pdf:
TO SITE [| POWER [2 Ei _RED RED DIAGRAM DESIGNATIONS _BLK _BLK @ GATHER ee aeo vec? e460 vec? Yo SPLICE POINT STRIPED WIRING MR Tl MR T2 ee ——— CONTRACTORS WIRING AAMAS vas * OPTIONAL ITEMS ———— _ [3] 14] [5] OPTIONAL ITEMS ORN 24 VAC BFA a, ORN® a4 VAC BFA a BS] PBo [7B - - COMPONENT LEGEND 14 GA AC AUXILIARY CONTACT = BC BLOWER CONTACTOR BR BLOWER RELAY a CC COMPRESSOR CONTACTOR CSTB CCH | CRANK CASE HEATER CFC | FAN CONTACTOR CSTB | COMMON SIDE TERMINAL BLOCK DFTC | DEFROST CONTROLLER BLK gg BLK DFTS | DEFROST SENSOR ACI BLKEIBLK CCH1 DLPS| DIGITAL LOW PRESSURE SWITCH LPS| BLEEK FCS | FAN CYCLE SWITCH x O20 > LI rT TL BLK FU FUSE RI @ lan a 24,12 BLU GND GROUND S + a = 131,73 RED GOT | GAURANTEED OFF TIMER ae YEL ve. ~LEPBT1 Pv HES! ven ave Yel]. [BLK] HPS HIGH PRESSURE SWITCH | Tt CCl 

Extracting text from main.html using HtmlTextExtractor
Extracted text (first 800 characters) from main.html:
The Main Ent