In [None]:
import os
from db_models import FilingTag, FileLocation, File
from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def get_db_engine():
    """Create and return a SQLAlchemy engine for the project database.

    Connection settings are read from the environment variables
    PROJECT_DB_USERNAME, PROJECT_DB_PASSWORD, PROJECT_DB_HOST,
    PROJECT_DB_PORT and PROJECT_DB_NAME.

    Returns:
        Whatever ``create_engine`` returns for the assembled
        ``postgresql+psycopg`` URL.

    Raises:
        RuntimeError: If any of the variables above is not set. Without this
            check the literal text "None" would be baked into the URL.
    """
    # Local import: keeps the cell self-contained without touching the import block.
    from urllib.parse import quote

    required = (
        'PROJECT_DB_USERNAME',
        'PROJECT_DB_PASSWORD',
        'PROJECT_DB_HOST',
        'PROJECT_DB_PORT',
        'PROJECT_DB_NAME',
    )
    settings = {name: os.getenv(name) for name in required}
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(
            f"Missing database environment variable(s): {', '.join(missing)}"
        )

    # Percent-encode the credentials so characters such as '@', '/' or ':'
    # cannot be mistaken for URL delimiters when the string is parsed.
    username = quote(settings['PROJECT_DB_USERNAME'], safe='')
    password = quote(settings['PROJECT_DB_PASSWORD'], safe='')
    conn_string = (
        f"postgresql+psycopg://{username}:{password}"
        f"@{settings['PROJECT_DB_HOST']}:{settings['PROJECT_DB_PORT']}/{settings['PROJECT_DB_NAME']}"
    )
    return create_engine(conn_string)

# Notebook-wide database handles: one engine, a session factory bound to it,
# and the single session object that the cells below query through.
engine = get_db_engine()
Session = sessionmaker(engine)
session = Session()

def file_tag_file_locations(filing_tag):
    """
    Find all FileLocation entries whose file_server_directories contains the
    filing tag label immediately followed by " - ".

    Args:
        filing_tag (FilingTag): The filing tag object to search for; only its
            ``label`` attribute is read.

    Returns:
        list: FileLocation entries matching the criteria
    """
    search_pattern = f"{filing_tag.label} - "

    # contains(..., autoescape=True) builds the same substring LIKE match as a
    # hand-written "%...%" pattern, but escapes any '%' or '_' inside the label
    # so they are matched literally instead of acting as SQL wildcards.
    return (
        session.query(FileLocation)
        .filter(
            FileLocation.file_server_directories.contains(
                search_pattern, autoescape=True
            )
        )
        .all()
    )

# Get all filing tags
all_filing_tags = session.query(FilingTag).all()

# Build one summary record per filing tag.
# NOTE: file_tag_file_locations() returns the matching rows themselves and only
# their number is used here, so this issues one query per tag.
results = []
for tag in all_filing_tags:
    locations = file_tag_file_locations(tag)
    count = len(locations)
    results.append({
        'tag': tag.label,
        'description': tag.description,
        'file_locations_count': count
    })

# Passing the column names explicitly keeps the frame well-formed when there
# are no filing tags at all; otherwise the sort below raises KeyError because
# an empty list of records produces a frame with no columns.
summary_columns = ['tag', 'description', 'file_locations_count']
df_results = pd.DataFrame(results, columns=summary_columns)

# Sort by count in descending order
df_results = df_results.sort_values('file_locations_count', ascending=False)

# Display summary statistics
print(f"Found {len(df_results)} filing tags with {df_results['file_locations_count'].sum()} total file locations")

# Display top tags by location count
display(df_results.head(20))

# Visualize the top 15 tags by file location count
top_tags = df_results.head(15)
if top_tags.empty:
    print("No filing tags to plot.")
else:
    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='file_locations_count', y='tag', data=top_tags, ax=ax)
    ax.set_title('Top 15 Filing Tags by File Location Count')
    ax.set_xlabel('Number of File Locations')
    ax.set_ylabel('Filing Tag')
    fig.tight_layout()
    plt.show()

In [None]:
# Write the per-tag summary to a CSV file in the working directory,
# leaving out the DataFrame index column.
summary_csv_path = 'filing_tags_file_locations_summary.csv'
df_results.to_csv(summary_csv_path, index=False)

In [None]:
import os
import psycopg
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Retrieve database credentials from environment variables
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")

# DDL for the prototype-tracking tables.
# Every statement uses IF NOT EXISTS so the cell can be re-run (for example
# after "Restart & Run All") without failing on objects that already exist.
sql_commands = [
    """
    CREATE TABLE IF NOT EXISTS prototype_runs (
      run_id SERIAL PRIMARY KEY,
      model_name TEXT NOT NULL,
      model_version TEXT NOT NULL,
      algorithm TEXT NOT NULL,
      hyperparams JSONB,
      tag_filter TEXT,
      created_at TIMESTAMPTZ DEFAULT now()
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS prototype_members (
      run_id INTEGER REFERENCES prototype_runs(run_id) ON DELETE CASCADE,
      tag TEXT REFERENCES filing_tags(label),
      prototype_id SMALLINT DEFAULT 0,
      file_id INTEGER REFERENCES files(id),
      PRIMARY KEY (run_id, tag, prototype_id, file_id)
    );
    """,
    """
    ALTER TABLE tag_prototypes ADD COLUMN IF NOT EXISTS run_id INTEGER
      REFERENCES prototype_runs(run_id) ON DELETE SET NULL;
    """,
    """
    CREATE TABLE IF NOT EXISTS prototype_run_metrics (
      run_id INTEGER REFERENCES prototype_runs(run_id) ON DELETE CASCADE,
      metric_name TEXT,
      value NUMERIC,
      split TEXT,
      PRIMARY KEY (run_id, metric_name, split)
    );
    """
]

# The psycopg 3 connection context manager commits when the block succeeds,
# rolls back if any statement raises, and closes the connection either way,
# so a failing statement no longer leaks the connection or leaves a
# half-applied schema change pending. The cursor context closes the cursor.
with psycopg.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST
) as conn:
    with conn.cursor() as cur:
        for command in sql_commands:
            cur.execute(command)

In [None]:
import os
from pathlib import Path
from itertools import islice


def visualize_directory_tree(
    dir_path: Path,
    level: int = -1,
    limit_to_directories: bool = False,
    length_limit: int = 1000000,
    exclusion_list: list = None
):
    """Print a visual tree of ``dir_path``, with optional exclusions.

    Args:
        dir_path: Root directory; anything ``Path()`` accepts.
        level: Maximum depth to descend. A negative value never reaches zero
            and therefore does not limit the depth.
        limit_to_directories: When True, only directories are listed.
        length_limit: Maximum number of tree lines printed before truncating.
        exclusion_list: Entry names (single path components, compared against
            ``Path.name``) to leave out at every depth.

    Entries are listed in case-insensitive name order so repeated runs give the
    same output. A directory that cannot be read is shown with a marker line
    instead of aborting the whole listing.
    """
    space = '    '
    branch = '│   '
    # pointers:
    tee = '├── '
    last = '└── '

    dir_path = Path(dir_path)  # accept string coerceable to Path
    files = 0
    directories = 0
    exclusion_set = set(exclusion_list) if exclusion_list else set()

    def inner(current: Path, prefix: str = '', level=-1):
        nonlocal files, directories
        if not level:
            return  # 0, stop iterating
        try:
            # iterdir() yields in arbitrary order; sort for reproducible output.
            entries = sorted(current.iterdir(), key=lambda p: (p.name.lower(), p.name))
        except PermissionError:
            yield prefix + last + '[permission denied]'
            return
        contents = [
            entry for entry in entries
            if entry.name not in exclusion_set
            and (not limit_to_directories or entry.is_dir())
        ]
        pointers = [tee] * (len(contents) - 1) + [last] if contents else []
        for pointer, path in zip(pointers, contents):
            if path.is_dir():
                yield prefix + pointer + path.name
                directories += 1
                # Continue the vertical rule only while siblings remain below.
                extension = branch if pointer == tee else space
                yield from inner(path, prefix=prefix + extension, level=level - 1)
            elif not limit_to_directories:
                yield prefix + pointer + path.name
                files += 1

    print(dir_path.name)
    iterator = inner(dir_path, level=level)
    for line in islice(iterator, length_limit):
        print(line)
    # Any remaining line means the output was cut off by length_limit.
    if next(iterator, None) is not None:
        print(f'... length_limit, {length_limit}, reached, counted:')
    print(f'\n{directories} directories' + (f', {files} files' if files else ''))
# exclude irrelevant directories
# NOTE: visualize_directory_tree compares these entries against each item's
# bare name (Path.name), so only single path components can match. The
# 'dev\__pycache__' entry therefore never matches anything; it is kept with an
# unchanged value, but written as a raw string because '\_' is an invalid
# escape sequence that newer Python versions warn about.
exclude_dirs = [
    '.venv',
    '__pycache__',
    r'dev\__pycache__',
    '.git'
]

visualize_directory_tree(
    dir_path=os.getcwd(),
    exclusion_list=exclude_dirs
)

In [None]:
import os
from text_extraction.pdf_extraction import PDFTextExtractor

def test_pdf_text_extraction(directory_path):
    """
    Test PDF text extraction on all files in the given directory.
    
    Args:
        directory_path (str): Path to the directory containing PDF files.
    """
    extractor = PDFTextExtractor()
    for root, _, files in os.walk(directory_path):
        for file in files:
            if file.lower().endswith('.pdf'):
                file_path = os.path.join(root, file)
                try:
                    print(f"Extracting text from: {file_path}")
                    text = extractor(file_path)
                    print(f"Extracted text (first 500 characters):\n{text[:500]}\n")
                except Exception as e:
                    print(f"Failed to extract text from {file_path}: {e}")

# Example usage: the path below is a placeholder and must be replaced with a
# real directory before this cell produces any output.
test_directory = "path/to/your/pdf/directory"
test_pdf_text_extraction(directory_path=test_directory)

In [3]:
# this cell is for populating /test_files with test files from the server
import os
import shutil
from db_models import FileLocation, File, get_db_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import func, or_

# Local mount point of the records file server (Windows drive path).
SERVER_MOUNT_LOCATION = r"N:\PPDO\Records"

# Extension groups, one inner list per kind of file; files are sampled
# separately for each group.
# NOTE(review): "csv" is listed in both the spreadsheet group and the
# plain-text group — confirm that the overlap is intended.
ext_lists_list = [
    # PDF
    ["pdf"],
    # web pages
    ["html", "htm", "mhtml", "mht"],
    # e-mail
    ["eml", "msg"],
    # raster images
    ["png", "jpg", "jpeg", "tif", "tiff", "bmp", "gif"],
    # word-processing documents
    ["docx", "docm", "doc", "rtf"],
    # spreadsheets and tabular data
    ["xlsx", "xlsm", "xls", "xlsb", "ods", "csv", "tsv"],
    # plain text and configuration files
    ["txt", "md", "log", "csv", "json", "xml", "yaml", "yml", "ini", "cfg", "conf"],
]

def get_test_file_locations(session, n, size_limit, ext_list):
    """
    Get random test files from the database based on specified criteria.

    Args:
        session: SQLAlchemy session used to run the query.
        n (int): Maximum number of FileLocation rows to return.
        size_limit (int): Only files whose ``File.size`` is below this value
            are considered.
        ext_list: Filename extensions (without the dot) to accept, matched
            case-insensitively against the end of ``FileLocation.filename``.
            An empty or None value disables the extension filter.

    Returns:
        list: Up to ``n`` FileLocation rows in random order.
    """
    query = (
        session
        .query(FileLocation)
        .join(File, FileLocation.file)
        .filter(
            File.size < size_limit,
            FileLocation.file_server_directories.isnot(None)
        )
    )

    if ext_list:
        ext_filters = [
            FileLocation.filename.ilike(f"%.{ext}")
            for ext in ext_list
        ]
        query = query.filter(or_(*ext_filters))

    # Randomize unconditionally. Previously the random ordering was applied
    # only together with the extension filter, so a call without extensions
    # returned rows in whatever order the database produced them.
    return query.order_by(func.random()).limit(n).all()

def save_test_files_to_directory(session, n, size_limit, ext_lists_list, destintation = os.path.join(os.getcwd(), "test_files")):
    """
    Copy randomly selected files from the server mount into a local directory.

    For every extension group, up to ``n`` file locations are fetched with
    get_test_file_locations() and each file that exists under
    SERVER_MOUNT_LOCATION is copied into ``destintation`` under its original
    filename. Files with the same name overwrite one another.

    Args:
        session: SQLAlchemy session object.
        n (int): Number of files to retrieve per extension group.
        size_limit (int): Maximum file size in bytes.
        ext_lists_list: List of lists containing file extensions.
        destintation (str): Directory to copy the files into; created if it
            does not exist. The default is computed once, when the function is
            defined, from the working directory at that moment.
    """
    # shutil.copy2 does not create missing parent directories, so make sure
    # the target exists before the first copy.
    os.makedirs(destintation, exist_ok=True)

    for ext_list in ext_lists_list:
        for loc in get_test_file_locations(session, n, size_limit, ext_list):
            file_path = loc.local_filepath(SERVER_MOUNT_LOCATION)
            if not file_path:
                # No local path could be built for this location; nothing to copy.
                continue
            if not file_path.exists():
                print(f"File {file_path} does not exist, skipping.")
                continue

            dest_path = os.path.join(destintation, loc.filename)
            shutil.copy2(file_path, dest_path)
            print(f"Copied {loc.filename} to {dest_path}")

    # Single summary once every extension group has been processed.
    print(f"Test files saved to {destintation}")


# Parameters for the copy job
n = 15                          # files requested per extension group
size_limit = 150 * 1024 ** 2    # 150 MB, expressed in bytes
destintation = os.path.join(os.getcwd(), "test_files")

# Fresh session bound to the project database engine
session = sessionmaker(bind=get_db_engine())()

save_test_files_to_directory(
    session,
    n,
    size_limit,
    ext_lists_list,
    destintation=destintation,
)


File N:\PPDO\Records\16xx   Cowell College\1652\1652\F - Bid Documents and Contract Award\Bid Documents\Phase II A Bid Documents\Addendum B issued 6.27.14\4.5 Addenda A Drawings issued by Pyatok 6.19.14.pdf does not exist, skipping.
Copied 056 File 6701 D027 REV 1.PDF to c:\Users\adankert\projects\file_code_tagger\test_files\056 File 6701 D027 REV 1.PDF
Copied Notice of Completion and Determination - Signed by Vani.pdf to c:\Users\adankert\projects\file_code_tagger\test_files\Notice of Completion and Determination - Signed by Vani.pdf
File N:\PPDO\Records\106xx  2300 Delaware Westside Research Park\10630\10630\G - Construction\G12 - Request for Information RFI\RFI_113R1 response.pdf does not exist, skipping.
File N:\PPDO\Records\_To Be Archived\WM 06.20.19\WM\WO00411027 ERC, Install New Roof for Bike Cage\E - Program and Design\Budget\19 02 20 WO00411027 Cost Report.pdf does not exist, skipping.
Copied Summary of Changes - ASTM D2513 (2011b-2012).pdf to c:\Users\adankert\projects\file_

In [4]:
import logging
import os
import shutil
import traceback 
import tempfile
from text_extraction.pdf_extraction import PDFTextExtractor
from text_extraction.basic_extraction import TextFileTextExtractor, get_extractor_for_file
from text_extraction.image_extraction import ImageTextExtractor
from text_extraction.office_doc_extraction import PresentationTextExtractor, SpreadsheetTextExtractor, WordFileTextExtractor
from text_extraction.web_extraction import HtmlTextExtractor, EmailTextExtractor
from text_extraction.extraction_utils import common_char_replacements, strip_diacritics, normalize_unicode
from logging_setups import setup_logger

setup_logger(name="NotebookLogger", notebook=True, level=logging.DEBUG)

import pytesseract
# Location of the Tesseract binary. The TESSERACT_CMD environment variable
# overrides it; the fallback is the Windows install path used originally.
pytesseract.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

# Initialize extractors for different file types
pdf_extractor = PDFTextExtractor()
txt_extractor = TextFileTextExtractor()
image_extractor = ImageTextExtractor()
presentation_extractor = PresentationTextExtractor()
spreadsheet_extractor = SpreadsheetTextExtractor()
word_extractor = WordFileTextExtractor()
html_extractor = HtmlTextExtractor()
email_extractor = EmailTextExtractor()

extractors_list = [
    pdf_extractor,
    txt_extractor,
    image_extractor,
    presentation_extractor,
    spreadsheet_extractor,
    word_extractor,
    html_extractor,
    email_extractor
]
display_text_len = 800
test_extraction_path = os.path.join(os.getcwd(), "test_files")


def text_assessment(extracted_text, file_name, in_temp_dir=False):
    """Print a short report of the text extracted from ``file_name``.

    Prints at most ``display_text_len`` characters. ``in_temp_dir`` only
    affects the wording of the "nothing extracted" message, so that message
    mentions the temporary directory only when a temporary copy was used.
    """
    if not extracted_text:
        suffix = " in temporary directory" if in_temp_dir else ""
        print(f"No text extracted from {file_name}{suffix}")
    elif len(extracted_text) > display_text_len:
        print(f"Extracted text (first {display_text_len} characters) from {file_name}:\n{extracted_text[:display_text_len]}\n")
    else:
        print(f"Extracted text from {file_name}:\n{extracted_text}\n")


def _extract_from_temp_copy(extractor, file_path, file_name):
    """Retry extraction on a copy of the file placed in a private temp directory.

    Any failure is reported and swallowed so the caller's loop moves on to the
    next file; the temporary directory is removed afterwards in every case.
    """
    temp_dir = tempfile.mkdtemp()
    temp_file_path = os.path.join(temp_dir, file_name)
    try:
        shutil.copy2(file_path, temp_file_path)
        print(f"Copied {file_name} to temporary directory: {temp_file_path}")

        text = extractor(temp_file_path)
        # NOTE(review): these normalization steps run only on this retry path,
        # not on the direct extraction path — confirm that is intended.
        text = common_char_replacements(text)
        text = strip_diacritics(text)
        text = normalize_unicode(text)
        text_assessment(text, file_name, in_temp_dir=True)
    except Exception as e:
        print(f"Failed to extract text from {file_name} in temporary directory: {e}")
        traceback.print_exc()
    finally:
        # Best-effort cleanup; a leftover temp directory must not fail the run.
        shutil.rmtree(temp_dir, ignore_errors=True)


for file in os.listdir(test_extraction_path):
    file_path = os.path.join(test_extraction_path, file)

    if not os.path.isfile(file_path):
        print(f"Skipping {file}, not a file.")
        continue

    extractor = get_extractor_for_file(file_path=file_path, extractors=extractors_list)
    if not extractor:
        print(f"No suitable extractor found for {file}")
        continue

    try:
        print(f"Extracting text from {file} using {extractor.__class__.__name__}")
        text = extractor(file_path)
        text_assessment(text, file)
    except PermissionError:
        # The file could not be read in place; retry on a temporary copy.
        _extract_from_temp_copy(extractor, file_path, file)
    except Exception as e:
        print(f"Failed to extract text from {file}: {e}")
        traceback.print_exc()


Extracting text from 02 File 4001 MH 920-88406-2x.TIF using ImageTextExtractor




Extracted text (first 800 characters) from 02 File 4001 MH 920-88406-2x.TIF:
2

UCSC COOLING & HEATING COIL VALVE SCHEDULE UCSC LAB VAY CDDLING & HEATING VALVE SCHEDULE UCSC LAB VAY COOLING & HEATING VALVE SCHEDULE UCSC LAB VAV COOLING & HEATING VALVE

SCHEDULE

IVALVE AG ILOCATION!

TYALVE,
is¥s 1

LOCATION
TAG #

SIZEICV

100-01 1V5013B1011 #13 FLGi 100

IVPS26A1027 #15/8F!1.6 3 ict-03-57 3 3.5 1AC-213-S 0/21
nPOS3E1304 IVPS2SA1027 #15/8F 11.8 15 16-03-57 3 a9 iAC~213-8 D/Z1

1We-01 19501381029 153141004 3 ¢c-03-50 Ls H/2Stil& ists AC-213-8 0/20

iMPenSE1418 IVPSS1Aa1008 1.3 HC-03-s8 3 H/28t116 10.9 1AC-2:3-8 D720

1ee-o21VE013B1011 IVPSS1A1004 3 1CC-03-S9t St H/2EI1L6 {3.8 {AG-213-8 0/20

MP9S3E1364 1VP531A1004 1S 1HC-03-59: VP53101004 3 Hy2611.6 13.5 1AG-2:3-8 0/20

!HC-02 VSOLSB1029 #24 FLGI140 12 I¥PSS1A1004 11/2511.6 3 st 10.9 HACH213-S BANG!

MPTSSEL418 + V

Extracting text from 041 File 6701 M110B.pdf using PDFTextExtractor


Output()

Output()

Output()

Output()

Output()

Output()

No text extracted from 041 File 6701 M110B.pdf in temporary directory
Extracting text from 056 File 6701 D027 REV 1.PDF using PDFTextExtractor


Output()

Output()

Output()

Output()

Output()

Output()

The output file size is 2.10× larger than the input file.
Possible reasons for this include:
--deskew was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.



Extracted text (first 800 characters) from 056 File 6701 D027 REV 1.PDF:
4'-2 3’ll 144 eS \ DN. 3’-8 4'-2 3-8 UNE - STAIR - Ae7 10TREADS @ tl = 9’-e we FOR REMAINDER S SEE SECT-A 4 VN lews (TYP,10-PLCS) p33<Ns) (TYP.10-PLCS) (REF .E6E7) 9-64 10 TREADS @ tl = 9’-2 a ZZ . NCS i J ; 2 a + % TPN 4 Oo OO Z Wa oa | a rd a27- 7 \ oy 16 é p “EB <Te. Or 1p ao a ZB 4’-e ose S ZO Pu \O tin 1 > 1330NS) T r ZA (TYP.10-PLCS) a: fA Lv 4 =| X& >» p33(NS) 1 © 3 6 2 (TYP,L0-PLCS) as © Lies SY? i] fom) in = A ZA 2-PLACES . NO PAINT Ags A wis <p On wo | =< 3-SIDES W/1! | 2 bY ih STYP, — RETURNS ~ ot = S 3-SIDES W/1" & hth “TYP. RETURNS + it = ono it) 17 a on / gi P\THFS) 1 _ o Mf ‘ yy a OX Z H os LA — 144(FS NO PAINT ° 3 | 34] stor 3'-6% ~ C27 SECTION - B CUT, BILL OF MATERIAL NO TO] Se | ASSY | cs. SECTION ee REMARKS UNT | WEIGHT | P.O.NO. ONE | Ae7 "STAIR ae7| 1 Ci2x20,7 1 | 7% 

Extracting text from 092 File 6701 MP7.07.PDF using PDFTextExtractor


Output()

Output()

Output()

Output()

Output()

Output()

The output file size is 2.70× larger than the input file.
Possible reasons for this include:
--deskew was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.



Extracted text (first 800 characters) from 092 File 6701 MP7.07.PDF:
HARDWARE INTERLOCK (TYP.) 480/30/60HZ VFD _~ SEE 9/MP7.04 M D-No ) EF-3A PI Ml] EF-3B L\_JN /\TFTN RHC 140, 423, 415 (SEE PLANS) Ye CAMPUS HOT WATER PI SS DPI __ VFD yy NN SEE 9/MP7.04 a ‘ad BASEMENT EXHAUST AIR 480/30/60HZ (6 EF-3 ay, SCALE: NONE DPT M M S FH-1 + FAN S a P| Pl M M } - FH-2 af \ oS VED Pl P| a ( BO) (BO ) SEE 9/MP7.04 x DPT AD 480/30/60HZ /-7\ SF-1 HEPA FILETED SUPPLY AIR ay, SCALE: NONE CAD 13 WATER Ir SUPPLY BMS CONTROL VALVE y | f to 1G; iF > ¢ T3 OCAL CONTROL VALVE WATER Ir <q RETURN 8 TANK MOUNTED HEAT EXCHANGER WH-1, IWH-1, PWH-1 (8 a, SCALE: NONE (9\ TYP (=) ) A | ! MP7.0 ! MP7.03 LON TO JACE Ty Zo ee | | \WP7.03) Yeon A =F . ved | 46ow 3 | —~— =H No . > / Y CTRL. PNL. | | n a ! ) ) FS FS | 13 | | @ | | (4, CWS -— tJ CH-1 - - Lf! . = | | P7067 CWR ~—— BASEMENT - ! |

Extracting text from 09_Appendix C_UCSC EEC DD Specs VOL 2.csv using TextFileTextExtractor
Extracted text (first 800 characters) 

Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_6096\85131941.py", line 60, in <module>
    text = extractor(file_path)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\image_extraction.py", line 76, in __call__
    img = self.detect_and_correct_orientation(img)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\image_extraction.py", line 151, in detect_and_correct_orientation
    osd = pytesseract.image_to_osd(pil_img)
  File "c:\Users\adankert\projects\file_code_tagger\.venv\Lib\site-packages\pytesseract\pytesseract.py", line 621, in image_to_osd
    return {
           ~
    ...<2 lines>...
        Output.STRING: lambda: run_and_get_output(*args),
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    }[output_type]()
    ~~~~~~~~~~~~~~^^
  File "c:\Users\adankert\projects\file_code_tagger\.venv\Lib\site-packages\pytesseract\pytesseract.py", line 624, in <lambda>
    Output.STRING: lambda: run_and_g

Extracted text (first 800 characters) from 21kv Cable Project description Ilse cwd.doc:
Ilse, Henry and I edited your draft and also added a list of dates at the bottom for your use. Chuck DRAFT The project will replace two 45+ year-old sets of cables that transmit electricity from the PG&E meter to the main campus substation. A new cable will be pulled into an empty "spare" conduit that runs along with the other two conduits for the existing cables. At one point the old cable will be moved to allow clear access to the "spare" conduit. During this move, the old cables will be de-energized and the campus will run on a single set of cables for a short time. A single set of cables will also serve the campus when the new cables are tied into the existing system. A period of seven to eight days, from December 14 through 21 will be needed for this work. The Campus load will be hand

Extracting text from 22-5205X-1.pdf using PDFTextExtractor
Extracted text (first 800 characters) from 22-5205X

Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_6096\85131941.py", line 60, in <module>
    text = extractor(file_path)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\image_extraction.py", line 76, in __call__
    img = self.detect_and_correct_orientation(img)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\image_extraction.py", line 151, in detect_and_correct_orientation
    osd = pytesseract.image_to_osd(pil_img)
  File "c:\Users\adankert\projects\file_code_tagger\.venv\Lib\site-packages\pytesseract\pytesseract.py", line 621, in image_to_osd
    return {
           ~
    ...<2 lines>...
        Output.STRING: lambda: run_and_get_output(*args),
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    }[output_type]()
    ~~~~~~~~~~~~~~^^
  File "c:\Users\adankert\projects\file_code_tagger\.venv\Lib\site-packages\pytesseract\pytesseract.py", line 624, in <lambda>
    Output.STRING: lambda: run_and_g

Output()

Output()

Output()

Output()

Output()

Output()

The output file size is 9.26× larger than the input file.
Possible reasons for this include:
--deskew was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.



Extracted text (first 800 characters) from File 1323 #FO_45_Water Meter for EHS_5-23-14.pdf:
University of California, Santa Cruz Campus: Santa Cruz Physical Planning and Construction Field Order No. #45 Date: 5-23-14 FIELD ORDER This form to be used only for emergency instructions to a contractor where time required for preparation and execution ofa formal Change Order would result in delay or stoppage of this work, A duly authorized Change Order shall replace this Field Order as soon as possible and shall bear appropriate reference to the Field Order. Building or Job: EH&S TEMPORARY FACILITY & THIMANN LAB ROOM 144 ROOM U. C. Project No. 1323 To the Contractor: Thayer Construction Inc. General Subject: Install New Water Assembly at EHS You are hereby authorized and instructed to effect the following modifications in your contract for the above project: 1) Install new water assemb

Extracting text from File 1639 Submittal # 05.16 Lee's Imperial Welding 3-26-03 E _20030619_0059.TIF usin

Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_6096\85131941.py", line 60, in <module>
    text = extractor(file_path)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\image_extraction.py", line 76, in __call__
    img = self.detect_and_correct_orientation(img)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\image_extraction.py", line 151, in detect_and_correct_orientation
    osd = pytesseract.image_to_osd(pil_img)
  File "c:\Users\adankert\projects\file_code_tagger\.venv\Lib\site-packages\pytesseract\pytesseract.py", line 621, in image_to_osd
    return {
           ~
    ...<2 lines>...
        Output.STRING: lambda: run_and_get_output(*args),
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    }[output_type]()
    ~~~~~~~~~~~~~~^^
  File "c:\Users\adankert\projects\file_code_tagger\.venv\Lib\site-packages\pytesseract\pytesseract.py", line 624, in <lambda>
    Output.STRING: lambda: run_and_g

Extracting text from north.html using HtmlTextExtractor
Extracted text (first 800 characters) from north.html:
The Core North THE CORE NORTH Core North includes some of the most heavily used academic facilities of the campus. It will be one of the most urbanized places on campus; the site of most of the campus science buildings, the science library and a proposed site for a future administration building . It is an area of heavy pedestrian and bicycle circulation. The aim of the plan is to build on the spirit of the plaza and circulation adjacent to the recently completed Science Library and to create a collection of vibrant places set within a larger forest landscape. This area is bounded by the middle branch of Jordan Gulch on the east, Steinhart Way on the south, Heller Drive on the west, and extensions of Moore Creek and Jordan Gulch on the north. GUIDELINES Cluster buildings to define the majo

Extracting text from Notice of Completion and Determination - Signed by Vani.pdf using 

Traceback (most recent call last):
  File "C:\Users\adankert\AppData\Local\Temp\ipykernel_6096\85131941.py", line 60, in <module>
    text = extractor(file_path)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\image_extraction.py", line 76, in __call__
    img = self.detect_and_correct_orientation(img)
  File "c:\Users\adankert\projects\file_code_tagger\text_extraction\image_extraction.py", line 151, in detect_and_correct_orientation
    osd = pytesseract.image_to_osd(pil_img)
  File "c:\Users\adankert\projects\file_code_tagger\.venv\Lib\site-packages\pytesseract\pytesseract.py", line 621, in image_to_osd
    return {
           ~
    ...<2 lines>...
        Output.STRING: lambda: run_and_get_output(*args),
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    }[output_type]()
    ~~~~~~~~~~~~~~^^
  File "c:\Users\adankert\projects\file_code_tagger\.venv\Lib\site-packages\pytesseract\pytesseract.py", line 624, in <lambda>
    Output.STRING: lambda: run_and_g

Output()

Output()

Output()

Output()

Output()

Output()

The output file size is 1.87× larger than the input file.
Possible reasons for this include:
--deskew was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.



Extracted text (first 800 characters) from Notice of Completion and Determination - Signed by Vani.pdf:
UNIVERSITY OF CALIFORNIA, SANTA CRUZ BERKELEY « DAVIS * IRVINE * LOS ANGELES * MERCED* RIVERSIDE * SAN DIEGO * SAN FRANCISCO SANTA BARBARA ¢ SANTA CRUZ PHYSICAL PLANNING AND CONSTRUCTION SANTA CRUZ, CALIFORNIA 95064 November 9, 2006 Terry Roberts State Clearinghouse 1400 Tenth Street Sacramento, CA 95814 ‘ NOTICE OF COMPLETION—MITIGATED NEGATIVE DECLARATION Project Title: Biomedical Sciences Facility Project Location: University of California, Santa Cruz Campus County: Santa Cruz In accordance with State CEQA Guidelines and the University of California (UC) CEQA Handbook, a Tiered Initial Study has been prepared for the above-named project. The Initial Study is tiered from the Environmental Impact Report for the UC Santa Cruz Long Range Development Plan 2005-2020 (2005 LRDP). The proposed 

Extracting text from orientation_hand_written_mess.tif using ImageTextExtractor
Extracted text

  warn(msg)
  warn(msg)


Extracted text (first 800 characters) from Phase B Const Schedule 041015.xlsx:
=== Sheet: Phase B === Building Scheduling Unique Bldg. ID Building Name TR (New, Existing, Abandon) Floor TR ID TR Room Number (New)TR Room Number FOPS Hours Engineering Hours Construction for TR (9097H) Construction for Cabling (9097J) Construction for Cabling (9097K) Construction for Cabling (9097L) ITS time not needed if it is concurrent to Contractor Time. ITS time if not concurrent to Contractor Time. Construction Dependencies Description of Work Required Voice Over IP Support Premium/After Hours FOPS Hours Premium/After Hours Engineering Hours Date work needs to be completed by. Unnamed: 28 Unnamed: 29 Unnamed: 30 Unnamed: 31 Unnamed: 32 Unnamed: 33 Priority ID BldgNme TR ID trroom trroom hours hours hours hours hours 2 7098 Interdisciplinary Sci Bldg. ADF 1 7098-1.1 105 X Closet mu

Extracting text from plot.log using TextFileTextExtractor
Extracted text (first 800 characters) from plot.log:
C:\Docum



  soup = BeautifulSoup(html, parser)


Extracted text (first 800 characters) from Sprink Rev to G.R e2.doc:
Project No. 2208-002 Communication Building Fire Sprinkler Renovation REVISIONS TO GENERAL REQUIREMENTS 1.00 SUMMARY OF WORK 1.01 WORK REQUIRED BY CONTRACT DOCUMENTS A. Provide a complete wet-pipe automatic sprinkler system, and associated equipment, ready for operation. a. Work includes designing, engineering and installing modifications to an existing wet-pipe automatic sprinkler system for the existing Communications Building Basement EDP Facility spaces as shown on the contract drawings to afford complete fire protection throughout the areas indicated. b. Provide an accurate Ghant (bar) chart for Contract schedule, demonstrating all major construction events. B. Current Language 4.00 SUBMITTALS 4.02 PROPOSED PRODUCTS, SAMPLES & SHOP DRAWINGS A. Current Language. B. Submitted shop drawi

Extracting text from Summary of Changes - ASTM D2513 (2011b-2012).pdf using PDFTextExtractor
Extracted text (first 800 characters