In [2]:
import re

def parse_entity(entity):
    lines = [line.strip() for line in entity.splitlines() if line.strip()]
    result = {
        "entity_number": None,
        "business_name": None,
        "dba_name": None,
        "address": None,
        "zipcode": None,
        "license_number": None,
        "status": None,
        "alcohol_type": None,
        "file_name": None
    }

    application_details = " ".join(lines).lower()  
    result["file_name"] = lines[-1]

    if lines:
        match = re.match(r"^(\d+)\.?\s*(.+)", lines[0])
        if match:
            result["entity_number"] = match.group(1)
            result["business_name"] = match.group(2).strip()
    
    for line in lines:
        lower_line = line.lower()

        if "doing business as:" in lower_line:
            dba_match = re.search(r"doing business as:\s*(.+)", line, re.IGNORECASE)
            if dba_match:
                result["dba_name"] = dba_match.group(1).strip()

        if "license" in lower_line:
            license_match = re.search(r"license\s*#:\s*([\w\-]+)", line, re.IGNORECASE)
            if license_match:
                result["license_number"] = license_match.group(1).strip()


        if not result["address"] and re.search(r"\d{5}$", line):
            result["address"] = line.strip()
            zip_match = re.search(r"\b(\d{5})\b", line)
            if zip_match:
                result["zipcode"] = zip_match.group(1)


    if (
        "applied" in application_details and
        "all-alcoholic beverages" in application_details and
        "common victualler" in application_details and
        "7 day" in application_details
    ):
        result["alcohol_type"] = "All Alcoholic Beverages"

    elif (
        "applied" in application_details and
        "wines and malt beverages" in application_details and
        "common victualler" in application_details and
        "7 day" in application_details
    ):
        result["alcohol_type"] = "Wines and Malt Beverages"

    return result


In [4]:
import os
import re
import fitz  
import json

def extract_entities_from_pdf(pdf_path):
    heading_regex = r'^\d+\.?\s+.*'
    entities = []
    current_entity_lines = []
    in_target_section = False
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening document '{pdf_path}': {e}")
        return []

    file_name = os.path.basename(pdf_path)
    print(f"\nProcessing document: {file_name}")

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)

        try:
            page_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
            for block in page_dict["blocks"]:
                if block['type'] == 0:
                    for line in block['lines']:
                        stop_processing_line = False
                        for span in line['spans']:
                            span_text = span['text'].strip()
                            if not span_text:
                                continue

                            if 'Transactional Hearing' in span_text:
                                in_target_section = True
                                continue

                            if 'Non-Hearing Transactions' in span_text:
                                in_target_section = False
                                stop_processing_line = True
                                break

                            heading_match = re.match(heading_regex, span_text)
                            if in_target_section and heading_match and span['flags'] == 20:
                                if current_entity_lines:
                                    current_entity_lines.append(file_name) 
                                    entities.append('\n'.join(current_entity_lines))
                                    current_entity_lines = []
                                current_entity_lines.append(span_text)
                            elif in_target_section:
                                current_entity_lines.append(span_text)

                        if stop_processing_line or not in_target_section:
                            pass
        except Exception as e:
            print(f"Error processing page {page_num + 1} in {pdf_path}: {e}")
            continue

    if current_entity_lines:
        current_entity_lines.append(file_name) 
        entities.append('\n'.join(current_entity_lines))

    doc.close()
    return entities


In [6]:
import os
import json

def process_pdf_folder(folder_name):
    script_dir = os.getcwd() 
    pdf_folder = os.path.join(script_dir, folder_name)     
    data_folder = os.path.join(script_dir, 'data')          

    os.makedirs(data_folder, exist_ok=True) 

    final_result = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            entities = extract_entities_from_pdf(pdf_path)
            for entity in entities:
                result = parse_entity(entity)
                if result['alcohol_type'] in ('Wines and Malt Beverages', 'All Alcoholic Beverages'):
                    result['file_name'] = filename
                    final_result.append(result)
                    print('--------------------------')

    output_file = os.path.join(data_folder, "data.json")
    with open(output_file, "w") as f:
        json.dump(final_result, f, indent=4)


process_pdf_folder('pdfs')


Processing document: Voting Minutes 2-13-25.docx.pdf
--------------------------
--------------------------
--------------------------

Processing document: Voting Minutes 1-9-25.docx_0.pdf
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
