In [1]:
from document_extractor import extract_itac_report

In [2]:
# Input reports used in this session.
# NOTE(review): these are absolute paths on one developer's machine; the notebook
# will not run elsewhere until they are pointed at a local copy of the documents.
DOC_PATH_1 = "/Users/afschowdhury/Code Local/itac-report-validator/docs/report1/LS2502 - Final Draft R2.docx"
DOC_PATH_2 = "/Users/afschowdhury/Code Local/itac-report-validator/docs/report2/LS2508 - Final Draft.docx"

In [3]:
# Run the packaged extractor on the second report and keep the per-section HTML.
# NOTE(review): save_files=True presumably writes the sections to disk — confirm in document_extractor.
html_out = extract_itac_report(DOC_PATH_2, output="html", save_files=True)

In [4]:
# Show the whole extraction result (a dict of section name -> HTML; the output is large).
html_out

{'general_information': "<p>General Information</p>\n<p></p>\n<table border='1' cellpadding='4' cellspacing='0' style='border-collapse:collapse;width:100%'><tr><td><p>SIC. No.: 3599</p></td><td><p>Annual Production: 1,800,000 parts/yr</p></td></tr><tr><td><p>NAICS Code: 332710</p></td><td><p>Annual Sales: $10.4 million</p></td></tr><tr><td><p>Principal Product: Machined Parts</p></td><td><p>Value per Finished Product: $5.78/parts </p></td></tr><tr><td><p>No. of Employees: 44</p></td><td><p>Total Energy Usage: 2,217 MMBTU/yr</p></td></tr><tr><td><p>Total Facility Area: 53,544 ft2</p></td><td><p>Total Utility Cost:  $79,508/yr</p></td></tr><tr><td><p>Operating Hours: 4,862 hrs/yr</p></td><td><p>No. of Assessment Recommendations: 8</p></td></tr></table>\n<p></p>",
 'annual_energy_usages_and_costs': '<p>Annual Energy Usages and Costs </p>\n<p></p>\n<p>Energy usage and the corresponding costs at the facility during the twelve-month period between September 2023 and August 2024 are summarize

In [6]:
# Turn the "General Information" HTML fragment into a dict of named fields.
from document_extractor import extract_general_info_fields


general_info = extract_general_info_fields(html_out['general_information'])

# Display the parsed fields for inspection.
general_info

{'sic_no': 3599.0,
 'annual_production': 1800000.0,
 'naics_code': 332710.0,
 'annual_sales': 10400000.0,
 'principal_product': 'Machined Parts',
 'value_per_finished_product': 5.78,
 'no_of_employees': 44.0,
 'total_energy_usage': 2217.0,
 'total_facility_area': 53544.0,
 'total_utility_cost': 79508.0,
 'operating_hours': 4862.0,
 'no_of_assessment_recommendations': 8.0}

In [7]:
# Raw HTML of the "Annual Energy Usages and Costs" section, displayed for inspection.
energy_usage = html_out['annual_energy_usages_and_costs']
energy_usage

'<p>Annual Energy Usages and Costs </p>\n<p></p>\n<p>Energy usage and the corresponding costs at the facility during the twelve-month period between September 2023 and August 2024 are summarized in Table 1-1:</p>\n<p></p>\n<p style="text-align:center">Table 1-1. The Facility Energy and Material Usage Summary</p>\n<table border=\'1\' cellpadding=\'4\' cellspacing=\'0\' style=\'border-collapse:collapse;width:100%\'><tr><td><p style="text-align:center"><b>Type</b></p></td><td><p style="text-align:center"><b>Usage</b></p></td><td><p style="text-align:center"><b>Cost </b></p></td><td><p style="text-align:center"><b>Unit Cost</b></p></td></tr><tr><td><p style="text-align:center"><b>Electrical Energy</b></p></td><td><p style="text-align:center">649,680 kWh/yr </p><p style="text-align:center">(2,217 MMBTU/yr)</p></td><td><p style="text-align:center">$66,137/yr</p></td><td><p style="text-align:center">$0.102/kWh</p></td></tr><tr><td><p style="text-align:center"><b>Electrical Demand</b></p></td>

In [8]:
# Parse the energy-usage HTML into structured data (reporting period plus one entry per table row).
from document_extractor import extract_energy_usage

energy_usage_data = extract_energy_usage(html_out["annual_energy_usages_and_costs"])
energy_usage_data

{'period': {'start': 'September 2023', 'end': 'August 2024'},
 'data': [{'type': 'Electrical Energy',
   'usage': {'kWh/yr': 649680.0, 'MMBTU/yr': 2217.0},
   'cost': 66137.0,
   'unit_cost': {'amount': 0.102, 'unit': 'kWh'}},
  {'type': 'Electrical Demand',
   'usage': {'kW/yr': 2726.0},
   'cost': 12326.0,
   'unit_cost': {'amount': 4.522, 'unit': 'kW'}},
  {'type': 'Propane',
   'usage': {'MMBTU/yr': 38.0},
   'cost': 1045.0,
   'unit_cost': {'amount': 27.5, 'unit': 'MMBTU'}},
  {'type': 'Total Utility',
   'usage': {'MMBTU/yr': 2255.0},
   'cost': 79508.0,
   'unit_cost': None}]}

In [9]:
# Extract total utility cost from energy usage data.
# total_utility_cost is bound on BOTH branches (None when the row is missing) so the
# comparison cells further down never raise NameError or silently reuse a stale
# value left in the kernel by an earlier run.
total_utility_info = next((item for item in energy_usage_data['data'] if item['type'] == 'Total Utility'), None)
if total_utility_info:
    total_utility_cost = total_utility_info['cost']
    print(f"Total Utility Cost: ${total_utility_cost}")
else:
    total_utility_cost = None
    print("Total Utility information not found")


Total Utility Cost: $79508.0


In [10]:
# Total utility cost taken from the "Total Utility" row of the energy-usage data.
total_utility_cost

79508.0

In [None]:
# Total utility cost as parsed from the General Information section.
general_info['total_utility_cost'] 

79508.0

In [12]:
# Consistency check: the two sections of the report should state the same total utility cost.
general_info['total_utility_cost'] == total_utility_cost

True

In [9]:
# pip install python-docx
from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph
import re
from typing import Iterable, List, Union, Optional, Dict, Any
import json

# Default document for main().
# NOTE(review): absolute path under /mnt/data — adjust to wherever the report lives locally.
DOCX_PATH = "/mnt/data/LS2502 - Final Draft R2.docx"

# ---------- Low-level helpers ----------

def iter_block_items(doc: Document) -> Iterable[Union[Paragraph, Table]]:
    """Yield the document body's paragraphs and tables in document order.

    Walks the direct children of ``doc.element.body``. ``CT_P`` elements are
    wrapped as ``Paragraph`` and ``CT_Tbl`` elements as ``Table``, both with
    ``doc`` as parent; any other child element is skipped.
    """
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for element in doc.element.body.iterchildren():
        for xml_type, wrapper in wrappers:
            if isinstance(element, xml_type):
                yield wrapper(element, doc)
                break

def normalize(s: str) -> str:
    """Return ``s`` with whitespace runs collapsed to single spaces and trimmed.

    Non-breaking spaces are converted to ordinary spaces first. A falsy
    input (``None`` or ``""``) yields the empty string.
    """
    text = s if s else ""
    text = text.replace("\xa0", " ")
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def para_alignment_name(p: Paragraph) -> str:
    """Map a paragraph's ``alignment`` value to a CSS text-align keyword.

    An alignment equal to 1 gives "center" and 2 gives "right"; every other
    value (including ``None``) gives "left".
    NOTE(review): 1 and 2 are presumably python-docx's CENTER and RIGHT
    enumeration values — confirm against the installed version.
    """
    for code, keyword in ((1, "center"), (2, "right")):
        if p.alignment == code:
            return keyword
    return "left"

def escape_html(text: str) -> str:
    """Escape ``&``, ``<`` and ``>`` so ``text`` is safe as HTML element content.

    The ampersand is handled first so the entities produced for ``<`` and
    ``>`` are not escaped a second time. Quotes are left untouched.
    """
    escaped = text
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
        escaped = escaped.replace(raw, entity)
    return escaped

# ---------- Renderers: HTML ----------

def paragraph_to_html(p: Paragraph) -> str:
    """Render one paragraph as a ``<p>`` element.

    Each run's text is HTML-escaped; runs whose text is empty are dropped.
    Bold runs are wrapped in ``<b>`` and italic runs in ``<i>`` (italic is
    the outer tag when both apply). A paragraph with no runs renders as
    ``<p></p>``. A ``text-align`` style is emitted only for non-left
    alignment.
    """
    runs = p.runs
    if not runs:
        return "<p></p>"

    def render(run) -> str:
        markup = escape_html(run.text)
        # Formatting is only consulted for runs that actually carry text.
        if markup and run.bold:
            markup = f"<b>{markup}</b>"
        if markup and run.italic:
            markup = f"<i>{markup}</i>"
        return markup

    body = "".join(filter(None, map(render, runs)))
    alignment = para_alignment_name(p)
    attr = "" if alignment == "left" else f' style="text-align:{alignment}"'
    return f"<p{attr}>{body}</p>"

def table_to_html(tbl: Table) -> str:
    """Render a table as a bordered, full-width HTML ``<table>``.

    One ``<tr>`` per row and one ``<td>`` per entry of ``row.cells``; each
    cell holds the concatenated HTML of its paragraphs.
    NOTE(review): for merged cells python-docx may return the same cell once
    per spanned grid column, which would repeat its content here — confirm
    against a report that contains merged cells.
    """
    opening = "<table border='1' cellpadding='4' cellspacing='0' style='border-collapse:collapse;width:100%'>"
    rendered_rows = (
        "<tr>"
        + "".join(
            "<td>" + "".join(map(paragraph_to_html, cell.paragraphs)) + "</td>"
            for cell in row.cells
        )
        + "</tr>"
        for row in tbl.rows
    )
    return opening + "".join(rendered_rows) + "</table>"

def blocks_to_html(blocks: List[Union[Paragraph, Table]]) -> str:
    """Render a sequence of blocks to HTML, one fragment per line.

    Paragraphs and tables are rendered with their dedicated renderers;
    anything else in ``blocks`` is ignored.
    """
    def render(block):
        if isinstance(block, Paragraph):
            return paragraph_to_html(block)
        if isinstance(block, Table):
            return table_to_html(block)
        return None  # unsupported block type: contributes nothing

    rendered = (render(block) for block in blocks)
    return "\n".join(fragment for fragment in rendered if fragment is not None)

# ---------- Renderers: JSON ----------

def paragraph_to_json(p: Paragraph) -> Dict[str, Any]:
    """Describe a paragraph as a JSON-serialisable dict.

    The result has ``type`` ("paragraph"), ``alignment`` (left/center/right)
    and ``runs``: one ``{"text", "bold", "italic"}`` entry per run with
    non-empty text. Bold/italic are coerced to plain booleans.
    """
    run_records = [
        {"text": run.text, "bold": bool(run.bold), "italic": bool(run.italic)}
        for run in p.runs
        if run.text
    ]
    return {"type": "paragraph", "alignment": para_alignment_name(p), "runs": run_records}

def table_to_json(tbl: Table) -> Dict[str, Any]:
    """Describe a table as a JSON-serialisable dict.

    ``rows`` is a list of rows, each a list of cells, each cell being
    ``{"paragraphs": [...]}`` with one paragraph record per cell paragraph.
    """
    rows = [
        [
            {"paragraphs": [paragraph_to_json(para) for para in cell.paragraphs]}
            for cell in row.cells
        ]
        for row in tbl.rows
    ]
    return {"type": "table", "rows": rows}

def blocks_to_json(blocks: List[Union[Paragraph, Table]]) -> List[Dict[str, Any]]:
    """Convert a sequence of blocks to a list of JSON-serialisable records.

    Paragraphs and tables are converted with their dedicated converters;
    anything else in ``blocks`` is ignored.
    """
    def convert(block):
        if isinstance(block, Paragraph):
            return paragraph_to_json(block)
        if isinstance(block, Table):
            return table_to_json(block)
        return None  # unsupported block type: contributes nothing

    converted = (convert(block) for block in blocks)
    return [record for record in converted if record is not None]

# ---------- Finding utilities ----------

def slice_blocks(doc_blocks: List[Union[Paragraph, Table]], start: int, end: Optional[int]) -> List[Union[Paragraph, Table]]:
    """Return ``doc_blocks[start:end]``; an ``end`` of ``None`` means "to the last block"."""
    # Slicing already treats a None upper bound as open-ended.
    return doc_blocks[start:end]

def find_first_index(doc_blocks: List[Union[Paragraph, Table]], pred) -> Optional[int]:
    """Return the index of the first paragraph whose normalized text satisfies ``pred``.

    Tables are never tested. Returns ``None`` when no paragraph matches.
    """
    matches = (
        position
        for position, block in enumerate(doc_blocks)
        if isinstance(block, Paragraph) and pred(normalize(block.text))
    )
    return next(matches, None)

def find_next_index(doc_blocks: List[Union[Paragraph, Table]], from_idx: int, pred) -> Optional[int]:
    """Return the index of the first matching paragraph strictly after ``from_idx``.

    A paragraph matches when ``pred`` accepts its normalized text; tables
    are never tested. Returns ``None`` when nothing after ``from_idx`` matches.
    """
    position = from_idx + 1
    total = len(doc_blocks)
    while position < total:
        candidate = doc_blocks[position]
        if isinstance(candidate, Paragraph) and pred(normalize(candidate.text)):
            return position
        position += 1
    return None

def find_table_after_caption(doc_blocks: List[Union[Paragraph, Table]], caption_pred) -> Optional[Table]:
    """Return the first table that follows the first caption paragraph matching ``caption_pred``.

    Only the first matching caption is considered. Returns ``None`` when no
    caption matches or when no table follows that caption.
    """
    caption_idx = None
    for idx, block in enumerate(doc_blocks):
        if isinstance(block, Paragraph) and caption_pred(normalize(block.text)):
            caption_idx = idx
            break
    if caption_idx is None:
        return None
    following = doc_blocks[caption_idx + 1:]
    return next((block for block in following if isinstance(block, Table)), None)

# ---------- Document-specific finders ----------

# Major section anchors (flexible: accept plain titles)
def EXEC_SUMMARY_PRED(t):
    """True when ``t`` starts with "EXECUTIVE SUMMARY" (case-insensitive)."""
    return t.upper().startswith("EXECUTIVE SUMMARY")


def NEXT_AFTER_EXEC_PRED(t):
    """True when ``t`` starts with "GENERAL FACILITY BACKGROUND" (case-insensitive)."""
    return t.upper().startswith("GENERAL FACILITY BACKGROUND")


# Exec-summary sub-section titles in this file (no numbers in body)
def SUB_11_PRED(t):
    """True when ``t`` starts with "general information" (case-insensitive)."""
    return t.lower().startswith("general information")


def SUB_12_PRED(t):
    """True when ``t`` starts with "annual energy usages and costs" (case-insensitive)."""
    return t.lower().startswith("annual energy usages and costs")


def SUB_13_PRED(t):
    """True when ``t`` starts with "carbon footprint" (case-insensitive)."""
    return t.lower().startswith("carbon footprint")

# Recommendation Summary caption, e.g. "Table 13. The Assessment Recommendation Summary Table".
# Other captions in these reports use chapter-style numbers ("Table 1-1. ..."), so an
# optional "-", "." or en dash between the 1 and the 3 is accepted as well
# ("Table 1-3", "Table 1.3"). Returns a plain bool.
REC_TABLE_CAPTION_PRED = lambda t: bool(re.match(r"^table\s*1[-.\u2013]?3\b.*assessment recommendation summary table", t, flags=re.I))

# AR title patterns:
#   "4.1  AR No. 1 – ..."  OR  "AR No. 1 – ..."
AR_TITLE_PATTERNS = [
    re.compile(pattern, re.I)
    for pattern in (r"^4\.\d+\s+AR\s+No\.\s*\d+\b", r"^AR\s+No\.\s*\d+\b")
]


def is_ar_title(t: str) -> bool:
    """Return True when the normalized text begins like an AR heading.

    The text is whitespace-normalized first and then tested against every
    entry of ``AR_TITLE_PATTERNS`` (case-insensitive, anchored at the start).
    """
    cleaned = normalize(t)
    for pattern in AR_TITLE_PATTERNS:
        if pattern.match(cleaned):
            return True
    return False

def extract_exec_summary_blocks(blocks: List[Union[Paragraph, Table]]) -> List[Union[Paragraph, Table]]:
    """Return the blocks of the Executive Summary section.

    The section starts at the first paragraph accepted by
    ``EXEC_SUMMARY_PRED`` and runs up to (not including) the next paragraph
    accepted by ``NEXT_AFTER_EXEC_PRED``, or to the end of the document when
    there is none. Returns ``[]`` when the section title is not found.
    """
    first = find_first_index(blocks, EXEC_SUMMARY_PRED)
    if first is not None:
        stop = find_next_index(blocks, first, NEXT_AFTER_EXEC_PRED)
        return slice_blocks(blocks, first, stop)
    return []

def extract_subsection(blocks: List[Union[Paragraph, Table]], title_pred, stop_preds: List) -> List[Union[Paragraph, Table]]:
    """Return the blocks of one subsection, title paragraph included.

    The subsection starts at the first paragraph accepted by ``title_pred``
    and ends just before the nearest later paragraph accepted by any of
    ``stop_preds``; with no such paragraph it runs to the end of ``blocks``.
    Returns ``[]`` when the title is not found.
    """
    begin = find_first_index(blocks, title_pred)
    if begin is None:
        return []
    hits = (find_next_index(blocks, begin, stop_pred) for stop_pred in stop_preds)
    # Earliest stop heading wins; None keeps the slice open-ended.
    finish = min((hit for hit in hits if hit is not None), default=None)
    return slice_blocks(blocks, begin, finish)

def extract_ars(blocks: List[Union[Paragraph, Table]]) -> List[List[Union[Paragraph, Table]]]:
    """Split the document into one block list per Assessment Recommendation (AR).

    Every paragraph accepted by ``is_ar_title`` opens an AR. The AR ends just
    before whichever comes first: the next AR title, or the next paragraph
    whose text starts with "5" followed by "." or end of text (start of the
    next major section). With neither present it runs to the end.

    NOTE(review): the recorded run in this notebook reports 10 ARs while the
    report's General Information table lists 5 recommendations, so AR titles
    may be matched more than once (e.g. table-of-contents entries) — confirm.
    """
    def starts_section_five(text):
        return bool(re.match(r"^\s*5(\.|$)", text))

    title_positions = [
        position
        for position, block in enumerate(blocks)
        if isinstance(block, Paragraph) and is_ar_title(block.text)
    ]
    # Pair every title with the title that follows it (None for the last one).
    following_titles = title_positions[1:] + [None]

    sections = []
    for begin, next_title in zip(title_positions, following_titles):
        next_major = find_next_index(blocks, begin, starts_section_five)
        bounds = [pos for pos in (next_title, next_major) if pos is not None]
        finish = min(bounds) if bounds else None
        sections.append(slice_blocks(blocks, begin, finish))
    return sections

# ---------- Build outputs ----------

def build_outputs(blocks: List[Union[Paragraph, Table]], output: str) -> Dict[str, Any]:
    """Extract the report sections and render them as HTML or JSON.

    Args:
        blocks: all paragraphs and tables of the document, in order.
        output: "json" selects the JSON renderers; any other value selects HTML.

    Returns:
        A dict with the keys ``general_information``,
        ``annual_energy_usages_and_costs``, ``carbon_footprint``,
        ``recommendation_summary_table`` and ``assessment_recommendations``
        (a list with one entry per detected AR). A missing recommendation
        table is ``None`` in JSON mode and ``""`` in HTML mode.
    """
    exec_blocks = extract_exec_summary_blocks(blocks)

    def best_practices_pred(t):
        # Heading that closes the last Executive Summary subsection.
        return t.lower().startswith("summary of best practices")

    # find the three subsections inside Executive Summary by their plain titles
    # Stop rules: next subsection title or the summary section title
    sub_11 = extract_subsection(exec_blocks, SUB_11_PRED, [SUB_12_PRED, SUB_13_PRED, best_practices_pred])
    sub_12 = extract_subsection(exec_blocks, SUB_12_PRED, [SUB_13_PRED, best_practices_pred])
    sub_13 = extract_subsection(exec_blocks, SUB_13_PRED, [best_practices_pred])

    # Recommendation summary table: prefer the one inside the Executive Summary,
    # but fall back to the whole document when it is not found there. Previously
    # the fallback only happened when the Executive Summary itself was missing.
    rec_tbl = find_table_after_caption(exec_blocks, REC_TABLE_CAPTION_PRED)
    if rec_tbl is None:
        rec_tbl = find_table_after_caption(blocks, REC_TABLE_CAPTION_PRED)

    # ARs (section 4.*), detected by title forms
    ar_lists = extract_ars(blocks)

    if output == "json":
        render_blocks, render_table, missing_table = blocks_to_json, table_to_json, None
    else:
        render_blocks, render_table, missing_table = blocks_to_html, table_to_html, ""

    return {
        "general_information": render_blocks(sub_11),
        "annual_energy_usages_and_costs": render_blocks(sub_12),
        "carbon_footprint": render_blocks(sub_13),
        "recommendation_summary_table": (render_table(rec_tbl) if rec_tbl is not None else missing_table),
        "assessment_recommendations": [render_blocks(b) for b in ar_lists],
    }

def write_artifacts(payload: Dict[str, Any], output: str, out_dir: str = ".") -> None:
    """Write the extracted sections to disk.

    Args:
        payload: the dict produced by ``build_outputs``.
        output: "json" writes a single ``extracted_sections.json``; any other
            value writes one ``.html`` file per section plus ``AR_NN.html``
            for each assessment recommendation (numbered from 01).
        out_dir: directory that receives the files. Defaults to the current
            working directory (the previous hard-coded behaviour); it is
            created when it does not exist.
    """
    import os  # local import: the notebook's import cell does not bring in os

    os.makedirs(out_dir, exist_ok=True)

    def write(name: str, content: str) -> None:
        with open(os.path.join(out_dir, name), "w", encoding="utf-8") as f:
            f.write(content)

    if output == "json":
        with open(os.path.join(out_dir, "extracted_sections.json"), "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        return

    for section in (
        "general_information",
        "annual_energy_usages_and_costs",
        "carbon_footprint",
        "recommendation_summary_table",
    ):
        write(f"{section}.html", payload[section])
    for i, html in enumerate(payload["assessment_recommendations"], start=1):
        write(f"AR_{i:02d}.html", html)

def main(docx_path: str = DOCX_PATH, output: str = "html", save_files: bool = True) -> Dict[str, Any]:
    """
    Load a .docx report, extract its sections and return them.

    docx_path: document to open (defaults to DOCX_PATH)
    output: "html" or "json"; anything else fails the assertion below
    save_files: when True, the extracted data is also handed to write_artifacts
    """
    assert output in {"html", "json"}, "output must be 'html' or 'json'"
    blocks = list(iter_block_items(Document(docx_path)))
    data = build_outputs(blocks, output=output)
    if not save_files:
        return data
    write_artifacts(data, output=output)
    return data


# DOC_PATH is not defined anywhere in this notebook, so a fresh kernel raised
# NameError here. DOC_PATH_1 is the LS2502 report (same file name as DOCX_PATH).
# NOTE(review): confirm this is the document the recorded output was produced from.
html_out = main(DOC_PATH_1, output="html", save_files=True)
print("HTML:", len(html_out["assessment_recommendations"]), "ARs")
json_out = main(DOC_PATH_1, output="json", save_files=True)
print("JSON:", len(json_out["assessment_recommendations"]), "ARs")


HTML: 10 ARs
JSON: 10 ARs


In [10]:
# Show the HTML extraction produced by main() (large output).
html_out

{'general_information': "<p>General Information</p>\n<p></p>\n<table border='1' cellpadding='4' cellspacing='0' style='border-collapse:collapse;width:100%'><tr><td><p>SIC. No.: 3491</p></td><td><p>Annual Production: 6,200 units/yr</p></td></tr><tr><td><p>NAICS Code: 332911</p></td><td><p>Annual Sales: $35,000,000/yr</p></td></tr><tr><td><p>Principal Product: Industrial Valves</p></td><td><p>Value per Finished Product: $5,645/unit</p></td></tr><tr><td><p>No. of Employees: 120</p></td><td><p>Total Energy Usage: 11,962 MMBTU/yr</p></td></tr><tr><td><p>Total Facility Area: 211,185 ft2</p></td><td><p>Total Utility Cost:  $340,614</p></td></tr><tr><td><p>Operating Hours: 5,616 hr/yr</p></td><td><p>No. of Assessment Recommendations: 5</p></td></tr></table>\n<p></p>",
 'annual_energy_usages_and_costs': '<p>Annual Energy Usages and Costs </p>\n<p></p>\n<p>Energy usage and the corresponding costs at the facility during the twelve-month period between June 2023 and July 2024 are summarized in Tab

In [11]:
# NOTE(review): this rebinds general_info. Earlier in the notebook the name held the
# parsed field dict; from here on it is the raw HTML string of the section.
general_info = html_out["general_information"]

In [12]:
# Display the General Information section as HTML.
general_info

"<p>General Information</p>\n<p></p>\n<table border='1' cellpadding='4' cellspacing='0' style='border-collapse:collapse;width:100%'><tr><td><p>SIC. No.: 3491</p></td><td><p>Annual Production: 6,200 units/yr</p></td></tr><tr><td><p>NAICS Code: 332911</p></td><td><p>Annual Sales: $35,000,000/yr</p></td></tr><tr><td><p>Principal Product: Industrial Valves</p></td><td><p>Value per Finished Product: $5,645/unit</p></td></tr><tr><td><p>No. of Employees: 120</p></td><td><p>Total Energy Usage: 11,962 MMBTU/yr</p></td></tr><tr><td><p>Total Facility Area: 211,185 ft2</p></td><td><p>Total Utility Cost:  $340,614</p></td></tr><tr><td><p>Operating Hours: 5,616 hr/yr</p></td><td><p>No. of Assessment Recommendations: 5</p></td></tr></table>\n<p></p>"

In [13]:
# Same section taken from the JSON rendering (a list of paragraph/table records).
general_info_json = json_out["general_information"]

In [14]:
# Display the JSON records; note how one visible value can be split across several runs.
general_info_json

[{'type': 'paragraph',
  'alignment': 'left',
  'runs': [{'text': 'General Information', 'bold': False, 'italic': False}]},
 {'type': 'paragraph', 'alignment': 'left', 'runs': []},
 {'type': 'table',
  'rows': [[{'paragraphs': [{'type': 'paragraph',
       'alignment': 'left',
       'runs': [{'text': 'SIC. No.: ', 'bold': False, 'italic': False},
        {'text': '3491', 'bold': False, 'italic': False}]}]},
    {'paragraphs': [{'type': 'paragraph',
       'alignment': 'left',
       'runs': [{'text': 'Annual Production:', 'bold': False, 'italic': False},
        {'text': ' ', 'bold': False, 'italic': False},
        {'text': '6,200', 'bold': False, 'italic': False},
        {'text': ' ', 'bold': False, 'italic': False},
        {'text': 'u', 'bold': False, 'italic': False},
        {'text': 'nits', 'bold': False, 'italic': False},
        {'text': '/yr', 'bold': False, 'italic': False}]}]}],
   [{'paragraphs': [{'type': 'paragraph',
       'alignment': 'left',
       'runs': [{'text':