In [42]:
from pathlib import Path
import zipfile
import xml.etree.ElementTree as ET

In [43]:
# Folder the notebook globs for *.xlsx workbooks.
DATA_DIR = Path("data")

# Excel XML namespaces (DO NOT SKIP THIS): ElementTree only resolves
# prefixed paths such as "main:sheet" when it is handed this mapping.
EXCEL_NS = dict(
    main="http://schemas.openxmlformats.org/spreadsheetml/2006/main",
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
)

In [44]:
def safe_read(z, name):
    """
    Return the bytes of archive member ``name``, or ``None`` if it is absent.

    Args:
        z: Open ``zipfile.ZipFile``.
        name: Member name to look up in ``z.namelist()``.
    """
    if name not in z.namelist():
        return None
    return z.read(name)


In [45]:
def discover_excel_sheets(xlsx_path: Path):
    """
    List the worksheets declared in a workbook together with their XML part paths.

    Reads ``xl/workbook.xml`` for the ``<sheet>`` entries and
    ``xl/_rels/workbook.xml.rels`` for the relationship each entry refers to.
    Entries whose relationship is missing, or whose relationship type does not
    end in ``/worksheet``, are left out of the result.

    Args:
        xlsx_path: Path of the .xlsx file to open.

    Returns:
        list[dict]: one dict per worksheet with the keys ``sheet_name``,
        ``sheet_id``, ``rId`` and ``xml_path`` (zip member name of the sheet).

    Raises:
        ValueError: if ``xl/workbook.xml`` or its relationships part is missing.
        KeyError: if a ``<sheet>`` element lacks ``name``, ``sheetId`` or ``r:id``.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import posixpath

    rel_id_attr = (
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
    )

    with zipfile.ZipFile(xlsx_path) as z:
        members = set(z.namelist())

        if "xl/workbook.xml" not in members:
            raise ValueError("Missing xl/workbook.xml")

        workbook_root = ET.fromstring(z.read("xl/workbook.xml"))

        sheets = []
        for sheet in workbook_root.findall(".//main:sheet", EXCEL_NS):
            sheets.append({
                "sheet_name": sheet.attrib["name"],
                "sheet_id": sheet.attrib["sheetId"],
                "rId": sheet.attrib[rel_id_attr],
            })

        if "xl/_rels/workbook.xml.rels" not in members:
            raise ValueError("Missing workbook relationships")

        rels_root = ET.fromstring(z.read("xl/_rels/workbook.xml.rels"))

        # --- READ TARGET + TYPE ---
        rels = {}
        for rel in rels_root.findall(".//"):
            rid = rel.attrib.get("Id")
            target = rel.attrib.get("Target")
            rtype = rel.attrib.get("Type")

            if rid and target and rtype:
                rels[rid] = {"target": target, "type": rtype}

        valid_sheets = []

        for s in sheets:
            rel = rels.get(s["rId"])

            if not rel:
                # no relationship → skip
                continue

            if not rel["type"].endswith("/worksheet"):
                # chartsheet / dialog / navigation → skip
                continue

            # Targets are relative to xl/workbook.xml, but may also be written
            # as absolute part names ("/xl/worksheets/sheet1.xml") or contain
            # ".." segments. join() lets an absolute target win, normpath()
            # collapses the dots, and zip member names never start with "/".
            s["xml_path"] = posixpath.normpath(
                posixpath.join("xl", rel["target"])
            ).lstrip("/")
            valid_sheets.append(s)

        return valid_sheets


In [46]:
all_workbooks = []

# Names starting with "~$" are skipped as temp Excel files.
for xlsx in DATA_DIR.glob("*.xlsx"):
    if not xlsx.name.startswith("~$"):
        try:
            sheets = discover_excel_sheets(xlsx)
        except Exception as e:
            # One unreadable workbook is reported and the scan carries on.
            print(f"❌ Failed to read {xlsx.name}: {e}")
        else:
            all_workbooks.append(
                dict(file_name=xlsx.name, file_path=xlsx, sheets=sheets)
            )


In [47]:
all_workbooks

[{'file_name': 'DFMC M18-3 FBL SRS Rev 2.5_F002_ET0_20241008.xlsx',
  'file_path': WindowsPath('data/DFMC M18-3 FBL SRS Rev 2.5_F002_ET0_20241008.xlsx'),
  'sheets': [{'sheet_name': '1.Revision',
    'sheet_id': '2',
    'rId': 'rId1',
    'xml_path': 'xl/worksheets/sheet1.xml'},
   {'sheet_name': '2.GeneralInfo',
    'sheet_id': '3',
    'rId': 'rId2',
    'xml_path': 'xl/worksheets/sheet2.xml'},
   {'sheet_name': '3.TimingPar',
    'sheet_id': '4',
    'rId': 'rId3',
    'xml_path': 'xl/worksheets/sheet3.xml'},
   {'sheet_name': '4.FBL-Services',
    'sheet_id': '5',
    'rId': 'rId4',
    'xml_path': 'xl/worksheets/sheet4.xml'},
   {'sheet_name': '5.Routine DID',
    'sheet_id': '7',
    'rId': 'rId5',
    'xml_path': 'xl/worksheets/sheet5.xml'},
   {'sheet_name': '6_DIDs',
    'sheet_id': '8',
    'rId': 'rId6',
    'xml_path': 'xl/worksheets/sheet6.xml'},
   {'sheet_name': '6_1 Default value',
    'sheet_id': '9',
    'rId': 'rId7',
    'xml_path': 'xl/worksheets/sheet7.xml'},
   

In [48]:
xlsx

WindowsPath('data/~$GAC DPT_T9M_CDC_V0.2_20240729.XLSX')

In [49]:
def discover_tables_for_sheet(z, sheet_xml_path):
    """
    Find the table parts referenced by a worksheet.

    Each ``<tablePart>`` in the sheet is followed through the sheet's
    relationships part to the table XML, whose ``name`` and ``ref``
    attributes are reported.

    Args:
        z: Open ``zipfile.ZipFile`` of the workbook.
        sheet_xml_path: Zip member name of the worksheet XML.

    Returns:
        list[dict]: one dict per table with the keys ``table_path``, ``name``
        and ``ref``. Empty when the sheet has no relationships part or no
        resolvable table.

    Raises:
        KeyError: if ``sheet_xml_path`` is not a member of the archive.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import posixpath

    rel_id_attr = (
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
    )

    sheet_root = ET.fromstring(z.read(sheet_xml_path))
    members = set(z.namelist())

    # --- load worksheet relationships ---
    # The relationships part sits in a "_rels" folder next to the sheet
    # itself, so derive it from the sheet's location rather than assuming
    # every sheet is stored under xl/worksheets/.
    sheet_dir, sheet_file = posixpath.split(sheet_xml_path)
    rels_path = posixpath.join(sheet_dir, "_rels", sheet_file + ".rels")
    if rels_path not in members:
        return []

    rels_root = ET.fromstring(z.read(rels_path))

    rels = {}
    for rel in rels_root.findall(".//"):
        rid = rel.attrib.get("Id")
        target = rel.attrib.get("Target")
        rtype = rel.attrib.get("Type")
        if rid and target and rtype:
            rels[rid] = {"target": target, "type": rtype}

    tables = []

    for tp in sheet_root.findall(".//main:tablePart", EXCEL_NS):
        rel = rels.get(tp.attrib.get(rel_id_attr))
        if not rel:
            continue

        if not rel["type"].endswith("/table"):
            continue

        table_path = resolve_xl_path(sheet_xml_path, rel["target"])
        if table_path not in members:
            continue
        table_root = ET.fromstring(z.read(table_path))

        tables.append({
            "table_path": table_path,
            "name": table_root.attrib.get("name"),
            "ref": table_root.attrib.get("ref"),   # e.g. A1:D42
        })

    return tables


In [50]:
# Namespace prefixes used when searching drawing parts:
# "xdr" for the spreadsheet-drawing elements, "a" for DrawingML text.
DRAWING_NS = dict(
    xdr="http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
    a="http://schemas.openxmlformats.org/drawingml/2006/main",
)

In [51]:
def discover_drawings_for_sheet(z, sheet_xml_path):
    """
    Collect the shapes and connectors of the drawing attached to a worksheet.

    Args:
        z: Open ``zipfile.ZipFile`` of the workbook.
        sheet_xml_path: Zip member name of the worksheet XML.

    Returns:
        dict | None: ``{"drawing_path": ..., "elements": [...]}`` where every
        element is ``{"type": "shape", "text": ...}`` or
        ``{"type": "connector"}`` (shapes first, then connectors).
        ``None`` when the sheet has no ``<drawing>`` element, its relationship
        cannot be resolved, or the drawing part is not in the archive.

    Raises:
        KeyError: if ``sheet_xml_path`` is not a member of the archive.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import posixpath

    rel_id_attr = (
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
    )

    sheet_root = ET.fromstring(z.read(sheet_xml_path))

    drawing_elem = sheet_root.find(".//main:drawing", EXCEL_NS)
    if drawing_elem is None:
        return None

    rid = drawing_elem.attrib.get(rel_id_attr)
    if rid is None:
        # Without an id there is nothing to look up; comparing None against
        # relationships would wrongly match one that itself has no "Id".
        return None

    members = set(z.namelist())

    # The relationships part sits in a "_rels" folder next to the sheet, so
    # derive it from the sheet's location instead of assuming xl/worksheets/.
    sheet_dir, sheet_file = posixpath.split(sheet_xml_path)
    rels_path = posixpath.join(sheet_dir, "_rels", sheet_file + ".rels")
    if rels_path not in members:
        return None

    rels_root = ET.fromstring(z.read(rels_path))

    drawing_target = None
    for rel in rels_root.findall(".//"):
        if rel.attrib.get("Id") == rid:
            drawing_target = rel.attrib.get("Target")
            break

    if not drawing_target:
        return None

    drawing_path = resolve_xl_path(sheet_xml_path, drawing_target)

    if drawing_path not in members:
        return None

    drawing_root = ET.fromstring(z.read(drawing_path))

    shapes = []

    for sp in drawing_root.findall(".//xdr:sp", DRAWING_NS):
        # A shape's text can be split over several <a:t> runs.
        texts = sp.findall(".//a:t", DRAWING_NS)
        shapes.append({
            "type": "shape",
            "text": " ".join(t.text for t in texts if t.text),
        })

    # Connectors are only counted; nothing is read from the element itself.
    shapes.extend(
        {"type": "connector"}
        for _ in drawing_root.findall(".//xdr:cxnSp", DRAWING_NS)
    )

    return {
        "drawing_path": drawing_path,
        "elements": shapes
    }


In [52]:
def resolve_xl_path(base_path, target):
    """
    Resolve OpenXML relationship targets like ../drawings/drawing1.xml
    into normalized xl/... paths

    The resolution is purely textual. ``Path.resolve()`` must not be used
    here: it anchors the result at the current working directory of the
    local filesystem, which yields names that never exist inside the archive.

    Args:
        base_path: Zip member name of the part that owns the relationship,
            e.g. ``xl/worksheets/sheet1.xml``.
        target: ``Target`` attribute of the relationship; either relative to
            the directory of ``base_path`` or an absolute part name starting
            with ``/``.

    Returns:
        str: zip member name with ``.``/``..`` segments collapsed and no
        leading slash.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import posixpath

    # Zip member names always use forward slashes, whatever the host OS.
    base_dir = posixpath.dirname(str(base_path).replace("\\", "/"))

    # join() lets an absolute target replace base_dir entirely.
    resolved = posixpath.normpath(posixpath.join(base_dir, target))

    # Zip files never start with /
    return resolved.lstrip("/")


In [53]:
# Results are accumulated per workbook. Without this list each iteration
# overwrites `sheets`, so only the last workbook would remain inspectable.
workbooks_with_parts = []

for xlsx in DATA_DIR.glob("*.xlsx"):
    if xlsx.name.startswith("~$"):
        continue  # skip temp Excel files

    try:
        sheets = discover_excel_sheets(xlsx)

        with zipfile.ZipFile(xlsx) as z:
            for sheet in sheets:
                tables = discover_tables_for_sheet(z, sheet["xml_path"])
                drawings = discover_drawings_for_sheet(z, sheet["xml_path"])
                sheet["tables"] = tables
                sheet["drawings"] = drawings
    except Exception as e:
        # Report the failing workbook and continue with the others, the same
        # way the sheet-discovery loop does.
        print(f"❌ Failed to read {xlsx.name}: {e}")
        continue

    workbooks_with_parts.append({
        "file_name": xlsx.name,
        "file_path": xlsx,
        "sheets": sheets
    })


In [54]:
sheets

[{'sheet_name': 'Version Control',
  'sheet_id': '51',
  'rId': 'rId1',
  'xml_path': 'xl/worksheets/sheet1.xml',
  'tables': [],
  'drawings': None},
 {'sheet_name': 'Timeline Tracker',
  'sheet_id': '66',
  'rId': 'rId2',
  'xml_path': 'xl/worksheets/sheet2.xml',
  'tables': [],
  'drawings': None},
 {'sheet_name': 'ServiceIdentifiers',
  'sheet_id': '55',
  'rId': 'rId3',
  'xml_path': 'xl/worksheets/sheet3.xml',
  'tables': [],
  'drawings': None},
 {'sheet_name': 'Diagnostics_OPL',
  'sheet_id': '68',
  'rId': 'rId4',
  'xml_path': 'xl/worksheets/sheet4.xml',
  'tables': [],
  'drawings': None},
 {'sheet_name': 'Standard Identifiers ',
  'sheet_id': '71',
  'rId': 'rId5',
  'xml_path': 'xl/worksheets/sheet5.xml',
  'tables': [],
  'drawings': None},
 {'sheet_name': 'Naming Convention',
  'sheet_id': '72',
  'rId': 'rId6',
  'xml_path': 'xl/worksheets/sheet6.xml',
  'tables': [],
  'drawings': None},
 {'sheet_name': 'Timing Parameters',
  'sheet_id': '23',
  'rId': 'rId7',
  'xml_p

In [1]:
from pathlib import Path
import zipfile
import xml.etree.ElementTree as ET
from pathlib import PurePosixPath

In [2]:
def safe_read(z, name):
    """
    Read archive member ``name`` from the open zip ``z``.

    Returns the member's bytes, or ``None`` when ``name`` is not listed in
    ``z.namelist()``.
    """
    present = name in z.namelist()
    return z.read(name) if present else None


In [3]:
def discover_excel_sheets(xlsx_path: Path):
    """
    List the worksheets declared in a workbook together with their XML part paths.

    Reads ``xl/workbook.xml`` for the ``<sheet>`` entries and
    ``xl/_rels/workbook.xml.rels`` for the relationship each entry refers to.
    Entries whose relationship is missing, or whose relationship type does not
    end in ``/worksheet``, are left out of the result.

    Args:
        xlsx_path: Path of the .xlsx file to open.

    Returns:
        list[dict]: one dict per worksheet with the keys ``sheet_name``,
        ``sheet_id``, ``rId`` and ``xml_path`` (zip member name of the sheet).

    Raises:
        ValueError: if ``xl/workbook.xml`` or its relationships part is missing.
        KeyError: if a ``<sheet>`` element lacks ``name``, ``sheetId`` or ``r:id``.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import posixpath

    rel_id_attr = (
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
    )

    with zipfile.ZipFile(xlsx_path) as z:
        members = set(z.namelist())

        if "xl/workbook.xml" not in members:
            raise ValueError("Missing xl/workbook.xml")

        workbook_root = ET.fromstring(z.read("xl/workbook.xml"))

        sheets = []
        for sheet in workbook_root.findall(".//main:sheet", EXCEL_NS):
            sheets.append({
                "sheet_name": sheet.attrib["name"],
                "sheet_id": sheet.attrib["sheetId"],
                "rId": sheet.attrib[rel_id_attr],
            })

        if "xl/_rels/workbook.xml.rels" not in members:
            raise ValueError("Missing workbook relationships")

        rels_root = ET.fromstring(z.read("xl/_rels/workbook.xml.rels"))

        # --- READ TARGET + TYPE ---
        rels = {}
        for rel in rels_root.findall(".//"):
            rid = rel.attrib.get("Id")
            target = rel.attrib.get("Target")
            rtype = rel.attrib.get("Type")

            if rid and target and rtype:
                rels[rid] = {"target": target, "type": rtype}

        valid_sheets = []

        for s in sheets:
            rel = rels.get(s["rId"])

            if not rel:
                # no relationship → skip
                continue

            if not rel["type"].endswith("/worksheet"):
                # chartsheet / dialog / navigation → skip
                continue

            # Targets are relative to xl/workbook.xml, but may also be written
            # as absolute part names ("/xl/worksheets/sheet1.xml") or contain
            # ".." segments. join() lets an absolute target win, normpath()
            # collapses the dots, and zip member names never start with "/".
            s["xml_path"] = posixpath.normpath(
                posixpath.join("xl", rel["target"])
            ).lstrip("/")
            valid_sheets.append(s)

        return valid_sheets


In [4]:
def parse_sheet_metadata(root):
    """
    Extract the declared cell range of a worksheet.

    Args:
        root: Parsed worksheet root element.

    Returns:
        dict: ``{"dimension": ref}`` where ``ref`` is the ``ref`` attribute of
        the sheet's direct ``<dimension>`` child, or ``None`` when the sheet
        has no such child.
    """
    dimension_elem = root.find("x:dimension", NS)
    if dimension_elem is None:
        declared_range = None
    else:
        declared_range = dimension_elem.attrib["ref"]
    return {"dimension": declared_range}


In [5]:
def load_shared_strings(z):
    """
    Load the workbook's shared-string table as a list of plain strings.

    Args:
        z: Open ``zipfile.ZipFile`` of the workbook.

    Returns:
        list[str]: one entry per ``<si>`` item, in document order, so the
        list index matches the index stored in ``t="s"`` cells. Empty when
        the archive has no ``xl/sharedStrings.xml``.
    """
    try:
        root = ET.fromstring(z.read("xl/sharedStrings.xml"))
    except KeyError:
        # ZipFile.read raises KeyError for a missing member.
        return []

    strings = []
    for si in root.findall("x:si", NS):
        # An item is either a single <t> or a series of rich-text runs
        # (<r><t>…</t></r>). Reading only the first <t> would truncate rich
        # text, so every run is concatenated. Only these two paths are
        # searched, which keeps phonetic-hint text (<rPh>) out of the value.
        runs = si.findall("x:t", NS) + si.findall("x:r/x:t", NS)
        # An empty <t/> has text None; treat it as an empty string.
        strings.append("".join(t.text or "" for t in runs))

    return strings


In [6]:
def parse_sheet_data(root, shared_strings):
    """
    Read every row of a worksheet into plain Python structures.

    Args:
        root: Parsed worksheet root element.
        shared_strings: List used to look up the value of ``t="s"`` cells by
            the integer index stored in their ``<v>`` element.

    Returns:
        list[dict]: one ``{"row": int, "cells": {ref: value}}`` per ``<row>``.
        ``value`` is the shared string for ``t="s"`` cells, the inline text
        for ``t="inlineStr"`` cells, the raw ``<v>`` text otherwise, and
        ``None`` when the cell carries no value.

    Raises:
        KeyError: if a ``<row>`` or ``<c>`` element lacks its ``r`` attribute.
        IndexError: if a shared-string index is outside ``shared_strings``.
    """
    rows = []

    for row in root.findall(".//x:row", NS):
        cells = {}
        for cell in row.findall("x:c", NS):
            ref = cell.attrib["r"]
            t = cell.attrib.get("t")

            # Inline strings keep their text in <is> (plain <t> or rich-text
            # runs) rather than in <v>, so they need their own branch;
            # otherwise such cells would come back as None.
            inline = cell.find("x:is", NS) if t == "inlineStr" else None

            if inline is not None:
                runs = inline.findall("x:t", NS) + inline.findall("x:r/x:t", NS)
                value = "".join(run.text or "" for run in runs)
            else:
                v = cell.find("x:v", NS)
                if v is None:
                    value = None
                elif t == "s":
                    value = shared_strings[int(v.text)]
                else:
                    value = v.text

            cells[ref] = value

        rows.append({
            "row": int(row.attrib["r"]),
            "cells": cells
        })

    return rows


In [7]:
def parse_merged_cells(root):
    """
    Return the ``ref`` attribute of every ``<mergeCell>`` in the worksheet.

    Args:
        root: Parsed worksheet root element.

    Returns:
        list[str]: merged ranges in document order; empty if there are none.
    """
    merged_ranges = []
    for merge_elem in root.findall(".//x:mergeCell", NS):
        merged_ranges.append(merge_elem.attrib["ref"])
    return merged_ranges


In [8]:
import re

def extract_sheet_index(sheet_path):
    """
    Pull the number out of a ``sheet<N>.xml`` file name.

    Args:
        sheet_path: Path-like string such as ``xl/worksheets/sheet5.xml``.

    Returns:
        int | None: ``N`` as an integer, or ``None`` when the path contains
        no ``sheet<digits>.xml`` fragment.
    """
    match = re.search(r"sheet(\d+)\.xml", sheet_path)
    if match is None:
        return None
    return int(match.group(1))


In [9]:
def parse_drawings_with_anchors(z, sheet_root, sheet_idx):
    """
    Extract anchored drawing objects (row span + text) for one worksheet.

    Args:
        z: Open ``zipfile.ZipFile`` of the workbook.
        sheet_root: Parsed worksheet root element.
        sheet_idx: Number ``N`` of the sheet part; the relationships part is
            looked up as ``xl/worksheets/_rels/sheet{N}.xml.rels``.

    Returns:
        list[dict]: one ``{"from_row": int, "to_row": int, "text": [str]}``
        per ``<xdr:twoCellAnchor>`` in the drawing. Empty when the sheet has
        no ``<drawing>`` child, the relationship cannot be resolved, or the
        drawing part is not in the archive.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import posixpath

    drawings = []

    drawing_elem = sheet_root.find("x:drawing", NS)
    if drawing_elem is None:
        return drawings

    members = set(z.namelist())

    rels_path = f"xl/worksheets/_rels/sheet{sheet_idx}.xml.rels"
    if rels_path not in members:
        return drawings

    # .get() instead of indexing: a <drawing> without r:id simply has no
    # resolvable target, which is handled like any other unresolved case.
    rid = drawing_elem.attrib.get(
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
    )
    if rid is None:
        return drawings

    rels_root = ET.fromstring(z.read(rels_path))

    target = None
    for r in rels_root:
        if r.attrib.get("Id") == rid:
            target = r.attrib.get("Target")
            break

    if not target:
        return drawings

    # Targets are relative to the sheet's folder ("../drawings/drawing1.xml")
    # but may also be absolute ("/xl/drawings/drawing1.xml") or contain several
    # ".." segments. join() lets an absolute target win, normpath() collapses
    # the dots, and zip member names never start with "/".
    drawing_path = posixpath.normpath(
        posixpath.join("xl/worksheets", target)
    ).lstrip("/")

    if drawing_path not in members:
        return drawings

    droot = ET.fromstring(z.read(drawing_path))

    for anchor in droot.findall(".//xdr:twoCellAnchor", DRAW_NS):
        fr = anchor.find("xdr:from", DRAW_NS)
        to = anchor.find("xdr:to", DRAW_NS)

        # All non-empty text runs found anywhere below this anchor.
        texts = [
            t.text for t in anchor.findall(".//a:t", DRAW_NS) if t.text
        ]

        drawings.append({
            "from_row": int(fr.find("xdr:row", DRAW_NS).text),
            "to_row": int(to.find("xdr:row", DRAW_NS).text),
            "text": texts
        })

    return drawings


In [10]:
# Prefix → namespace URI maps handed to ElementTree find()/findall().
# NS and EXCEL_NS hold the same two URIs under different prefixes
# ("x" vs "main"); both are kept because different helpers use each.
NS = dict(
    x="http://schemas.openxmlformats.org/spreadsheetml/2006/main",
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
)

DRAW_NS = dict(
    xdr="http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
    a="http://schemas.openxmlformats.org/drawingml/2006/main",
)

EXCEL_NS = dict(
    main="http://schemas.openxmlformats.org/spreadsheetml/2006/main",
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
)

# Folder that is globbed for workbooks.
DATA_DIR = Path("data")

# Single workbook / sheet part used by the one-off inspection cells.
xlsx = "data/GAC DPT_T9M_CDC_V0.2_20240729.xlsx"
sheet_path = "xl/worksheets/sheet5.xml"

In [16]:
sheet_jsons = []

# The loop uses its own variable names (`workbook_file`, `part_path`).
# Reusing `xlsx` / `sheet_path` would overwrite the module-level values of
# the same name that the single-sheet cells read afterwards.
for workbook_file in DATA_DIR.glob("*.xlsx"):
    if workbook_file.name.startswith("~$"):
        continue  # skip temp Excel files

    try:
        sheets = discover_excel_sheets(workbook_file)

        with zipfile.ZipFile(workbook_file) as z:
            shared_strings = load_shared_strings(z)
            members = set(z.namelist())

            for s in sheets:
                part_path = s["xml_path"]
                sheet_name = s["sheet_name"]
                sheet_idx = extract_sheet_index(part_path)

                # Errors are contained per sheet so one bad sheet does not
                # discard the rest of the workbook.
                try:
                    if part_path not in members:
                        continue

                    sheet_root = ET.fromstring(z.read(part_path))

                    # optional: skip sheets with no real data
                    # NOTE(review): this only tests for a <sheetData> element,
                    # not for rows inside it — confirm that is the intent.
                    if not sheet_root.findall(".//x:sheetData", NS):
                        continue

                    sheet_jsons.append({
                        "xlsx_name": workbook_file.name,
                        "sheet_name": sheet_name,
                        "sheet_path": part_path,
                        "dimension": parse_sheet_metadata(sheet_root),
                        "rows": parse_sheet_data(sheet_root, shared_strings),
                        "merged_cells": parse_merged_cells(sheet_root),
                        "drawings": parse_drawings_with_anchors(
                            z, sheet_root, sheet_idx
                        )
                    })

                except Exception as sheet_err:
                    print(
                        f"[SHEET ERROR] {workbook_file.name} | {sheet_name}\n"
                        f"  → {sheet_err}"
                    )

    except Exception as file_err:
        print(f"[FILE ERROR] {workbook_file.name} → {file_err}")


In [17]:
sheet_jsons

[{'xlsx_name': 'DFMC M18-3 FBL SRS Rev 2.5_F002_ET0_20241008.xlsx',
  'sheet_name': '1.Revision',
  'sheet_path': 'xl/worksheets/sheet1.xml',
  'dimension': {'dimension': 'A2:H23'},
  'rows': [{'row': 2,
    'cells': {'A2': 'Revision Management',
     'B2': None,
     'C2': None,
     'D2': None,
     'E2': None,
     'F2': None,
     'G2': None,
     'H2': None}},
   {'row': 3,
    'cells': {'A3': 'Revision\n版本',
     'B3': 'Date\n日期',
     'C3': 'Author\n作者',
     'D3': 'Changes Section\n修改章节',
     'E3': 'Changes Comments\n修改说明',
     'F3': None,
     'G3': None,
     'H3': None}},
   {'row': 4,
    'cells': {'A4': '1.0',
     'B4': '45359',
     'C4': 'JH',
     'D4': 'First version',
     'E4': 'Version 1.0 means used for F002 S/W release',
     'F4': None,
     'G4': None,
     'H4': None}},
   {'row': 5,
    'cells': {'A5': '2.0',
     'B5': '45463',
     'C5': 'JH',
     'D5': 'F002（ET0）',
     'E5': '1). Diag ID Change\n     > ECU Tx:0x7C6 → 0x7B8\n     > ECU Rx (Phy):0x7CE → 

In [35]:
# Parse the single sheet named by `xlsx` / `sheet_path` into one dict.
with zipfile.ZipFile(xlsx) as z:
    shared_strings = load_shared_strings(z)
    sheet_root = ET.fromstring(z.read(sheet_path))

    sheet_json = {
        "dimension": parse_sheet_metadata(sheet_root),
        "rows": parse_sheet_data(sheet_root, shared_strings),
        "merged_cells": parse_merged_cells(sheet_root),
        # Derive the index from sheet_path instead of hard-coding 5, so the
        # drawing lookup always targets the same sheet that was just parsed.
        "drawings": parse_drawings_with_anchors(
            z, sheet_root, sheet_idx=extract_sheet_index(sheet_path)
        )
    }


In [None]:
import zipfile
import xml.etree.ElementTree as ET
from pathlib import PurePosixPath

# Prefix → namespace URI maps handed to ElementTree find()/findall().
NS = dict(
    x="http://schemas.openxmlformats.org/spreadsheetml/2006/main",
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
)

DRAW_NS = dict(
    xdr="http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
    a="http://schemas.openxmlformats.org/drawingml/2006/main",
)

# Workbook and sheet part inspected by this cell.
xlsx = "data/GAC DPT_T9M_CDC_V0.2_20240729.xlsx"
sheet_path = "xl/worksheets/sheet5.xml"

import posixpath  # stdlib; used for the archive-path arithmetic below

# Print the text found in the drawing attached to `sheet_path`.
with zipfile.ZipFile(xlsx) as z:
    # --- read sheet ---
    sheet_root = ET.fromstring(z.read(sheet_path))

    # --- find drawing reference ---
    drawing_elem = sheet_root.find("x:drawing", NS)
    if drawing_elem is None:
        print("❌ No diagram on this sheet")
    else:
        drawing_rid = drawing_elem.attrib[
            "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
        ]
        print("✅ Diagram rId:", drawing_rid)

        # --- load sheet relationships ---
        # Derived from sheet_path so that changing the sheet above cannot
        # leave this pointing at another sheet's relationships.
        sheet_dir, sheet_file = posixpath.split(sheet_path)
        rels_path = posixpath.join(sheet_dir, "_rels", sheet_file + ".rels")

        drawing_target = None
        # A missing relationships part is reported like a missing
        # relationship instead of raising KeyError from z.read().
        if rels_path in z.namelist():
            rels_root = ET.fromstring(z.read(rels_path))
            for rel in rels_root:
                if rel.attrib.get("Id") == drawing_rid:
                    drawing_target = rel.attrib.get("Target")
                    break

        if not drawing_target:
            print("❌ Drawing relationship not found")
        else:
            # normalize path (../drawings/xxx.xml → xl/drawings/xxx.xml);
            # also copes with absolute targets and repeated ".." segments
            drawing_path = posixpath.normpath(
                posixpath.join(sheet_dir, drawing_target)
            ).lstrip("/")

            print("✅ Drawing XML:", drawing_path)

            drawing_root = ET.fromstring(z.read(drawing_path))

            # --- extract diagram text ---
            texts = [
                t.text for t in drawing_root.findall(".//a:t", DRAW_NS)
                if t.text
            ]

            print("🧩 Diagram texts:")
            for t in texts:
                print(" -", t)
