In [None]:
from docx import Document
from docx.text.pagebreak import RenderedPageBreak

In [None]:
def extract_document_structure_with_details(docx_path):
    """Return one record per paragraph of a Word document, in document order.

    Note: a later cell of this notebook defines a function with the same name
    (it adds heading context and paragraph chunking); once that cell has run,
    it replaces this simpler version.

    Args:
        docx_path: Location of the .docx file; passed unchanged to ``Document``.

    Returns:
        A list with one dict per entry of ``doc.paragraphs`` (empty paragraphs
        included). Each dict has the keys:

        * ``index`` - position of the paragraph in ``doc.paragraphs``.
        * ``type`` - ``"title"`` when the style name contains ``"Heading"``,
          otherwise ``"paragraph"``.
        * ``text`` - paragraph text with surrounding whitespace stripped.
        * ``character_index`` - offset of the record in the virtual text made
          by joining all stripped paragraph texts with one newline each.
        * ``contains_page_break`` - ``True`` when the paragraph XML holds an
          explicit page break or a page break recorded by Word's last render.
    """
    # Matches hard page breaks (<w:br w:type="page"/>) and the
    # <w:lastRenderedPageBreak/> markers Word stores where a page ended.
    # python-docx never renders a page break as a "\x0c" character in
    # ``run.text``, so searching the run text for a form feed is always False.
    page_break_xpath = './/w:br[@w:type="page"] | .//w:lastRenderedPageBreak'

    document_structure = []
    char_index = 0  # Running character index for the document

    doc = Document(docx_path)
    for i, paragraph in enumerate(doc.paragraphs):
        # Tolerate a missing style or an unnamed style instead of raising.
        style_name = getattr(paragraph.style, "name", None) or ""
        text = paragraph.text.strip()
        document_structure.append({
            "index": i,
            "type": "title" if "Heading" in style_name else "paragraph",
            "text": text,
            "character_index": char_index,
            "contains_page_break": bool(paragraph._element.xpath(page_break_xpath)),
        })
        char_index += len(text) + 1  # Account for the paragraph text length and a newline

    return document_structure


In [None]:
def breakdown_paragraph(text, max_length=600, overlap=50):
    """Split a long text into consecutive, non-overlapping chunks.

    While the remaining text is longer than ``max_length``, its first
    ``max_length - overlap`` characters are cut off as a chunk; what is left
    (at most ``max_length`` characters) becomes the last chunk. Cuts are made
    at fixed character offsets, so they may fall inside a word. Chunks cut by
    the loop are stripped of surrounding whitespace; a text that is never cut
    is returned unchanged as the only chunk.

    Despite its name, ``overlap`` does not make neighbouring chunks share any
    text: it only shortens every cut chunk to ``max_length - overlap``.

    Args:
        text: The text to split.
        max_length: Longest text that is kept in one piece; must be positive.
        overlap: Number of characters by which cut chunks are shorter than
            ``max_length``; must satisfy ``0 <= overlap < max_length``.

    Returns:
        A list of chunks in reading order. It always holds at least one item
        (``[text]`` when ``text`` is not longer than ``max_length``) and never
        holds an empty chunk produced by a whitespace-only cut.

    Raises:
        ValueError: If ``max_length`` is not positive or ``overlap`` is outside
            ``0 <= overlap < max_length``. Such values would otherwise loop
            forever or produce chunks longer than ``max_length``.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    split_point = max_length - overlap  # chunk size; constant across iterations
    chunks = []
    while len(text) > max_length:
        head = text[:split_point].strip()
        if head:  # a cut made only of whitespace carries no content
            chunks.append(head)
        text = text[split_point:].strip()

    # Keep the remainder unless it is an empty leftover after real chunks;
    # a text that was never cut is always returned as the single chunk.
    if text or not chunks:
        chunks.append(text)
    return chunks

In [None]:
def _paragraph_contains_page_break(paragraph):
    """Return True if the paragraph XML holds an explicit or rendered page break."""
    # python-docx never renders a page break as a "\x0c" character in
    # ``run.text``, so the XML is inspected directly: hard breaks are
    # <w:br w:type="page"/>, rendered ones are <w:lastRenderedPageBreak/>.
    return bool(
        paragraph._element.xpath('.//w:br[@w:type="page"] | .//w:lastRenderedPageBreak')
    )


def extract_document_structure_with_details(docx_path):
    """Extract titles and paragraph chunks of a Word document with heading context.

    Paragraphs whose stripped text is empty are skipped. A paragraph whose
    style name contains ``"Heading"`` yields one ``"title"`` record; any other
    paragraph is passed to ``breakdown_paragraph`` and yields one
    ``"paragraph"`` record per returned chunk.

    Heading context: when the last word of a heading style name is an integer
    from 1 to 3, the heading text is stored for that level and the stored
    texts of all deeper levels are reset to ``None``. Headings with another
    level, or with no numeric level, leave the context untouched.

    Args:
        docx_path: Location of the .docx file; passed unchanged to ``Document``.

    Returns:
        A list of dicts in document order, each with the keys:

        * ``index`` - position of the source paragraph in ``doc.paragraphs``
          (shared by all chunks of the same paragraph).
        * ``type`` - ``"title"`` or ``"paragraph"``.
        * ``text`` - the heading text or the chunk text.
        * ``character_index`` - running offset that grows by
          ``len(text) + 1`` for every emitted record.
        * ``contains_page_break`` - ``True`` when the source paragraph holds
          an explicit page break or one recorded by Word's last render.
        * ``heading_level_1_context`` / ``heading_level_2_context`` /
          ``heading_level_3_context`` - current heading text per level, or
          ``None``. A title record already reflects its own heading.
    """
    document_structure = []
    char_index = 0  # Running character index for the document
    heading_context = {1: None, 2: None, 3: None}  # Track context for heading levels

    doc = Document(docx_path)
    for i, paragraph in enumerate(doc.paragraphs):
        text = paragraph.text.strip()
        if not text:  # Only process non-empty paragraphs
            continue

        # Tolerate a missing style or an unnamed style instead of raising.
        style_name = getattr(paragraph.style, "name", None) or ""
        # Computed once per paragraph and shared by all of its chunks.
        contains_page_break = _paragraph_contains_page_break(paragraph)

        if "Heading" in style_name:  # Detect titles based on heading styles
            try:
                heading_level = int(style_name.split()[-1])  # Extract numeric level
            except ValueError:
                heading_level = None  # e.g. a style called just "Heading"
            if heading_level in heading_context:
                heading_context[heading_level] = text
                # Clear lower-level contexts
                for lvl in range(heading_level + 1, 4):
                    heading_context[lvl] = None
            entry_type, pieces = "title", [text]
        else:
            # Break down long paragraphs
            entry_type, pieces = "paragraph", breakdown_paragraph(text)

        for piece in pieces:
            document_structure.append({
                "index": i,
                "type": entry_type,
                "text": piece,
                "character_index": char_index,
                "contains_page_break": contains_page_break,
                "heading_level_1_context": heading_context[1],
                "heading_level_2_context": heading_context[2],
                "heading_level_3_context": heading_context[3],
            })
            char_index += len(piece) + 1  # Update character index

    return document_structure

In [None]:

# Demo: parse the sample document and dump every extracted record to stdout.
docx_file_path = "C:/Users/bossa/Downloads/Copie de DF_C.docx"  # Replace with your Word document file path
structure = extract_document_structure_with_details(docx_file_path)

for element in structure:
    # One print call per record; sep="\n" puts every field on its own line
    # and the "---" line closes the record.
    print(
        f"Type: {element['type'].capitalize()}, Index: {element['index']}",
        f"Text: {element['text']}",
        f"Character Index: {element['character_index']}",
        f"Contains Page Break: {element['contains_page_break']}",
        f"Heading Level 1 Context: {element['heading_level_1_context']}",
        f"Heading Level 2 Context: {element['heading_level_2_context']}",
        f"Heading Level 3 Context: {element['heading_level_3_context']}",
        "---",
        sep="\n",
    )


In [None]:
# Display the extracted records at positions 12 to 34 of `structure`
# (requires the extraction cell above to have been run).
structure[12:35]

In [None]:
from openpyxl import Workbook

def save_structure_to_excel(structure, excel_path):
    """Write extracted document records to a single-sheet Excel workbook.

    The sheet is titled "Document Structure" and starts with a header row,
    followed by one row per record in the order given.

    Args:
        structure: Iterable of record dicts. Every record must provide
            ``index``, ``type``, ``text`` and ``character_index``. The three
            ``heading_level_N_context`` keys are optional; a missing one is
            written as an empty string.
        excel_path: Destination passed unchanged to ``Workbook.save``.

    Raises:
        KeyError: If a record lacks one of the four required keys.
    """
    # All heading levels are read the same lenient way, so records without
    # heading context (such as those from the simpler extractor defined
    # earlier in this notebook) can be exported as well.
    heading_keys = (
        "heading_level_1_context",
        "heading_level_2_context",
        "heading_level_3_context",
    )

    wb = Workbook()
    ws = wb.active
    ws.title = "Document Structure"

    # Header row
    ws.append(["Index", "Type", "Text", "Character Index", "Title Context","Title lvl2","Title lvl3"])

    # Write structure data
    for element in structure:
        ws.append([
            element["index"],
            element["type"],
            element["text"],
            element["character_index"],
            *(element.get(key, "") for key in heading_keys),
        ])

    # Save to file
    wb.save(excel_path)



In [None]:
# Example usage
# Exports the `structure` list built by the extraction cell; that cell and the
# one defining `save_structure_to_excel` must have been run first.
excel_file_path = "DF_C_document_structure.xlsx"  # Replace with your desired file name
save_structure_to_excel(structure, excel_file_path)
print(f"Structure saved to {excel_file_path}")