In [1]:
import os
from docx.shared import Inches, Pt
import json
from docx import Document

class Doc:
    """Root container for a parsed document.

    Attributes:
        document: The name given at construction time.
        sections: Top-level child sections, initially empty.
    """

    def __init__(self, name: str):
        # Each instance gets its own fresh list of children.
        self.sections: "list" = []
        self.document: str = name

class Section:
    """A single node in the document outline.

    Attributes:
        level: Integer depth value supplied by the caller.
        text: Text content of the node.
        parent: Owning section, or ``None`` until one is assigned.
        sections: Child sections nested under this node, initially empty.
    """

    def __init__(self, level: int, text: str):
        self.level: int = level
        self.text: str = text
        # Links are filled in by whoever builds the tree.
        self.parent: "Section | None" = None
        self.sections: "list" = []

class SerializableSection(json.JSONEncoder):
    """JSON encoder that knows how to serialise ``Doc`` and ``Section`` trees.

    Pass it as ``cls=`` to ``json.dumps``. A ``Section`` becomes
    ``{"level", "text", "sections"}`` and a ``Doc`` becomes
    ``{"name", "sections"}``; children are converted recursively.
    """

    def default(self, obj):
        """Return a JSON-compatible dict for ``obj``.

        Any object that is neither a ``Section`` nor a ``Doc`` is handed to
        the base encoder's ``default``.
        """
        if isinstance(obj, Section):
            payload = {"level": obj.level, "text": obj.text}
        elif isinstance(obj, Doc):
            payload = {"name": obj.document}
        else:
            return super().default(obj)

        # Both node kinds carry their children under the same key.
        payload["sections"] = list(map(self.default, obj.sections))
        return payload

In [2]:
def list_paragraphs(file_path):
    """Parse a .docx file into a tree of sections keyed on paragraph style.

    Every distinct paragraph style name is assigned a depth level in order of
    first appearance: the first style seen is level 0, the next new style is
    level 1, and so on. Each non-empty paragraph becomes a ``Section`` that is
    attached beneath the nearest preceding section whose level is strictly
    smaller, or to the document root when no such section exists.

    Args:
        file_path: Path of the .docx file; it is passed to ``Document`` and
            its base name becomes the name of the returned ``Doc``.

    Returns:
        A ``Doc`` whose ``sections`` list holds the top-level sections.
    """
    # os.path.basename understands the host OS's path separators.
    result = Doc(os.path.basename(file_path))
    style_to_level = {}
    current_section = None

    for paragraph in Document(file_path).paragraphs:
        # Treat non-breaking spaces as ordinary spaces.
        text = paragraph.text.replace('\u00a0', ' ')

        # Skip empty paragraphs
        if text == "":
            continue

        # A new style gets the next unused level; len() of the mapping is
        # always unused because levels are handed out as 0, 1, 2, ...
        level = style_to_level.setdefault(paragraph.style.name, len(style_to_level))
        section = Section(level, text)

        # Climb from the previous section to the closest ancestor that is
        # strictly shallower than the new section. This covers the deeper,
        # equal and shallower cases uniformly, and stops at the root
        # (parent is None) instead of dereferencing a missing parent.
        parent = current_section
        while parent is not None and parent.level >= level:
            parent = parent.parent

        section.parent = parent
        if parent is None:
            result.sections.append(section)
        else:
            parent.sections.append(section)

        current_section = section

    return result

In [3]:
# Build the section tree for the sample document.
doc = list_paragraphs("sample.docx")

# Serialise the tree with the custom encoder, pretty-printed, and show it.
json_output = json.dumps(
    doc,
    indent=4,
    cls=SerializableSection,
)
print(json_output)


{
    "name": "sample.docx",
    "sections": [
        {
            "level": 0,
            "text": "Sample Document",
            "sections": [
                {
                    "level": 1,
                    "text": "This document was created using accessibility techniques for headings, lists, image alternate text, tables, and columns. It should be completely accessible using assistive technologies such as screen readers.",
                    "sections": [
                        {
                            "level": 2,
                            "text": "Headings",
                            "sections": []
                        }
                    ]
                },
                {
                    "level": 1,
                    "text": "There are eight section headings in this document. At the beginning, \"Sample Document\" is a level 1 heading. The main section headings, such as \"Headings\" and \"Lists\" are level 2 headings. The Tables section contains two 