In [1]:
import os
import json
import uuid
from docx import Document

In [2]:
class DocElement:
    """One extracted piece of a document: a text paragraph or an image.

    The constructor stores ``type``, ``content`` and ``level`` unchanged.
    ``parent`` starts as ``None`` and ``sections`` as a new empty list.
    """

    def __init__(self, type: str, content: str = None, level: int = None):
        # `type` is expected to be 'text' or 'image'.
        self.type, self.content, self.level = type, content, level
        # Each instance gets its own list so elements never share children.
        self.sections = []
        self.parent = None

class Doc:
    """Top-level container for one parsed document.

    ``document`` holds the name passed to the constructor and ``sections``
    starts as a new empty list.
    """

    def __init__(self, name: str):
        self.sections = []
        self.document = name

class SerializableDocElement(json.JSONEncoder):
    """JSON encoder that can serialize ``Doc`` and ``DocElement`` objects.

    Pass it as ``cls=SerializableDocElement`` to ``json.dump``/``json.dumps``.
    """

    def default(self, obj):
        """Return a JSON-serializable dict for ``obj``.

        * ``DocElement`` of type ``'text'``  -> ``{"type", "level", "text"}``
        * ``DocElement`` of type ``'image'`` -> ``{"type", "image"}``
          (no ``"level"`` key is emitted for images)
        * ``Doc`` -> ``{"name", "sections"}``

        A ``"sections"`` key is added to an element only when it has nested
        sections. Any other object is delegated to the base class, which
        raises ``TypeError``.

        Raises:
            ValueError: if a ``DocElement`` has a type other than ``'text'``
                or ``'image'``.
        """
        if isinstance(obj, DocElement):
            if obj.type == 'text':
                serialized = {
                    "type": "text",
                    "level": obj.level,
                    "text": obj.content
                }
            elif obj.type == 'image':
                serialized = {
                    "type": "image",
                    "image": obj.content
                }
            else:
                # Fail with a clear message instead of hitting an unbound
                # local variable further down.
                raise ValueError(f"Unsupported DocElement type: {obj.type!r}")

            if obj.sections:
                serialized["sections"] = [self.default(section) for section in obj.sections]

            return serialized

        if isinstance(obj, Doc):
            return {
                "name": obj.document,
                "sections": [self.default(section) for section in obj.sections]
            }

        return super().default(obj)

In [3]:
def list_paragraphs(file_path):
    """Extract text and inline images from a .docx file, grouped by heading.

    The document's paragraphs are walked in order. Every non-empty paragraph
    becomes a text element; every embedded picture found in a paragraph's
    runs is written to an ``extracted_images`` directory next to the input
    file and becomes an image element. The flat element list is then passed
    through ``add_image_levels``, ``group_by_level`` and ``process_content``.

    Levels are sequential section numbers, not heading depths: each paragraph
    styled ``Heading 1``..``Heading 4`` starts a new level (1, 2, 3, ... in
    order of appearance) and the following non-heading text inherits it.
    Text that appears before the first heading gets level ``None``.

    Progress and intermediate results are printed to stdout.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The list produced by ``process_content``: one dict per section with
        the keys ``text_before_image``, ``images`` and ``text_after_image``.
    """
    print(f"Processing file: {file_path}")

    result = Doc(os.path.basename(file_path))

    # Paragraph styles that count as headings. Only membership is checked
    # below; the mapped numbers are not used to compute the level.
    style_to_level = {
        'Heading 1': 1,
        'Heading 2': 2,
        'Heading 3': 3,
        'Heading 4': 4
    }

    # Lookup tables for locating and naming embedded pictures, built once
    # rather than for every run / picture.
    blip_namespaces = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_attribute = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'
    extension_by_content_type = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
        'image/bmp': '.bmp',
        'image/tiff': '.tiff',
        'image/svg+xml': '.svg'
    }

    document = Document(file_path)

    # Create directory for extracted images (only after the document opened).
    extracted_images_dir = os.path.join(os.path.dirname(file_path), 'extracted_images')
    os.makedirs(extracted_images_dir, exist_ok=True)

    current_level = None  # Level of the most recent heading, None before the first one
    heading_counter = 0   # Number of headings seen so far

    for paragraph in document.paragraphs:
        style_name = paragraph.style.name
        text = paragraph.text.replace('\u00a0', ' ')

        if text.strip():  # Only process non-empty paragraphs
            if style_name in style_to_level:
                # A heading opens a new section with the next sequential level.
                print(style_name)
                heading_counter += 1
                current_level = heading_counter

            # Non-heading text keeps the level of the last heading.
            result.sections.append(DocElement(type='text', content=text, level=current_level))

        # Check for inline images within the paragraph's runs
        for run in paragraph.runs:
            for blip in run._element.findall('.//a:blip', namespaces=blip_namespaces):
                rId = blip.get(embed_attribute)
                if not rId:
                    continue

                rel = document.part.rels[rId]
                # Unknown content types fall back to '.png'.
                ext = extension_by_content_type.get(rel.target_part.content_type, '.png')

                # The element count grows with every appended element, so the
                # number is unique within one call.
                unique_filename = f"image{len(result.sections) + 1}{ext}"
                image_save_path = os.path.join(extracted_images_dir, unique_filename)

                with open(image_save_path, 'wb') as f:
                    f.write(rel.target_part.blob)

                # Add the image element right after the paragraph's text element
                result.sections.append(DocElement(type='image', content=image_save_path))

    # Round-trip through JSON to turn the object tree into plain dicts/lists.
    data = json.loads(json.dumps(result, cls=SerializableDocElement, ensure_ascii=False))
    print("\n\n..original data..\n",data)
    updated_data = add_image_levels(data['sections'])
    print("level data = \n", updated_data)
    output_data = group_by_level(updated_data)
    print("\n---group_by_level----\n",output_data,"\n------------\n")
    processed_output = process_content(output_data)

    # Print the result
    print("\n00000000000000000\n",json.dumps(processed_output, indent=2))

    return processed_output



def add_image_levels(data):
    """Give each image element the level of the text element preceding it.

    The list is modified in place and also returned. Text elements are left
    untouched. An image that appears before any text element with a
    non-``None`` level keeps its existing ``"level"`` if it has one and
    otherwise gets ``"level": None``, so every image ends up with the key and
    later code can read ``element["level"]`` safely.

    Args:
        data: List of element dicts, each with a ``"type"`` key of ``"text"``
            or ``"image"``; text elements may carry a ``"level"``.

    Returns:
        The same list object, with levels filled in on image elements.
    """
    last_text_level = None  # Level of the most recent text element

    for element in data:
        kind = element["type"]
        if kind == "text":
            last_text_level = element.get("level")
        elif kind == "image":
            if last_text_level is not None:
                element["level"] = last_text_level
            else:
                # Nothing to inherit: keep any existing level, but make sure
                # the key exists.
                element.setdefault("level", None)

    return data



def group_by_level(data):
    """Group consecutive elements that share the same level.

    A new group starts whenever the level differs from the previous
    element's, so the same level can appear in several groups if its
    elements are not adjacent. Elements without a ``"level"`` key are treated
    as level ``None`` instead of raising ``KeyError``.

    Args:
        data: List of element dicts with a ``"type"`` key and either a
            ``"text"`` or an ``"image"`` key.

    Returns:
        A list of ``{"level": ..., "content": [...]}`` dicts in input order.
        Each content entry holds the element's ``"type"`` plus its ``"text"``
        (if present) or ``"image"`` value.
    """
    grouped_data = []
    current_group = None

    for item in data:
        level = item.get("level")
        if current_group is None or current_group["level"] != level:
            # Register the group as soon as it is opened; later appends to
            # its content list are visible through the same dict.
            current_group = {"level": level, "content": []}
            grouped_data.append(current_group)

        entry = {"type": item["type"]}
        if "text" in item:
            entry["text"] = item["text"]
        else:
            entry["image"] = item["image"]
        current_group["content"].append(entry)

    return grouped_data


def process_content(data):
    """Flatten each level group into text-before / images / text-after.

    For every group, the text entries seen before the group's first image are
    concatenated into ``text_before_image``, all image paths are collected in
    ``images``, and the text entries seen after any image are concatenated
    into ``text_after_image``. Each text is followed by a single space.
    Groups that end up with no images and only whitespace text are dropped.

    Args:
        data: List of group dicts, each with a ``"content"`` list of entries
            carrying ``"type"`` plus ``"text"`` or ``"image"``.

    Returns:
        A list with one section dict per kept group, in input order.
    """
    sections = []

    for group in data:
        leading_text = []
        trailing_text = []
        image_paths = []

        for entry in group["content"]:
            kind = entry["type"]
            if kind == "text":
                # Once the group has an image, all further text counts as
                # coming after it.
                bucket = trailing_text if image_paths else leading_text
                bucket.append(entry["text"] + " ")
            elif kind == "image":
                image_paths.append(entry["image"])

        section = {
            "text_before_image": "".join(leading_text),
            "images": image_paths,
            "text_after_image": "".join(trailing_text),
        }

        has_text = section["text_before_image"].strip() or section["text_after_image"].strip()
        if has_text or image_paths:
            sections.append(section)

    return sections

# def export_to_json(doc_obj, output_path):
#     with open(output_path, 'w', encoding='utf-8') as f:
#         json.dump(doc_obj, f, cls=SerializableDocElement, indent=4, ensure_ascii=False)

# Example usage
def main(file_path):
    """Run ``list_paragraphs`` on ``file_path`` and return its result."""
    return list_paragraphs(file_path)
# Machine-specific sample document; replace with the path of a .docx file that exists locally.
file_path = '/home/anushkas/Downloads/US/ProdDoc1/Line Item Filter (3).docx'
# Run the extraction on that file. This executes whenever the cell/module is run.
main(file_path)

Processing file: /home/anushkas/Downloads/US/ProdDoc1/Line Item Filter (3).docx
Heading 2
Heading 1
Heading 3
Heading 3


..original data..
 {'name': 'Line Item Filter (3).docx', 'sections': [{'type': 'text', 'level': 1, 'text': 'Line Item Filter'}, {'type': 'text', 'level': 1, 'text': 'The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. '}, {'type': 'image', 'image': '/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png'}, {'type': 'text', 'level': 1, 'text': 'This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side.'}, {'type': 'text', 'level': 2, 'text': 'Line Item Structure Container'}, {'type': 'text', 'level': 2, 'text': 'The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the “Line

[{'text_before_image': 'Line Item Filter The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”.  ',
  'images': ['/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png'],
  'text_after_image': 'This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side. '},
 {'text_before_image': 'Line Item Structure Container The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the “Line Items” container. As an example a structure with Location and Cost Group is shown. ',
  'images': ['/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image7.png',
   '/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image8.png'],
  'text_after_image': ''},
 {'text_before_image': 'Line Item Filter2222222222 The estimate

In [4]:
# Sample input for group_by_level: a flat, ordered list of element dicts.
# Every element has a "type" ("text" or "image"), a "level", and either a
# "text" or an "image" value.
# NOTE(review): looks hand-copied from a list_paragraphs run; the image paths
# are machine-specific — confirm before reusing.
data =[
    {
        "type": "text",
        "level": 1,
        "text": "Line Item Filter"
    },
    {
        "type": "text",
        "level": 1,
        "text": "The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. "
    },
    {
        "type": "image",
        "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png",
        "level": 1
    },
    {
        "type": "text",
        "level": 1,
        "text": "This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side."
    },
    {
        "type": "text",
        "level": 2,
        "text": "Line Item Structure Container"
    },
    {
        "type": "text",
        "level": 2,
        "text": "The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the “Line Items” container. As an example a structure with Location and Cost Group is shown."
    },
    {
        "type": "text",
        "level": 2,
        "text": "Line Item Structure Container"
    },
    {
        "type": "image",
        "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image8.png",
        "level": 2
    },
    {
        "type": "text",
        "level": 3,
        "text": "Line Item Filter2222222222"
    },
    {
        "type": "text",
        "level": 3,
        "text": "The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. The filter disables/ enables the visibility of Leading Structure Items which are not or are assigned to Line Items. Every Leading Structure Container, such as BoQ, Activities, … provides the Line Item Filter  in its tool bar."
    },
    {
        "type": "image",
        "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image11.png",
        "level": 3
    },
    {
        "type": "text",
        "level": 4,
        "text": "Line Item Filter3333333333333"
    },
    {
        "type": "text",
        "level": 4,
        "text": "The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant "
    }
]

In [5]:
def group_by_level(data):
    """Group consecutive elements that share the same level.

    A new group starts whenever the level differs from the previous
    element's, so the same level can appear in several groups if its
    elements are not adjacent. Elements without a ``"level"`` key are treated
    as level ``None`` instead of raising ``KeyError``.

    Args:
        data: List of element dicts with a ``"type"`` key and either a
            ``"text"`` or an ``"image"`` key.

    Returns:
        A list of ``{"level": ..., "content": [...]}`` dicts in input order.
        Each content entry holds the element's ``"type"`` plus its ``"text"``
        (if present) or ``"image"`` value.
    """
    grouped_data = []
    current_group = None

    for item in data:
        level = item.get("level")
        if current_group is None or current_group["level"] != level:
            # Register the group as soon as it is opened; later appends to
            # its content list are visible through the same dict.
            current_group = {"level": level, "content": []}
            grouped_data.append(current_group)

        entry = {"type": item["type"]}
        if "text" in item:
            entry["text"] = item["text"]
        else:
            entry["image"] = item["image"]
        current_group["content"].append(entry)

    return grouped_data

In [6]:
# Group the flat sample list by level and rebind `data` to the grouped result.
data = group_by_level(data)
# Bare expression: in a notebook this displays the grouped structure.
data

[{'level': 1,
  'content': [{'type': 'text', 'text': 'Line Item Filter'},
   {'type': 'text',
    'text': 'The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. '},
   {'type': 'image',
    'image': '/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png'},
   {'type': 'text',
    'text': 'This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side.'}]},
 {'level': 2,
  'content': [{'type': 'text', 'text': 'Line Item Structure Container'},
   {'type': 'text',
    'text': 'The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the “Line Items” container. As an example a structure with Location and Cost Group is shown.'},
   {'type': 'text', 'text': 'Line Item Structure Container'},
   {'type': 'imag

In [7]:
# Sample input for process_content: one dict per level, each holding an
# ordered "content" list of text and image entries.
# This assignment rebinds `data`, replacing whatever it held before.
data = [
    {
        "level": 1,
        "content": [
            {
                "type": "text",
                "text": "Line Item Filter"
            },
            {
                "type": "text",
                "text": "The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. "
            },
            {
                "type": "image",
                "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png"
            },
            {
                "type": "text",
                "text": "This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side."
            }
        ]
    },
    {
        "level": 2,
        "content": [
            {
                "type": "text",
                "text": "Line Item Structure Container"
            },
            {
                "type": "text",
                "text": "The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the “Line Items” container. As an example a structure with Location and Cost Group is shown."
            },
            {
                "type": "image",
                "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image7.png"
            },
            {
                "type": "image",
                "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image8.png"
            }
        ]
    },
    {
        "level": 3,
        "content": [
            {
                "type": "text",
                "text": "Line Item Filter2222222222"
            },
            {
                "type": "text",
                "text": "The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. The filter disables/ enables the visibility of Leading Structure Items which are not or are assigned to Line Items. Every Leading Structure Container, such as BoQ, Activities, … provides the Line Item Filter  in its tool bar."
            },
            {
                "type": "image",
                "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image11.png"
            }
        ]
    },
    {
        "level": 4,
        "content": [
            {
                "type": "text",
                "text": "Line Item Filter3333333333333"
            },
            {
                "type": "text",
                "text": "The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant "
            }
        ]
    }
]

In [8]:
def process_content(data):
    """Flatten each level group into text-before / images / text-after.

    For every group, the text entries seen before the group's first image are
    concatenated into ``text_before_image``, all image paths are collected in
    ``images``, and the text entries seen after any image are concatenated
    into ``text_after_image``. Each text is followed by a single space.
    Groups that end up with no images and only whitespace text are dropped.

    Args:
        data: List of group dicts, each with a ``"content"`` list of entries
            carrying ``"type"`` plus ``"text"`` or ``"image"``.

    Returns:
        A list with one section dict per kept group, in input order.
    """
    sections = []

    for group in data:
        leading_text = []
        trailing_text = []
        image_paths = []

        for entry in group["content"]:
            kind = entry["type"]
            if kind == "text":
                # Once the group has an image, all further text counts as
                # coming after it.
                bucket = trailing_text if image_paths else leading_text
                bucket.append(entry["text"] + " ")
            elif kind == "image":
                image_paths.append(entry["image"])

        section = {
            "text_before_image": "".join(leading_text),
            "images": image_paths,
            "text_after_image": "".join(trailing_text),
        }

        has_text = section["text_before_image"].strip() or section["text_after_image"].strip()
        if has_text or image_paths:
            sections.append(section)

    return sections


# Turn each level group in `data` into a section with text before the images,
# the image paths, and text after the images.
processed_output = process_content(data)

# Pretty-print the sections as JSON with a two-space indent. json.dumps
# escapes non-ASCII characters by default (e.g. curly quotes become \u201c).
import json
print(json.dumps(processed_output, indent=2))

[
  {
    "text_before_image": "Line Item Filter The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called \u201cLine Item Filter\u201d.  ",
    "images": [
      "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png"
    ],
    "text_after_image": "This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side. "
  },
  {
    "text_before_image": "Line Item Structure Container The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the \u201cLine Items\u201d container. As an example a structure with Location and Cost Group is shown. ",
    "images": [
      "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image7.png",
      "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image8.png"
    ],
    "text_after_imag