In [1]:
import os
import json
import uuid
from docx import Document

In [2]:
class DocElement:
    """One extracted piece of a document: a run of text or an image reference.

    Attributes:
        type: Element kind; the rest of the notebook uses 'text' or 'image'.
        content: Payload of the element (defaults to None).
        level: Level number attached to the element (defaults to None).
        parent: Always initialised to None here.
        sections: Child elements; starts as a fresh empty list per instance.
    """

    def __init__(self, type: str, content: str = None, level: int = None):
        # Caller-supplied fields, stored as given.
        self.type, self.content, self.level = type, content, level
        # Tree links start out empty; nothing in the constructor populates them.
        self.parent = None
        self.sections = []

class Doc:
    """Top-level container for one document.

    Attributes:
        document: The name passed to the constructor.
        sections: Ordered list of elements; starts empty.
    """

    def __init__(self, name: str):
        self.document, self.sections = name, []

class SerializableDocElement(json.JSONEncoder):
    """JSON encoder that converts Doc and DocElement objects into plain dicts."""

    def default(self, obj):
        """Return a JSON-serializable representation of ``obj``.

        Args:
            obj: The object the encoder could not serialize natively.

        Returns:
            dict: For a DocElement of type 'text', a dict with "type", "level"
            and "text"; for type 'image', a dict with "type" and "image". A
            "sections" list is added only when the element has children. For a
            Doc, a dict with "name" and "sections".

        Raises:
            TypeError: If ``obj`` is a DocElement whose type is neither 'text'
                nor 'image', or (via the base class) any other unsupported
                object.
        """
        if isinstance(obj, DocElement):
            if obj.type == 'text':
                serialized = {
                    "type": "text",
                    "level": obj.level,
                    "text": obj.content
                }
            elif obj.type == 'image':
                serialized = {
                    "type": "image",
                    "image": obj.content
                }
            else:
                # Previously this fell through with `serialized` unbound and
                # failed with an opaque UnboundLocalError.
                raise TypeError(f"Unsupported DocElement type: {obj.type!r}")

            # Children are serialized recursively; the key is omitted when empty.
            if obj.sections:
                serialized["sections"] = [self.default(child) for child in obj.sections]

            return serialized

        if isinstance(obj, Doc):
            return {
                "name": obj.document,
                "sections": [self.default(child) for child in obj.sections]
            }

        # Let the base class raise its standard TypeError for anything else.
        return super().default(obj)

In [14]:
def list_paragraphs(file_path):
    """Extract text and embedded images from a .docx file, grouped per heading.

    Non-empty paragraphs become text elements. Every paragraph styled
    'Heading 1' to 'Heading 4' bumps a running counter, and that counter (not
    the heading depth) is used as the level of the heading and of the text
    that follows it. Embedded images are written to an 'extracted_images'
    folder next to the input file and recorded in document order.

    Args:
        file_path: Path to the .docx file.

    Returns:
        list: The result of process_content() applied to the grouped elements.
    """
    print(f"Processing file: {file_path}")

    result = Doc(os.path.basename(file_path))

    # Only membership is checked; the numeric heading depth is not used.
    heading_styles = ('Heading 1', 'Heading 2', 'Heading 3', 'Heading 4')
    extension_by_mime = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
        'image/bmp': '.bmp',
        'image/tiff': '.tiff',
        'image/svg+xml': '.svg',
    }
    drawing_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    document = Document(file_path)

    # Images are saved alongside the source document.
    images_dir = os.path.join(os.path.dirname(file_path), 'extracted_images')
    os.makedirs(images_dir, exist_ok=True)

    section_level = None  # level of the most recent heading, None before the first
    headings_seen = 0

    for para in document.paragraphs:
        style = para.style.name
        body = para.text.replace('\u00a0', ' ')

        if body.strip():  # skip paragraphs that are empty or whitespace-only
            if style in heading_styles:
                headings_seen += 1
                section_level = headings_seen
            result.sections.append(
                DocElement(type='text', content=body, level=section_level)
            )

        # Embedded pictures appear as <a:blip r:embed="..."> inside the runs.
        for run in para.runs:
            for blip in run._element.findall('.//a:blip', namespaces=drawing_ns):
                rel_id = blip.get(embed_attr)
                if not rel_id:
                    continue
                image_part = document.part.rels[rel_id].target_part
                suffix = extension_by_mime.get(image_part.content_type, '.png')

                # The file number is derived from the element count so far.
                target = os.path.join(
                    images_dir, f"image{len(result.sections) + 1}{suffix}"
                )
                with open(target, 'wb') as handle:
                    handle.write(image_part.blob)

                result.sections.append(DocElement(type='image', content=target))

    # Round-trip through JSON to turn the object tree into plain dicts.
    data = json.loads(json.dumps(result, cls=SerializableDocElement, indent=4, ensure_ascii=False))
    updated_data = add_image_levels(data['sections'])
    print("level data = \n", updated_data)
    output_data = group_by_level(updated_data)
    print("\n---group_by_level----\n",output_data,"\n------------\n")
    processed_output = process_content(output_data)

    print("\n00000000000000000\n",json.dumps(processed_output, indent=2),"\n00000000000000000\n")

    return processed_output



def add_image_levels(data):
    """Give every image element the level of the text element preceding it.

    The list is modified in place and also returned.

    Args:
        data (list): Dicts with a "type" key of "text" or "image"; text
            entries may carry a "level".

    Returns:
        list: The same list object. Every image entry is guaranteed to have a
        "level" key afterwards; it is None when no levelled text came before
        it and the image had no level of its own.
    """
    last_text_level = None  # level of the most recent text element

    for element in data:
        kind = element["type"]
        if kind == "text":
            last_text_level = element.get("level")
        elif kind == "image":
            if last_text_level is not None:
                element["level"] = last_text_level
            else:
                # Previously the key was left out entirely, which made
                # group_by_level() fail with KeyError on such images.
                element.setdefault("level", None)

    return data



def group_by_level(data):
    """Group consecutive elements that share the same level.

    Args:
        data (list): Dicts with "type" and either "text" or "image"; "level"
            is optional and treated as None when absent.

    Returns:
        list: One {"level": ..., "content": [...]} dict per run of equal
        levels, in input order. Content entries keep only "type" plus "text"
        or "image".
    """
    grouped_data = []
    current_group = None

    for item in data:
        # .get() so an element without a level (e.g. an image before any
        # text) joins a None-level group instead of raising KeyError.
        level = item.get("level")
        if current_group is None or current_group["level"] != level:
            if current_group is not None:
                grouped_data.append(current_group)
            current_group = {"level": level, "content": []}

        if "text" in item:
            payload = {"text": item["text"]}
        else:
            payload = {"image": item["image"]}
        current_group["content"].append({"type": item["type"], **payload})

    if current_group is not None:
        grouped_data.append(current_group)

    return grouped_data


def process_content(input_data):
    """
    Merge consecutive text items in each group while keeping images in place.

    Args:
        input_data (list): Dicts that each hold a "content" list of
            {"type": "text", "text": ...} or {"type": "image", "image": ...}
            items. Any other keys (such as "level") are ignored.
    Returns:
        list: One {"content": [...]} dict per input entry. Runs of text are
        joined with a single space; image items are passed through unchanged.
        Items of any other type are dropped.
    """
    processed_data = []

    for entry in input_data:
        # The group's "level" is not part of the output, so it is not read
        # here; the old unused lookup only raised KeyError when it was absent.
        processed_content = []
        text_buffer = []

        for item in entry["content"]:
            if item["type"] == "text":
                text_buffer.append(item["text"])
            elif item["type"] == "image":
                if text_buffer:
                    # Flush the buffered text as one entry before the image.
                    processed_content.append({"type": "text", "text": " ".join(text_buffer)})
                    text_buffer = []
                processed_content.append(item)

        # Flush whatever text follows the last image.
        if text_buffer:
            processed_content.append({"type": "text", "text": " ".join(text_buffer)})

        processed_data.append({"content": processed_content})

    return processed_data

# def export_to_json(doc_obj, output_path):
#     with open(output_path, 'w', encoding='utf-8') as f:
#         json.dump(doc_obj, f, cls=SerializableDocElement, indent=4, ensure_ascii=False)

# Example usage
def main(file_path):
    """Run list_paragraphs() on ``file_path`` and return its result."""
    return list_paragraphs(file_path)
   
# Example run. NOTE: this absolute path is specific to the author's machine;
# replace it with the path to your own .docx file before executing the cell.
file_path = '/home/anushkas/Downloads/US/ProdDoc1/Line Item Filter (3).docx'
# The call is the cell's last expression, so its return value is displayed.
main(file_path)

Processing file: /home/anushkas/Downloads/US/ProdDoc1/Line Item Filter (3).docx
level data = 
 [{'type': 'text', 'level': 1, 'text': 'Line Item Filter'}, {'type': 'text', 'level': 1, 'text': 'The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. '}, {'type': 'image', 'image': '/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png', 'level': 1}, {'type': 'text', 'level': 1, 'text': 'This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side.'}, {'type': 'text', 'level': 2, 'text': 'Line Item Structure Container'}, {'type': 'text', 'level': 2, 'text': 'The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the “Line Items” container. As an example a structure with Location and Cost Group is shown.'

[{'content': [{'type': 'text',
    'text': 'Line Item Filter The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. '},
   {'type': 'image',
    'image': '/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png'},
   {'type': 'text',
    'text': 'This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side.'}]},
 {'content': [{'type': 'text',
    'text': 'Line Item Structure Container The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the “Line Items” container. As an example a structure with Location and Cost Group is shown.'},
   {'type': 'image',
    'image': '/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image7.png'},
   {'type': 'image',
    'image': '/home/anushkas/Downloads/US/ProdD

In [4]:
# Hand-written sample input for process_data(): a list of groups, each with a
# "level" and an ordered "content" list of text / image items.
data=[
  {
    "level": 1,
    "content": [
      {
        "type": "text",
        "text": "Line Item Filter"
      },
      {
        "type": "text",
        "text": "The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. "
      },
      {
        "type": "image",
        "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png"
      },
      {
        "type": "text",
        "text": "This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side."
      }
    ]
  },
  {
    "level": 2,
    "content": [
      {
        "type": "text",
        "text": "Line Item Structure Container"
      },
      {
        "type": "text",
        "text": "The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the “Line Items” container. As an example a structure with Location and Cost Group is shown."
      },
      {
        "type": "image",
        "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image7.png"
      },
      {
        "type": "image",
        "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image8.png"
      }
    ]
  },
  {
    "level": 3,
    "content": [
      {
        "type": "text",
        "text": "Line Item Filter2222222222"
      },
      {
        "type": "text",
        "text": "The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. The filter disables/ enables the visibility of Leading Structure Items which are not or are assigned to Line Items. Every Leading Structure Container, such as BoQ, Activities, … provides the Line Item Filter  in its tool bar."
      },
      {
        "type": "image",
        "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image11.png"
      }
    ]
  },
  {
    "level": 4,
    "content": [
      {
        "type": "text",
        "text": "Line Item Filter3333333333333"
      },
      {
        "type": "text",
        "text": "The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant "
      }
    ]
  }
]

In [5]:
def process_data(input_data):
    """
    Merge consecutive text items in each group while keeping images in place.

    Args:
        input_data (list): Dicts that each hold a "content" list of
            {"type": "text", "text": ...} or {"type": "image", "image": ...}
            items. Any other keys (such as "level") are ignored.
    Returns:
        list: One {"content": [...]} dict per input entry. Runs of text are
        joined with a single space; image items are passed through unchanged.
        Items of any other type are dropped.
    """
    processed_data = []

    for entry in input_data:
        # "level" is never emitted, so it is no longer looked up; the old
        # unused lookup made groups without that key fail with KeyError.
        merged = []
        pending_text = []

        for item in entry["content"]:
            if item["type"] == "text":
                pending_text.append(item["text"])
            elif item["type"] == "image":
                if pending_text:
                    # Emit the text collected so far as a single entry.
                    merged.append({"type": "text", "text": " ".join(pending_text)})
                    pending_text = []
                merged.append(item)

        # Trailing text after the last image (or a text-only group).
        if pending_text:
            merged.append({"type": "text", "text": " ".join(pending_text)})

        processed_data.append({"content": merged})

    return processed_data
# Run process_data() on the sample `data` defined in an earlier cell; as the
# cell's last expression, the returned list is displayed as the cell output.
process_data(data)

[{'content': [{'type': 'text',
    'text': 'Line Item Filter The estimate in RIB 4.0 provides an adaptive filter for a clearer and conveniant presentation of assignments to Line Items. This Filter is called “Line Item Filter”. '},
   {'type': 'image',
    'image': '/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image3.png'},
   {'type': 'text',
    'text': 'This figure shows the two Leading Structure Containers of BoQ and Activities on the left, and the Line Items Container on the right side.'}]},
 {'content': [{'type': 'text',
    'text': 'Line Item Structure Container The following illustration is an image out of the Estimate module, showing on the left side this Line Item Structure container and on the right side the “Line Items” container. As an example a structure with Location and Cost Group is shown.'},
   {'type': 'image',
    'image': '/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image7.png'},
   {'type': 'image',
    'image': '/home/anushkas/Downloads/US/ProdD

In [6]:
import os
import json
from docx import Document

def save_image(rel, extracted_images_dir, image_index):
    """Write the image behind a relationship to disk and return its path.

    Args:
        rel: Relationship object whose ``target_part`` exposes
            ``content_type`` and ``blob``.
        extracted_images_dir: Existing directory to write into.
        image_index: Number used in the file name ("image<index><ext>").

    Returns:
        str: Path of the written file. The extension is chosen from the
        content type and falls back to '.png' for unrecognised types.
    """
    part = rel.target_part
    known_suffixes = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
        'image/bmp': '.bmp',
        'image/tiff': '.tiff',
        'image/svg+xml': '.svg',
    }
    suffix = known_suffixes.get(part.content_type, '.png')

    image_path = os.path.join(extracted_images_dir, f"image{image_index}{suffix}")
    with open(image_path, 'wb') as out:
        out.write(part.blob)

    return image_path

def list_paragraphs(file_path):
    """
    Extract text and embedded images from a DOCX file, grouped per heading.

    Non-empty paragraphs (whitespace-stripped) become text elements. Each
    paragraph styled 'Heading 1' to 'Heading 4' increments a running counter
    that is used as the level for that heading and the text after it.
    Embedded images are saved through save_image() into an
    'extracted_images' folder next to the input file, numbered sequentially.

    Relies on Doc, DocElement, SerializableDocElement, add_image_levels,
    group_by_level and process_content being defined in the notebook
    namespace. The former placeholder ``from some_module import ...`` was
    removed because it raised ModuleNotFoundError on every call.

    Args:
        file_path: Path to the .docx file.

    Returns:
        list: The result of process_content() applied to the grouped elements.
    """
    print(f"Processing file: {file_path}")

    filename = os.path.basename(file_path)
    result = Doc(filename)

    # Only membership is checked; the numeric values are not used below.
    style_to_level = {
        'Heading 1': 1,
        'Heading 2': 2,
        'Heading 3': 3,
        'Heading 4': 4
    }
    drawing_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    docx = Document(file_path)

    base_dir = os.path.dirname(file_path)
    extracted_images_dir = os.path.join(base_dir, 'extracted_images')
    os.makedirs(extracted_images_dir, exist_ok=True)

    previous_level = None  # level of the latest heading; None before the first
    heading_counter = 0
    image_index = 1  # Counter for naming images

    for paragraph in docx.paragraphs:
        style_name = paragraph.style.name
        text = paragraph.text.replace('\u00a0', ' ').strip()

        if text:
            if style_name in style_to_level:
                # Each heading opens a new group, whatever its depth.
                heading_counter += 1
                previous_level = heading_counter

            result.sections.append(
                DocElement(type='text', content=text, level=previous_level)
            )

        # Embedded pictures appear as <a:blip r:embed="..."> inside the runs.
        for run in paragraph.runs:
            for inline_shape in run._element.findall('.//a:blip', namespaces=drawing_ns):
                rId = inline_shape.get(embed_attr)
                if rId:
                    rel = docx.part.rels[rId]
                    image_path = save_image(rel, extracted_images_dir, image_index)
                    image_index += 1
                    result.sections.append(DocElement(type='image', content=image_path))

    # Round-trip through JSON to turn the object tree into plain dicts.
    data = json.loads(json.dumps(result, cls=SerializableDocElement, indent=4, ensure_ascii=False))
    updated_data = add_image_levels(data['sections'])
    output_data = group_by_level(updated_data)
    processed_output = process_content(output_data)

    print(json.dumps(processed_output, indent=2))
    return processed_output


In [24]:
def list_paragraphs(file_path):
    """Extract text and embedded images from a .docx file, grouped per heading.

    Paragraphs styled 'Title' or 'Heading 1' to 'Heading 4' increment a running
    counter; that counter is the level given to the heading and to the text
    after it. Embedded images are written as image1, image2, ... into an
    'extracted_images' folder next to the input file. Intermediate results
    are printed for debugging.

    Args:
        file_path: Path to the .docx file.

    Returns:
        list: The result of process_content() applied to the grouped elements.
    """
    print(f"Processing file: {file_path}")

    result = Doc(os.path.basename(file_path))

    # Style names that start a new group.
    heading_style_names = frozenset(
        ('Title', 'Heading 1', 'Heading 2', 'Heading 3', 'Heading 4')
    )
    suffix_for = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
        'image/bmp': '.bmp',
        'image/tiff': '.tiff',
        'image/svg+xml': '.svg',
    }
    blip_namespaces = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_key = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    source = Document(file_path)

    # Images are saved alongside the source document.
    out_dir = os.path.join(os.path.dirname(file_path), 'extracted_images')
    os.makedirs(out_dir, exist_ok=True)

    active_level = None  # level of the latest heading; None before the first
    heading_total = 0
    next_image_no = 1

    for block in source.paragraphs:
        style_label = block.style.name
        content = block.text.replace('\u00a0', ' ')

        if content.strip():
            if style_label in heading_style_names:
                print("----    ", style_label)
                heading_total += 1
                active_level = heading_total
            result.sections.append(
                DocElement(type='text', content=content, level=active_level)
            )

        # Embedded pictures appear as <a:blip r:embed="..."> inside the runs.
        for run in block.runs:
            for blip in run._element.findall('.//a:blip', namespaces=blip_namespaces):
                rel_id = blip.get(embed_key)
                if rel_id:
                    part = source.part.rels[rel_id].target_part
                    suffix = suffix_for.get(part.content_type, '.png')

                    # Sequential numbering, independent of the text elements.
                    destination = os.path.join(out_dir, f"image{next_image_no}{suffix}")
                    next_image_no += 1

                    with open(destination, 'wb') as sink:
                        sink.write(part.blob)

                    result.sections.append(DocElement(type='image', content=destination))

    # Round-trip through JSON to turn the object tree into plain dicts.
    data = json.loads(json.dumps(result, cls=SerializableDocElement, indent=4, ensure_ascii=False))
    print("\n\n original data = \n", data,"\n\n")
    updated_data = add_image_levels(data['sections'])
    print("level data = \n", updated_data)
    output_data = group_by_level(updated_data)
    print("\n---group_by_level----\n", output_data, "\n------------\n")
    processed_output = process_content(output_data)

    print("\n00000000000000000\n", json.dumps(processed_output, indent=2), "\n00000000000000000\n")

    return processed_output




def add_image_levels(data):
    """Copy the level of the preceding text element onto each image element.

    The input list is updated in place and returned.

    Args:
        data (list): Dicts with a "type" key of "text" or "image"; text
            entries may carry a "level".

    Returns:
        list: The same list object. Each image entry always ends up with a
        "level" key; the value is None when no levelled text preceded it and
        the image carried no level of its own.
    """
    last_text_level = None  # level of the most recent text element

    for element in data:
        element_type = element["type"]
        if element_type == "text":
            last_text_level = element.get("level")
        elif element_type == "image":
            if last_text_level is None:
                # Keep the key present so downstream grouping by "level"
                # does not fail with KeyError on leading images.
                element.setdefault("level", None)
            else:
                element["level"] = last_text_level

    return data



def group_by_level(data):
    """Collect consecutive elements with the same level into groups.

    Args:
        data (list): Dicts with "type" and either "text" or "image"; a
            missing "level" is treated as None.

    Returns:
        list: {"level": ..., "content": [...]} dicts in input order, one per
        run of equal levels. Content entries contain only "type" plus "text"
        or "image".
    """
    grouped_data = []
    current_group = None

    for item in data:
        # Tolerate elements that never received a level (for example an
        # image ahead of any text) rather than raising KeyError.
        item_level = item.get("level")
        starts_new_group = current_group is None or current_group["level"] != item_level
        if starts_new_group:
            if current_group is not None:
                grouped_data.append(current_group)
            current_group = {"level": item_level, "content": []}

        entry = {"type": item["type"]}
        if "text" in item:
            entry["text"] = item["text"]
        else:
            entry["image"] = item["image"]
        current_group["content"].append(entry)

    if current_group is not None:
        grouped_data.append(current_group)

    return grouped_data


def process_content(input_data):
    """
    Combine consecutive text items per group and keep images in sequence.

    Args:
        input_data (list): Dicts that each hold a "content" list of
            {"type": "text", "text": ...} or {"type": "image", "image": ...}
            items. Other keys, including "level", are not read.
    Returns:
        list: One {"content": [...]} dict per input entry, with adjacent text
        joined by a single space and image items passed through as-is. Items
        of any other type are dropped.
    """
    processed_data = []

    for entry in input_data:
        # No "level" lookup: the value was unused and only caused KeyError
        # for groups that did not carry the key.
        processed_content = []
        text_buffer = []

        for item in entry["content"]:
            item_type = item["type"]
            if item_type == "text":
                text_buffer.append(item["text"])
            elif item_type == "image":
                if text_buffer:
                    # An image ends the current run of text.
                    processed_content.append({"type": "text", "text": " ".join(text_buffer)})
                    text_buffer = []
                processed_content.append(item)

        # Text remaining after the final image.
        if text_buffer:
            processed_content.append({"type": "text", "text": " ".join(text_buffer)})

        processed_data.append({"content": processed_content})

    return processed_data

# def export_to_json(doc_obj, output_path):
#     with open(output_path, 'w', encoding='utf-8') as f:
#         json.dump(doc_obj, f, cls=SerializableDocElement, indent=4, ensure_ascii=False)

# Example usage
def main(file_path):
    """Entry point: process ``file_path`` with list_paragraphs() and return the result."""
    return list_paragraphs(file_path)
   
# Example run. NOTE: this absolute path is specific to the author's machine;
# replace it with the path to your own .docx file before executing the cell.
file_path = '/home/anushkas/Downloads/Generate Budget from DJC.docx'
# The call is the cell's last expression, so its return value is displayed.
main(file_path)

Processing file: /home/anushkas/Downloads/Generate Budget from DJC.docx
----     Title


 original data = 
 {'name': 'Generate Budget from DJC.docx', 'sections': [{'type': 'text', 'level': 1, 'text': 'Generate Budget from DJC'}, {'type': 'text', 'level': 1, 'text': 'The “Generate Budget from DJC” function allows user to create new estimate with selected line items or all line items and generate budget from the “Cost Total” of the Line items.'}, {'type': 'text', 'level': 1, 'text': 'Generate Budget from DJC'}, {'type': 'image', 'image': '/home/anushkas/Downloads/extracted_images/image1.png'}, {'type': 'text', 'level': 1, 'text': 'Step 1. Select Line Item or All Line Items First step to select for which line items generate budget from DJC. There are 2 options for generating Budget. First option is “Generate Budget for selected Line Items” and second option “Generate Budget for All Line Items”.'}, {'type': 'text', 'level': 1, 'text': 'Generate Budget for selected Line Items'}, {'type': 'i

[{'content': [{'type': 'text',
    'text': 'Generate Budget from DJC The “Generate Budget from DJC” function allows user to create new estimate with selected line items or all line items and generate budget from the “Cost Total” of the Line items. Generate Budget from DJC'},
   {'type': 'image',
    'image': '/home/anushkas/Downloads/extracted_images/image1.png'},
   {'type': 'text',
    'text': 'Step 1. Select Line Item or All Line Items First step to select for which line items generate budget from DJC. There are 2 options for generating Budget. First option is “Generate Budget for selected Line Items” and second option “Generate Budget for All Line Items”. Generate Budget for selected Line Items'},
   {'type': 'image',
    'image': '/home/anushkas/Downloads/extracted_images/image2.png'},
   {'type': 'text',
    'text': 'Step 2. Generate Budget from DJC Click on the wizard “Generate Budget from DJC”. Select the Generate Budget Option. After selecting the option and clicking “OK”, the