In [1]:
import docx
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph

import pandas as pd
import os

from uuid import uuid4
from unidecode import unidecode

# Work from the folder that holds the input documents. The path is relative,
# so the guard keeps the cell re-runnable: a second run would otherwise look
# for the folder inside itself and raise FileNotFoundError.
DATA_DIR = 'multimodal_docx_parser_files'
if os.path.basename(os.getcwd()) != DATA_DIR:
    os.chdir(DATA_DIR)

In [2]:
# Display the current working directory to confirm where files will be read and written.
os.getcwd()

'C:\\Users\\91994\\my_workspace\\doc_parser\\multimodal_docx_parser_files'

In [3]:
import openai
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())
# The module-level setting is spelled `api_key`; assigning to `apikey` only
# created an unused attribute. Indexing os.environ (rather than .get) makes a
# missing key fail here with a KeyError instead of later at request time.
openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser

In [5]:
def gen_uuid4():
    """Return a newly generated random (version 4) UUID in its string form."""
    fresh_id = uuid4()
    return str(fresh_id)

In [6]:
def extract_image_from_para(doc, para):
    """Save the picture embedded in a paragraph to disk and report its alt text.

    Every run of *para* is checked for a ``w:drawing`` element. For each run
    that holds an embedded picture, the image bytes are resolved through
    ``doc.part.related_parts`` and written to
    ``extracted_image_<relationship id>.jpg`` in the current working directory.
    If several runs carry pictures, each one is written but only the last one
    is reported in the return value.

    Args:
        doc: Document that owns *para*; used to resolve the picture's
            relationship id to its binary part.
        para: Paragraph whose runs are scanned.

    Returns:
        Tuple ``(image_found, image_filename, alt_text)``. When no embedded
        picture is present the tuple is ``(False, None, None)``. ``alt_text``
        is ``'No Alt Text'`` when the picture carries no description.
    """
    image_found = False
    image_filename = None
    alt_text = None

    for run in para.runs:
        # Only runs that contain a drawing can hold a picture.
        if not run._element.xpath('.//w:drawing'):
            continue

        # A drawing does not always wrap an embedded picture, so the blip
        # lookup can come back empty; skip the run instead of indexing into
        # an empty list.
        embed_ids = run._element.xpath('.//a:blip/@r:embed')
        if not embed_ids:
            continue
        relationship_id = embed_ids[0]

        # Skip relationship ids that do not resolve to a part of this document.
        try:
            image_part = doc.part.related_parts[relationship_id]
        except KeyError:
            continue

        # Save the image to a file.
        # NOTE: the name always ends in .jpg, whatever the stored image format is.
        image_filename = f'extracted_image_{relationship_id}.jpg'
        with open(image_filename, 'wb') as img_file:
            img_file.write(image_part.blob)

        # Extract the alt text, falling back to a fixed placeholder.
        alt_text_elements = run._element.xpath('.//wp:docPr/@descr')
        alt_text = alt_text_elements[0] if alt_text_elements else 'No Alt Text'

        image_found = True
    return image_found, image_filename, alt_text

In [7]:
def iter_block_items(parent):
    """Walk the direct children of *parent*'s body in document order.

    Yields a ``Paragraph`` for each paragraph element, a ``Table`` for each
    table element, and the literal string ``'Image'`` for any other child
    whose tag ends with ``drawing``. Children of any other kind are skipped.
    """
    body = parent.element.body
    for node in body.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
            continue
        if node.tag.endswith('drawing'):
            yield 'Image'

In [8]:
def extract_tables_into_csv(table, doc_name, table_id):
    """Dump a table's cell text to a CSV file and render it as Markdown.

    The CSV is written as ``<doc_name>_table_<table_id>.csv`` in the current
    working directory, without an index column or a header row.

    Returns:
        Tuple ``(file_name, table_markdown)``: the CSV file name and the
        Markdown rendering of the same cell grid.
    """
    cell_grid = [[cell.text for cell in row.cells] for row in table.rows]
    frame = pd.DataFrame(cell_grid)

    csv_name = f'{doc_name}_table_{table_id}.csv'
    frame.to_csv(csv_name, index=False, header=None)

    return csv_name, frame.to_markdown(index=False)

In [9]:
def process_paragraph(para):
    """Return ``(text, style_name)`` for the given paragraph."""
    return para.text, para.style.name

In [10]:
def flatten_table(table, document=None):
    """Flatten a table into a DataFrame with one row per cell.

    Args:
        table: Table whose rows and cells are walked in order.
        document: Document that owns *table*, needed to resolve embedded
            pictures. When omitted, the notebook-level ``doc`` variable is
            used, so existing one-argument calls behave as before.

    Returns:
        DataFrame with columns ``rec_id``, ``table_rec_id``, ``table_id``,
        ``row_id``, ``cell_id``, ``content_type``, ``cell_text`` and
        ``image_loc``. ``row_id`` and ``cell_id`` are 1-based positions.
        ``table_rec_id`` and ``table_id`` are left as ``None`` for the caller
        to fill in. A cell is typed ``'Image'`` (with the picture's alt text
        as ``cell_text``) or ``'Text'`` (with the cell's text).
    """
    if document is None:
        document = doc

    columns = ['rec_id', 'table_rec_id', 'table_id', 'row_id', 'cell_id',
               'content_type', 'cell_text', 'image_loc']

    # Collect plain rows and build the frame once, instead of enlarging a
    # DataFrame cell by cell.
    records = []
    for row_id, row in enumerate(table.rows, start=1):
        for cell_id, cell in enumerate(row.cells, start=1):
            # NOTE: only the first paragraph of the cell is inspected for a
            # picture; a picture in a later paragraph is not detected.
            image, image_loc, image_alt_text = extract_image_from_para(document, cell.paragraphs[0])
            if image:
                content = image_alt_text
                content_type = 'Image'
            else:
                content = cell.text
                content_type = 'Text'
                image_loc = None

            records.append([gen_uuid4(), None, None, row_id, cell_id, content_type, content, image_loc])

    return pd.DataFrame(records, columns=columns)


In [11]:
#load the document
doc_name = 'Birdwatching Guide'
doc = docx.Document(f'{doc_name}.docx')

content_columns = ['rec_id','section_id','content_type', 'content', 'content_style', 'table_location', 'image_location']
content_table_columns = ['rec_id','table_rec_id','table_id','row_id','cell_id','content_type','cell_text','image_loc']

# Rows and per-table frames are collected in lists and turned into DataFrames
# once after the loop, rather than enlarging a DataFrame on every iteration.
content_rows = []
table_frames = []
table_id = 1

#iterate thru each item and classify
for item in iter_block_items(doc):
    rec_id = gen_uuid4()
    # Reset every per-item field. content_style in particular must be reset:
    # it is only assigned for text paragraphs, so without this an image or
    # table row would reuse the previous paragraph's style (or raise NameError
    # when no paragraph has been seen yet).
    table_location = None
    content = None
    content_style = None
    image_location = None
    print(item)
    if isinstance(item, Paragraph):
        image, image_loc, image_alt_text = extract_image_from_para(doc, item)
        if image:
            content_type = 'Image'
            content = image_alt_text
            image_location = image_loc
        else:
            content, content_style = process_paragraph(item)
            content_type = 'Para'
            # Paragraphs holding only whitespace are not recorded.
            if content.strip() == '':
                continue
    elif isinstance(item, Table):
        table_location, content = extract_tables_into_csv(item, doc_name, table_id)
        content_type = 'Table'
        table_df = flatten_table(item)
        # Link every cell row back to this table's record in content_df.
        table_df['table_rec_id'] = rec_id
        table_df['table_id'] = table_id
        table_frames.append(table_df)
        table_id+=1

    elif item=='Image':
        # The item is the bare string 'Image', not a paragraph, so there are
        # no runs to extract a picture from; it is only recorded as an image.
        print('Image found')
        content_type = 'Image'

    content_rows.append([rec_id, None, content_type, content, content_style, table_location, image_location])

content_df = pd.DataFrame(content_rows, columns=content_columns)
# pd.concat rejects an empty list, so a document without tables gets an empty
# frame that still has the expected columns.
if table_frames:
    content_table_df = pd.concat(table_frames, ignore_index=True)
else:
    content_table_df = pd.DataFrame(columns=content_table_columns)

content_df.to_csv(f'{doc_name}_updated_content.csv')
content_table_df.to_csv(f'{doc_name}_updated_content_table.csv')

<docx.text.paragraph.Paragraph object at 0x0000019A504A50D0>
<docx.text.paragraph.Paragraph object at 0x0000019A50485F70>
<docx.text.paragraph.Paragraph object at 0x0000019A504A50D0>
<docx.text.paragraph.Paragraph object at 0x0000019A504A5100>
<docx.text.paragraph.Paragraph object at 0x0000019A504A50A0>
<docx.text.paragraph.Paragraph object at 0x0000019A504A5100>
<docx.text.paragraph.Paragraph object at 0x0000019A504A50D0>
<docx.table.Table object at 0x0000019A504A5100>
<docx.text.paragraph.Paragraph object at 0x0000019A504A5070>
<docx.text.paragraph.Paragraph object at 0x0000019A504C0160>
<docx.text.paragraph.Paragraph object at 0x0000019A504A5070>
<docx.text.paragraph.Paragraph object at 0x0000019A504C0160>
<docx.text.paragraph.Paragraph object at 0x0000019A504A5070>
<docx.text.paragraph.Paragraph object at 0x0000019A504C0160>
<docx.text.paragraph.Paragraph object at 0x0000019A504A5070>
<docx.text.paragraph.Paragraph object at 0x0000019A504C01C0>
<docx.text.paragraph.Paragraph object

In [12]:
import base64
#function to encode image
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as a Base64 string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [13]:
# Module-level OpenAI client, created with default settings and used by gen_image_desc.
client = OpenAI()

def gen_image_desc(base64_image, mime_type="image/jpeg", model="gpt-4o-mini",
                   prompt="Describe the image in single sentence"):
    """Ask a chat model to describe a Base64-encoded image.

    The image is sent inline as a ``data:`` URL together with a text prompt,
    in a single user message, through the module-level ``client``.

    Args:
        base64_image: Image bytes encoded as a Base64 string.
        mime_type: Media type placed in the data URL. Defaults to
            ``image/jpeg``; pass e.g. ``image/png`` when the encoded bytes are
            not JPEG.
        model: Name of the chat model to call.
        prompt: Instruction text sent alongside the image.

    Returns:
        The message content of the first choice in the response.
    """
    image_url = f"data:{mime_type};base64,{base64_image}"
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
    )
    return response.choices[0].message.content

In [14]:
# Add a blank description column, replace missing values with empty strings,
# then keep only the cell rows that have an image file recorded.
content_table_df['ai_gen_image_desc'] = None
content_table_df.fillna(value='', inplace=True)
image_df = content_table_df.loc[content_table_df['image_loc'].ne('')]

In [15]:
# For each image row: encode the saved file, ask the model for a description,
# and store it on the row(s) of content_table_df with the same rec_id.
for idx, rec in image_df.iterrows():
    image_path = rec['image_loc']
    base64_image = encode_image(image_path)
    img_desc = gen_image_desc(base64_image)

    same_record = content_table_df.rec_id == rec['rec_id']
    content_table_df.loc[same_record, 'ai_gen_image_desc'] = img_desc

In [16]:
# Persist the table-cell records together with the generated image descriptions.
img_desc_csv = f'{doc_name}_updated_content_table_img_desc.csv'
content_table_df.to_csv(img_desc_csv)

In [17]:
# Prompt text: asks for a short summary and the list of headers, returned as
# JSON; {table_content} is the placeholder filled in at invoke time.
TABLE_SUMMARY_TEMPLATE = """1. Generate a short summary for the content and
    2. Extract the list of headers from the content and provide in a comma seperated list.
    Provide these above details in key value pairs for Summary and Headers in Json format

    Content: ```{table_content}```"""

prompt = ChatPromptTemplate.from_template(TABLE_SUMMARY_TEMPLATE)
model = ChatOpenAI()
output_parser = StrOutputParser()

# Pipe the three stages together: prompt, then chat model, then output parser.
table_summary_chain = prompt | model | output_parser

In [18]:
# Add a blank summary column, replace missing values with empty strings,
# then keep only the records that point at an exported table file.
content_df['table_summary'] = None
content_df.fillna(value='', inplace=True)
table_df = content_df.loc[content_df['table_location'].ne('')]

In [19]:
# For each table record: reload its CSV, serialise it to JSON text, run the
# summary chain, and store the reply on the row(s) with the same rec_id.
for idx, rec in table_df.iterrows():
    df = pd.read_csv(rec['table_location'])
    table_str = str(df.to_json())
    response = table_summary_chain.invoke({"table_content": table_str})

    same_record = content_df.rec_id == rec['rec_id']
    content_df.loc[same_record, 'table_summary'] = response

In [20]:
# Persist the content records together with the generated table summaries.
table_summary_csv = f'{doc_name}_updated_content_table_summary.csv'
content_df.to_csv(table_summary_csv)