In [183]:
import docx
import re
from itertools import groupby
import os

### Util functions

In [73]:
def clean_multiple_newlines(text):
    """Collapse runs of newline characters and trim the result.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every run of one or more consecutive newline
        characters replaced by a single newline, and with leading and
        trailing whitespace stripped.
    """
    collapsed = re.sub(r'\n+', '\n', text)
    return collapsed.strip()

### Header

In [71]:
def get_header_title_only(doc):
    """Return the text of the title cell in the first-page header.

    The title is read from a fixed position: first section -> first-page
    header -> first table -> first row -> second cell.

    Args:
        doc: A document object exposing ``sections`` (in this file, the
            object returned by ``docx.Document``).

    Returns:
        The ``text`` of that header cell.

    Note:
        None of the positional lookups are guarded, so a document whose
        first-page header lacks such a table/row/cell propagates the
        indexing error raised by the underlying collection.
    """
    first_page_header = doc.sections[0].first_page_header
    title_cell = first_page_header.tables[0].rows[0].cells[1]
    return title_cell.text

In [59]:
def get_header_all_information(doc):
    """Collect the distinct, non-empty cell texts of the first-page header.

    Walks every table in the first-page header of the document's first
    section, row by row and cell by cell.

    Args:
        doc: A document object exposing ``sections`` (in this file, the
            object returned by ``docx.Document``).

    Returns:
        A list of cell texts in first-seen order. Empty strings are
        dropped and a text that was already collected is not added again.
    """
    header_info = []
    for table in doc.sections[0].first_page_header.tables:
        for row in table.rows:
            for cell in row.cells:
                # Read the property once; it is used for both checks and the append.
                cell_text = cell.text
                if cell_text != "" and cell_text not in header_info:
                    header_info.append(cell_text)
    return header_info

### Main content

In [102]:
def process_table_as_text(table, rows_separatelly=False):
    """Render a table as text, one "; "-separated line per row.

    Args:
        table: A table object exposing ``rows``; each row exposes ``cells``
            and each cell exposes ``text`` and ``_tc``.
        rows_separatelly: When true, return the per-row strings as a list
            instead of joining them.

    Returns:
        A single string with the rows joined by newlines (default), or the
        list of row strings when ``rows_separatelly`` is true. A row whose
        cells are all empty or skipped contributes an empty string.
    """
    rendered_rows = []
    # Element of the most recently visited cell. It is initialised once and
    # never reset, so the comparison also spans row boundaries.
    # NOTE(review): presumably this de-duplicates merged cells, which
    # python-docx reports once per spanned grid position -- confirm.
    last_tc = None
    for row in table.rows:
        parts = []
        for cell in row.cells:
            current_tc = cell._tc
            if current_tc == last_tc:
                continue
            cell_text = cell.text
            if cell_text:
                parts.append(clean_multiple_newlines(cell_text))
            last_tc = current_tc
        rendered_rows.append("; ".join(parts))

    if rows_separatelly:
        return rendered_rows
    return "\n".join(rendered_rows)

def process_paragraph(paragraph):
    """Return a paragraph's text with newline runs collapsed and ends trimmed.

    Args:
        paragraph: An object exposing a ``text`` attribute.

    Returns:
        The result of ``clean_multiple_newlines`` applied to
        ``paragraph.text``.
    """
    raw_text = paragraph.text
    return clean_multiple_newlines(raw_text)

In [196]:
def read_file(filename, group_by_chapter=False, group_by_file=False):
    """Read a .docx file into a list of (file, title, chapter, text) tuples.

    The document body is walked in order. Every table becomes one entry
    (rendered with ``process_table_as_text``) and every non-empty paragraph
    becomes one entry. A paragraph whose style name starts with "Heading"
    starts a new chapter: its text becomes the chapter name of itself and of
    all following entries. Entries before the first heading are filed under
    "First page and table of contents". Body elements that are neither
    tables nor paragraphs are ignored.

    Args:
        filename: Path of the .docx file to read.
        group_by_chapter: When true, merge consecutive entries that share the
            same chapter into one entry whose text is the newline-joined text
            of its members.
        group_by_file: When true, merge everything into a single entry with
            chapter "N/A". Applied after ``group_by_chapter`` if both are set.

    Returns:
        A list of 4-tuples ``(short_filename, document_title, chapter_name,
        text)``. With ``group_by_file`` the list always has exactly one
        element, even for a document with no content (its text is then "").
    """
    doc = docx.Document(filename)

    # Use the platform's path rules and drop ".docx" only when it is the
    # trailing extension, not wherever it happens to occur in the name.
    short_filename = os.path.basename(filename)
    if short_filename.endswith(".docx"):
        short_filename = short_filename[:-len(".docx")]

    document_title = get_header_title_only(doc)
    full_document = []
    current_chapter_name = "First page and table of contents"
    for elem in doc.iter_inner_content():
        if isinstance(elem, docx.table.Table):
            full_document.append(
                (short_filename, document_title, current_chapter_name, process_table_as_text(elem))
            )
        elif isinstance(elem, docx.text.paragraph.Paragraph):
            paragraph_text = process_paragraph(elem)
            if len(paragraph_text) == 0:
                continue
            if elem.style.name.startswith("Heading"):
                # The heading itself is filed under the chapter it opens.
                current_chapter_name = paragraph_text
            full_document.append(
                (short_filename, document_title, current_chapter_name, paragraph_text)
            )

    if group_by_chapter:
        # groupby only merges *consecutive* entries, which keeps document order.
        full_document = [
            (*key, "\n".join(item[3] for item in items))
            for key, items in groupby(full_document, key=lambda entry: entry[:3])
        ]
    if group_by_file:
        # Every entry carries the same file name and title, so use the locals
        # directly; indexing full_document[0] would fail on an empty document.
        full_document = [
            (
                short_filename,
                document_title,
                "N/A",
                "\n".join(entry[3] for entry in full_document),
            )
        ]

    return full_document

### Testing - read all documents

In [210]:
# Read every .docx file in the sample directory into one flat list of entries.
DATA_SAMPLE_DIR = "./data_sample/"

# sorted(): os.listdir returns names in arbitrary order, so sort to keep the
# order of `all_documents` reproducible between runs.
sample_paths = [
    os.path.join(DATA_SAMPLE_DIR, name)
    for name in sorted(os.listdir(DATA_SAMPLE_DIR))
    # Names starting with "~" are skipped (presumably Word lock/temp files).
    if not name.startswith("~") and name.endswith(".docx")
]

all_documents = []
for f in sample_paths:
    all_documents.extend(read_file(f, group_by_chapter=True, group_by_file=True))