In [25]:
import docx
import re
import numpy as np
from docx.shared import Pt
from datetime import date

In [26]:
def parse_toc_hierarchy(doc):
    """Parse the table of contents into a two-level (section -> subsection) mapping.

    TOC entries are the paragraphs, from the 'TABLE OF CONTENTS' paragraph onward,
    whose text contains one or more tabs followed by a digit. An entry with exactly
    three tab-separated fields is read as ``number<TAB>title<TAB>page``: a whole
    number marks a top-level section, any other number marks a subsection of the
    section given by its integer part. Entries with a different field count are
    treated as unnumbered top-level titles (their first field).

    Args:
        doc: Object exposing ``paragraphs``, a sequence of items with a ``text``
            attribute (e.g. a python-docx ``Document``).

    Returns:
        Tuple ``(data_ind, data, toc_ind)``:
        - ``data_ind``: paragraph index -> heading text, including ``0`` ->
          "Title Page" and the TOC paragraph index -> "TABLE OF CONTENTS".
        - ``data``: top-level title -> {subsection title -> body text}, where the
          body text is the space-joined text of the paragraphs between that
          subsection heading and the next heading. Text that sits directly under
          a top-level heading is not stored.
        - ``toc_ind``: indices of paragraphs whose text is exactly
          'TABLE OF CONTENTS'.

    Raises:
        IndexError: if no 'TABLE OF CONTENTS' paragraph exists.
        ValueError: if the first field of a three-field entry is not a number, or
            a subsection appears before its top-level section.
    """
    toc_ind = [ind for ind, p in enumerate(doc.paragraphs) if p.text == 'TABLE OF CONTENTS']
    toc = [p.text for p in doc.paragraphs[toc_ind[0]:] if re.search(r'\t+\d', p.text)]

    titles = []
    subtitles = []
    sub_section = []
    # Top-level section number -> title. Looking parents up by number (rather than
    # by position in `titles`) stays correct when unnumbered titles are present.
    section_titles = {}
    for line in toc:
        split_line = line.split('\t')
        if len(split_line) == 3:
            sec = float(split_line[0])
            name = split_line[-2].strip()
            if (sec % 1) == 0:
                titles.append(name)
                section_titles[int(sec)] = name
            else:
                parent = section_titles.get(int(sec))
                if parent is None:
                    raise ValueError(
                        f"TOC subsection {split_line[0]!r} appears before its "
                        f"top-level section {int(sec)}"
                    )
                subtitles.append(name)
                sub_section.append(parent)
        else:
            # Stripped so it can match the stripped paragraph text compared below.
            titles.append(split_line[0].strip())

    # Subsection title -> parent title; the first TOC occurrence wins on duplicates.
    parent_of = {}
    for sub, parent in zip(subtitles, sub_section):
        parent_of.setdefault(sub, parent)

    headings = set(titles).union(subtitles)
    index = np.array([ind for ind, paragraph in enumerate(doc.paragraphs)
                      if paragraph.text.strip() in headings])

    data_ind = {}
    data = {k: {} for k in titles}
    data_ind[0] = "Title Page"
    data_ind[toc_ind[0]] = "TABLE OF CONTENTS"
    # Each heading's body runs up to the next heading (or the end of the document).
    for start, end in zip(index, np.append(index[1:], len(doc.paragraphs))):
        heading = doc.paragraphs[start].text.strip()
        # Exact match only: a title that is merely a substring of some subsection
        # title must not be filed as that subsection.
        parent = parent_of.get(heading)
        if parent is not None:
            data[parent][heading] = " ".join(p.text for p in doc.paragraphs[start + 1:end])
        data_ind[start] = heading

    return data_ind, data, toc_ind

In [27]:
def parse_toc(doc):
    """Split a document into flat sections using its table of contents.

    TOC entries are the paragraphs, from the 'TABLE OF CONTENTS' paragraph onward,
    whose text contains one or more tabs followed by a digit. For each entry the
    second-to-last tab-separated field is the section title and the last field is
    parsed as an integer page number.

    Returns a tuple ``(data_ind, data, toc_ind)``:
    - ``data_ind``: heading text -> paragraph index, plus "Title Page" -> 0 and
      "TABLE OF CONTENTS" -> index of the TOC paragraph.
    - ``data``: heading text -> space-joined text of the paragraphs between that
      heading and the next one (or the end of the document).
    - ``toc_ind``: indices of paragraphs whose text is exactly 'TABLE OF CONTENTS'.

    Raises IndexError when no 'TABLE OF CONTENTS' paragraph exists, and ValueError
    when the last field of a TOC entry is not an integer.

    Note: subheadings are not nested here; every TOC title is treated as one level.
    """
    paragraphs = doc.paragraphs
    toc_ind = [pos for pos, para in enumerate(paragraphs) if para.text == 'TABLE OF CONTENTS']

    entry_fields = [
        para.text.split('\t')
        for para in paragraphs[toc_ind[0]:]
        if re.search(r'\t+\d', para.text) is not None
    ]
    titles = [fields[-2].strip() for fields in entry_fields]
    # Not used further in this function, but int() still rejects malformed page fields.
    page_ind = [int(fields[-1]) for fields in entry_fields]

    # Positions of body paragraphs whose stripped text equals a TOC title.
    index = np.array([pos for pos, para in enumerate(paragraphs) if para.text.strip() in titles])
    stops = np.append(index[1:], len(paragraphs))

    data_ind = {"Title Page": 0, "TABLE OF CONTENTS": toc_ind[0]}
    data = {}
    for start, end in zip(index, stops):
        heading = paragraphs[start].text.strip()
        body = [para.text for para in paragraphs[start + 1:end]]
        data[heading] = " ".join(body)
        data_ind[heading] = start
    return data_ind, data, toc_ind

def find_tables(doc):
    """Return the tables of ``doc`` together with their positions.

    Walks ``doc.iter_inner_content()`` in order. For every table encountered, the
    recorded location is the number of paragraphs seen before it.

    Returns:
        ``(table_loc, tables)`` - two parallel lists: paragraph counts and the
        corresponding table objects.
    """
    paragraph_type = docx.text.paragraph.Paragraph
    table_type = docx.table.Table

    found = []  # (paragraphs seen so far, table) pairs, in document order
    seen_paragraphs = 0
    for block in doc.iter_inner_content():
        if isinstance(block, paragraph_type):
            seen_paragraphs += 1
            continue
        if isinstance(block, table_type):
            found.append((seen_paragraphs, block))

    table_loc = [loc for loc, _ in found]
    tables = [tbl for _, tbl in found]
    return table_loc, tables




In [28]:
def find_all_placeholders(doc):
    """Collect every ``{placeholder}`` token in the document's body paragraphs.

    All well-formed ``{...}`` tokens are returned, including several in the same
    paragraph; an opening brace with no matching closing brace is ignored.

    Args:
        doc: Object exposing ``paragraphs``, a sequence of items with a ``text``
            attribute (e.g. a python-docx ``Document``).

    Returns:
        ``(placeholders, placeholders_ind)`` - two parallel lists: the placeholder
        names without braces, and the index of the paragraph each was found in.
        A paragraph with several placeholders contributes its index once per
        placeholder.
    """
    placeholders = []
    placeholders_ind = []
    for paragraph_index, paragraph in enumerate(doc.paragraphs):
        # [^{}]* keeps each match inside a single innermost brace pair.
        for name in re.findall(r'\{([^{}]*)\}', paragraph.text):
            placeholders.append(name)
            placeholders_ind.append(paragraph_index)

    return placeholders, placeholders_ind

In [29]:
def move_table_after(table, paragraph):
    """Attach ``table``'s underlying element right after ``paragraph``'s element.

    Calls ``addnext`` on the paragraph's ``_p`` element with the table's ``_tbl``
    element as argument. Returns None.
    """
    paragraph._p.addnext(table._tbl)



In [30]:
# Input and output locations for this run (absolute paths on the author's machine).
# Alternative source proposal; uncomment to process it instead of the one below:
# doc_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Example Proposals/LXN_02_13MAY21_CCI_R1.docx"
doc_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Example Proposals/MGB_02_12DEC23_IVIS.docx"
# Where the filled-in template is saved.
edit_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/PRX_05_20APR23_ITCH_R3.1_test2.docx"
# Not read by any cell visible in this notebook view.
report_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/PRX_05 - Study Report_12132023.docx"
template_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Becca_Template.docx"

# Load the source proposal and the template whose placeholders get filled.
original = docx.Document(doc_path)
template = docx.Document(template_path)


In [19]:
# Placeholders present in the template body, and the parsed source proposal.
place, ind = find_all_placeholders(template)
data_ind, data, toc_ind = parse_toc(original)

# Non-empty paragraphs that precede the table of contents, as (index, text) pairs.
title_page = [
    (para_pos, para.text)
    for para_pos, para in enumerate(original.paragraphs[:toc_ind[0]])
    if para.text.strip()
]
table_loc, table = find_tables(original)

# Key values pulled from the title page; each lookup takes the first matching line.
title = title_page[0][1]
study_num = [
    (pos, text.split(': ')[-1])
    for pos, text in title_page
    if 'Project Quotation' in text
][0]
client = [
    (pos, text.split('Prepared for ')[-1])
    for pos, text in title_page
    if 'Prepared for' in text
][0]
t = date.today().strftime("%B %d, %Y")

# Rough name guess: the first two space-separated words after the last 'proposes '.
roughPM = " ".join(data['Project Manager'].split('proposes ')[-1].split(" ")[:2])
roughPC = " ".join(data['Project Coordinator:'].split('proposes ')[-1].split(" ")[:2])

# Placeholder name -> replacement text.
replace_str = {
    'Study Num': study_num[1],
    'Title': title,
    'Client': client[1],
    'Date': t,
    'Project Manager': roughPM,
    'Project Coordinator': roughPC,
}


In [20]:
# Fill the template's body placeholders in document order.
# - A name in replace_str is substituted in place.
# - 'Table 1' is replaced by the first table of the original document.
# - A section name from data_ind is replaced by that section's paragraphs.
all_ind = np.array(list(data_ind.values()))
# Sorted section starts: the end of a section is the next larger start. This does
# not rely on dict order and lets the last section run to the end of the document.
section_starts = np.sort(all_ind)
original_paragraphs = original.paragraphs
# Number of paragraphs inserted so far; shifts the precomputed placeholder indices.
add_factor = 0
for p, i in zip(place, ind):
    target = template.paragraphs[i + add_factor]
    if p in replace_str:
        target.text = target.text.replace('{' + p + '}', replace_str[p])
    elif p == 'Table 1':
        move_table_after(table[0], target)
        target.clear()
    elif p in data_ind:
        start = data_ind[p]
        later_starts = section_starts[section_starts > start]
        end = int(later_starts[0]) if later_starts.size else len(original_paragraphs)
        # Copy the section body (everything after its heading) in front of the
        # placeholder paragraph, in document order, then blank the placeholder.
        for x in range(start + 1, end):
            source = original_paragraphs[x]
            target.insert_paragraph_before(source.text, source.style)
        add_factor += max(end - start - 1, 0)
        target.clear()

#


In [21]:
# Collect the template's tables. Note: this rebinds table_loc, which was previously
# assigned from find_tables(original).
table_loc, temp_tables = find_tables(template)

In [22]:
# Replace {placeholder} tokens in the cells of the template's first table.
# Every placeholder in a cell is handled; nothing happens if there are no tables.
if temp_tables:
    first_table = temp_tables[0]
    row_len = len(first_table.rows)
    col_len = len(first_table.columns)
    for r in range(row_len):
        for c in range(col_len):
            cell = first_table.cell(r, c)
            cell_text = cell.text
            new_string = cell_text
            for placeholder in re.findall(r'\{([^{}]*)\}', cell_text):
                if placeholder in replace_str:
                    new_string = new_string.replace('{' + placeholder + '}', replace_str[placeholder])
            # Only write back when something changed, so untouched cells are left alone.
            if new_string != cell_text:
                cell.text = new_string
            

In [23]:
# Replace {placeholder} tokens in the header of the template's first section.
section = template.sections[0]
header = section.header
for paragraph in header.paragraphs:
    header_text = paragraph.text
    new_string = header_text
    # Read all placeholder names from the unmodified text first: character offsets
    # stop being valid as soon as one replacement changes the text length.
    for placeholder in re.findall(r'\{([^{}]*)\}', header_text):
        if placeholder in replace_str:
            new_string = new_string.replace('{' + placeholder + '}', replace_str[placeholder])
    if new_string != header_text:
        paragraph.text = new_string
            

In [24]:
# Write the template document, as modified in memory, to edit_path.
template.save(edit_path)

In [33]:
# Rebind header to the first section's header of the template.
# NOTE(review): nothing after this cell uses it in the visible notebook - looks like a scratch cell; confirm.
header = template.sections[0].header

Plan:
    Iterate through the paragraphs in the 'original' document.
    Using the index of the current paragraph, identify which section of the document is currently being iterated through, then:
        identify paragraphs to be deleted
        identify paragraphs to be included
        identify paragraphs to be edited
    
    
    

Data to scrape from a proposal:
- Project title
- Client name
- date of issue
- date(s) of reissue
- project ID
- methods used 
    - reference experiment abbreviation list
- people included
    - project manager
    - project coordinator
    - client coordinator

Data to scrape from a report:
- the same fields as listed above for a proposal
- date of report