In [2]:
from docxtpl import DocxTemplate
from docx import Document, table, text
import re

import numpy as np
# from docx.shared import Pt
from datetime import date
import os
import pandas as pd

In [3]:
def parse_toc(doc):
    # potentially want to add functionality to figure out subheadings - dict in dict, json?, 
    toc_ind = [ind for ind,p in enumerate(doc.paragraphs) if p.text == 'TABLE OF CONTENTS']
    toc = [p.text for p in doc.paragraphs[toc_ind[0]:] if bool(re.search(r'\t+\d', p.text))]

    titles = []
    page_ind = []
    for line in toc:
        split_line = line.split('\t')
        # line_ind.append(float(split_line[0]))
        titles.append(split_line[-2].strip())
        page_ind.append(int(split_line[-1]))


    index = np.array([ind for ind, paragraph in enumerate(doc.paragraphs) if paragraph.text.strip() in titles])

    data = {}
    data_ind = {}
    data_ind["Title Page"] = 0
    data_ind["TABLE OF CONTENTS"] = toc_ind[0]
    for start,end in zip(index, np.append(index[1:], len(doc.paragraphs))):
        key = doc.paragraphs[start].text.strip()
        key = key.replace(' ', '_')

        data[key] = " ".join([p.text for p in doc.paragraphs[start+1:end]])
        data_ind[key] = start
    
    sortem = sorted(data_ind.items(), key=lambda x: x[1]) 
    data_ind = {tup[0]:tup[1] for tup in sortem}
    return data_ind , data

def find_tables(doc):
    para_count = 0
    table_loc = []
    tables = []
    for p in doc.iter_inner_content():
       
        if isinstance(p, table.Table):
            table_loc.append(para_count)
            tables.append(p)
        elif isinstance(p, text.paragraph.Paragraph):
            para_count += 1
    
    return table_loc, tables


def move_table_after(table, paragraph):
    tbl, p = table._tbl, paragraph._p
    p.addnext(tbl)



In [4]:
def scrape_proposal(input_path, template_path, save_path = None, save = True):
    
    # Load proposal and template docx
    input = Document(input_path)
    template = DocxTemplate(template_path)

    # Pull data from proposal (input)
    data_ind,data = parse_toc(input) 
    title_page =[(ind, p.text) for ind, p in enumerate(input.paragraphs) if ind < data_ind['TABLE OF CONTENTS'] and p.text.strip() != '' ]    
    table_loc, table = find_tables(input)

    # Pull placeholders from template
    place = template.get_undeclared_template_variables()
    replace = {k : None for k in place}

    # Identify key points, update replace dict
    replace['title'] = title_page[0][1]
    replace['study_num'] = [ x[1].split(': ')[-1] for x in title_page if 'Project Quotation' in x[1]][0]
    replace['client'] = [x[1].split('Prepared for ')[-1] for x in title_page if 'Prepared for' in x[1]][0]
    replace['t'] = date.today().strftime("%B %d, %Y")
    replace['description'] = input.sections[0].header.paragraphs[0].text.split('\t')[0]

    # Roughly identify the project manager and coordinator
    m = [x for x in data.keys() if 'Project_Manager' in x]
    if len(m) > 0:
        replace['pm'] = " ".join(data[m[0]].split('proposes ')[-1].split(" ")[0:2])

    c = [x for x in data.keys() if 'Project_Coordinator' in x]
    if len(c) > 0:
        replace['pc'] = " ".join(data[c[0]].split('proposes ')[-1].split(" ")[0:2])

    # Replace '&' in strings to prevent jinga2 errors
    for k,v in replace.items():
        if v is not None and '&' in v:
            replace[k] = v.replace("&","and")

    
    # For section replacement, iterate through and pull paragraphs from proposal
    all_ind = np.array(list(data_ind.values()))
    for p in place:
        if p in data_ind.keys():
            print(p)
            start = data_ind[p]
            end = all_ind[np.argwhere(all_ind == start)[0][0]+1]
            print(p+": "+str(start)+":"+str(end))
            sd1 = template.new_subdoc()
            
            # if (start+1 == end):
            #     replace[p] = None
            # else:
            for x in range(start+1, end, 1):
                sd1.add_paragraph(input.paragraphs[x].text, input.paragraphs[x].style)
                if table_loc[0]-1 == x:
                    move_table_after(table[0], sd1.paragraphs[-1])

            replace[p] = sd1
    
    path, fn = os.path.split(input_path)
    name, ext = os.path.splitext(fn)
    new_fn = name + '_REPORT' + ext


    if save_path is None:
        save_path = os.path.join(path, new_fn)
    else:
        save_path = os.path.join(save_path, new_fn)


    template.render(replace)

    if save:
        template.save(save_path)

    return template


In [7]:
template_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Report_Template_dxpt.docx"
# template = DocxTemplate(template_path)

edit_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Report_test.docx"
# input_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Example Proposals/MGB_02_12DEC23_IVIS.docx"
# input_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Example Proposals/VBI_01_19FEB20_EAE.docx"
input_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Example Proposals/LXN_02_13MAY21_CCI_R1.docx"
# input_path = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Test_Proposal/CYTK_05_16OCT23_HT.docx"

scrape_proposal(input_path, template_path,save= False)


Introduction_and_Background
Introduction_and_Background: 47:52
Animal_Description
Animal_Description: 74:82
Housing_and_Feeding
Housing_and_Feeding: 82:91
Methods
Methods: 118:127
References
References: 127:132
Design
Design: 91:106


<docxtpl.template.DocxTemplate at 0x120f757d0>

In [17]:
input = Document(input_path)
data_ind, data = parse_toc(input)

In [18]:
data_ind

{'Title Page': 0,
 'TABLE OF CONTENTS': 20,
 'Introduction_and_Background': 48,
 'Background': 49,
 'Abbreviations_used_in_this_proposal': 53,
 'Study_Deliverables': 70,
 'Marshall_Gerstein': 71,
 'Melior': 83,
 'Experimental_Procedures': 92,
 'Animal_Description': 93,
 'Housing_and_Feeding': 101,
 'Design': 110,
 'General_Operational_Terms': 123,
 'Methods': 134,
 'Data_Analysis': 141,
 'Terms_and_Conditions': 143,
 'Pricing': 145,
 'Terms': 149,
 'Appendix_1:__Key_Personnel_for_this_project': 198,
 'Project_Manager': 199,
 'Project_Coordinator:': 213,
 'Client_Management_Specialist:': 228,
 'Appendix_2:____Melior_Discovery_Background_and_Operations': 241,
 'Executive_Summary': 242,
 'Melior_Discovery_Overview': 251,
 'Facility': 255,
 'Security,_Monitoring,_and_Backup_Capability': 258,
 'Special_Licenses_/_Certificates': 263}

In [19]:
import os

In [34]:
proposal_dir = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Test_Proposal"
report_dir = "/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Test_Report"

current_proposals = os.listdir(proposal_dir)
current_reports = os.listdir(report_dir)

proposal_names = [x.split('.docx')[0] for x in current_proposals]
report_names = [x.split('_REPORT.docx')[0] for x in current_reports]

unmatched = [p for r,p in zip(proposal_names, current_proposals) if r not in report_names]



In [31]:
for u in unmatched:
    proposal_path = os.path.join(proposal_dir, u)
    scrape_proposal(proposal_path, template_path, save_path = report_dir)

Methods: 118:127
Design: 91:106
Housing_and_Feeding: 82:91
Animal_Description: 74:82
Introduction_and_Background: 47:52
References: 127:132
Methods: 116:140
Design: 90:104
Housing_and_Feeding: 81:90
Animal_Description: 73:81
Introduction_and_Background: 47:51


In [35]:
unmatched

['VBI_01_19FEB20_EAE.docx']

In [47]:
assays = pd.read_excel("/Users/rebeccakrall/Desktop/Example for Proposal Report Automation/Assay Codes.xlsx")
list(assays['Assay'])

lower_assay = [l.lower() for l in list(assays['Assay'])]

  warn(msg)


In [57]:
assay_list = []
for paragraph in input.paragraphs:
    for ass in list(assays['Assay']):
        if ass.lower() in paragraph.text.lower():
            assay_list.append(ass)

In [58]:
assay_list

['Platform', 'Platform']

In [4]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

tokens = word_tokenize(example_txt)
tagged_words = pos_tag(tokens)

print(tagged_words)

[('Randomization', 'NN'), (':', ':'), ('animals', 'NNS'), ('will', 'MD'), ('be', 'VB'), ('assigned', 'VBN'), ('randomly', 'RB'), ('to', 'TO'), ('treatment', 'NN'), ('groups', 'NNS'), (';', ':'), ('animals', 'NNS'), ('will', 'MD'), ('be', 'VB'), ('distributed', 'VBN'), ('into', 'IN'), ('treatment', 'NN'), ('groups', 'NNS'), ('according', 'VBG'), ('to', 'TO'), ('post-surgery', 'JJ'), ('pre-dose', 'JJ'), ('responses', 'NNS'), ('.', '.')]


In [None]:
def rough_past_tense(txt):
    # lemmatize to look for plural nouns as subjects, all verbs
    # change tense of all future/present verbs to past
    # look for md
    words = txt.split(' ')

In [65]:
tokens = word_tokenize(txt)
tagged_words = pos_tag(tokens)

In [66]:
verbs = [ind for (ind,word) in enumerate(tagged_words) if 'VB' in word[1]]
wills = [ind for (ind,word) in enumerate(tagged_words) if 'will' in word[0]]



In [67]:
# replace will be and will not be 
for w in wills:
    if tagged_words[w+1][0] == 'be':
        print("Will Be found starting at "+str(w))

        


    elif ((tagged_words[w+1][0] == 'not') | (tagged_words[w+1][0] == 'never')) & (tagged_words[w+2][0] == 'be'):
        print("Will not/never be found at "+str(w))
    


Will Be found starting at 3
Will not/never be found at 12


In [4]:
txt = "The Prism file can be provided to Marshall Gerstein upon request.  Additional post-hoc analyses can be conducted by Melior upon request for additional cost.  "
# txt = "Bioluminescent imaging will be performed on all animals on at either 4, 8, and 24 hours after TA administration.  Animals will be anesthetized using isoflurane inhalation.  Imaging will be conducted using a Perkin Elmer IVIS® Lumina III LT according to manufactures recommendations and signal analyzed using Perkin Elmer Living Image® 4.7.4 software.   D-Luciferin at concentration of 15 mg/mL will be used for RO injection. The imaging time will be 1 min post- D-Luciferin injection. "

In [5]:
import spacy
import pyinflect
nlp = spacy.load("en_core_web_sm")
# txt = 'Randomization: animals will be assigned randomly to treatment groups. Animals will always be distributed into treatment groups according to post-surgery pre-dose responses.'
# txt = "Test articles will be provided as pre-formulated stock to Melior to be diluted in PBS ready to a concentration of 1µg / 20 µl.  Test articles will be administered IM injection into the caudal thigh muscle. IM injection volume will be 20 µl (=1 µg) per injections site into each of the left and right caudal thigh muscle."
doc=nlp(txt)




In [6]:
roots = [tok for tok in doc if tok.dep_ == "ROOT"]
doc[6]

to

In [90]:
for c in doc[5].children:
    print(c.text+":")
    print('\t'+c.pos_)
    print('\t'+c.dep_)
    print('\t'+c.tag_)
    print(c.i)

file:
	NOUN
	nsubjpass
	NN
2
can:
	AUX
	aux
	MD
3
be:
	AUX
	auxpass
	VB
4
to:
	ADP
	prep
	IN
6
upon:
	SCONJ
	prep
	IN
9
.:
	PUNCT
	punct
	.
11


In [106]:
# def past_tense_root(word_list, root_verb):
word_list = [tok.text for tok in doc]
root_verb = doc[5]
delete_inds = []
for root_verb in roots:
    children = root_verb.children
    aux = [tok for tok in root_verb.children if 'aux' in tok.dep_]
    subj = [tok for tok in root_verb.children if 'subj' in tok.dep_]

    subj_plural = ['S' in tok.tag_ for tok in subj]
    print(subj_plural[-1])
    for a in aux:
        if a.tag_ == "MD":
            if subj_plural[-1]:
                word_list[a.i] = "were"
            else:
                word_list[a.i] = "was"
        else:
            delete_inds.append(a.i)
            
for ind in delete_inds:
    del word_list(ind)
new_txt = " ".join(word_list)
# cases to account for - no subj identified 
    

SyntaxError: cannot delete function call (2853796180.py, line 22)

In [105]:
new_txt

'The Prism file was provided to Marshall Gerstein upon request .   Additional post - hoc analyses can were by Melior upon request for additional cost .  '

In [25]:
def word_list_replace(list, old, new):
    for ind,l in enumerate(list):
        if l == old:
            print(l)
            print(old)
            list[ind] = new
    
    return list

In [26]:
import spacy
import pyinflect
nlp = spacy.load("en_core_web_sm")
sent = txt
doc=nlp(sent)

roots = [tok for tok in doc if tok.dep_ == "ROOT"]
word_list = [tok.text for tok in doc]
print(word_list)
for r in roots:
    
    aux = [x for x in r.children if 'aux' in x.dep_]
    subj = [y.tag_ for y in r.children if 'subj' in y.dep_]

    if len(aux) > 0 & len(subj) > 0:

        if subj[-1] == "NNS":
            word_list = word_list_replace(word_list, aux[0].text, 'were')
            print(word_list)
            # txt2 = txt2.replace(aux[0].text, 'were')
        else:
            print('singular')
            word_list = word_list_replace(word_list, aux[0].text, 'was')
            # txt2 =txt2.replace(aux[0].text+' ', 'was')
            print(word_list)
        
        for a in aux[1:]:
            word_list =word_list_replace(word_list, a.text, '')
            print(word_list)
                # txt2 = txt2.replace(a.text+ ' ', '')
    
    # txt2 = txt2.replace(r.text, r._.inflect('VBD'))
    word_list = word_list_replace(word_list, r.text, r._.inflect('VBD'))
    print(word_list)
    txt2 = ' '.join(word_list)


['Bioluminescent', 'imaging', 'will', 'be', 'performed', 'on', 'all', 'animals', 'on', 'at', 'either', '4', ',', '8', ',', 'and', '24', 'hours', 'after', 'TA', 'administration', '.', ' ', 'Animals', 'will', 'be', 'anesthetized', 'using', 'isoflurane', 'inhalation', '.', ' ', 'Imaging', 'will', 'be', 'conducted', 'using', 'a', 'Perkin', 'Elmer', 'IVIS', '®', 'Lumina', 'III', 'LT', 'according', 'to', 'manufactures', 'recommendations', 'and', 'signal', 'analyzed', 'using', 'Perkin', 'Elmer', 'Living', 'Image', '®', '4.7.4', 'software', '.', '  ', 'D', '-', 'Luciferin', 'at', 'concentration', 'of', '15', 'mg', '/', 'mL', 'will', 'be', 'used', 'for', 'RO', 'injection', '.', 'The', 'imaging', 'time', 'will', 'be', '1', 'min', 'post-', 'D', '-', 'Luciferin', 'injection', '.']
performed
performed
['Bioluminescent', 'imaging', 'will', 'be', 'performed', 'on', 'all', 'animals', 'on', 'at', 'either', '4', ',', '8', ',', 'and', '24', 'hours', 'after', 'TA', 'administration', '.', ' ', 'Animals', '

In [11]:
[print(a) for a in doc[4].rights]

[]

In [27]:
txt2

'Bioluminescent imaging will was performed on all animals on at either 4 , 8 , and 24 hours after TA administration .   Animals will was anesthetized using isoflurane inhalation .   Imaging will was conducted using a Perkin Elmer IVIS ® Lumina III LT according to manufactures recommendations and signal analyzed using Perkin Elmer Living Image ® 4.7.4 software .    D - Luciferin at concentration of 15 mg / mL will was used for RO injection . The imaging time will was 1 min post- D - Luciferin injection .'

In [184]:
for tok in doc:
    print(tok.dep_)

compound
compound
compound
nsubjpass
aux
auxpass
ccomp
prep
prep
advmod
amod
pobj
punct
appos
cc
conj
punct
npadvmod
punct
punct
dep
nummod
amod
nsubjpass
acl
nummod
punct
prep
amod
compound
dobj
prep
advmod
nummod
compound
pobj
aux
auxpass
ROOT
prep
compound
pobj
punct
dep
nsubjpass
aux
auxpass
ROOT
prep
compound
pobj
prep
advmod
advmod
nummod
compound
pobj
prep
pobj
punct


In [74]:
txt

'Randomization: animals will be assigned randomly to treatment groups. Animals will not be distributed into treatment groups according to post-surgery pre-dose responses.'

In [118]:
for label in nlp.get_pipe("parser").labels:
    print(label, " -- ", spacy.explain(label))

ROOT  --  root
acl  --  clausal modifier of noun (adjectival clause)
acomp  --  adjectival complement
advcl  --  adverbial clause modifier
advmod  --  adverbial modifier
agent  --  agent
amod  --  adjectival modifier
appos  --  appositional modifier
attr  --  attribute
aux  --  auxiliary
auxpass  --  auxiliary (passive)
case  --  case marking
cc  --  coordinating conjunction
ccomp  --  clausal complement
compound  --  compound
conj  --  conjunct
csubj  --  clausal subject
csubjpass  --  clausal subject (passive)
dative  --  dative
dep  --  unclassified dependent
det  --  determiner
dobj  --  direct object
expl  --  expletive
intj  --  interjection
mark  --  marker
meta  --  meta modifier
neg  --  negation modifier
nmod  --  modifier of nominal
npadvmod  --  noun phrase as adverbial modifier
nsubj  --  nominal subject
nsubjpass  --  nominal subject (passive)
nummod  --  numeric modifier
oprd  --  object predicate
parataxis  --  parataxis
pcomp  --  complement of preposition
pobj  --  ob

