In [1]:
# Import
import glob
import os
import docx
from docx.document import Document
from docx import Document as Dcmnt
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
import pandas as pd, numpy as np
import re
from data_preparation import int_to_roman
import mammoth
from bs4 import BeautifulSoup
from thefuzz import fuzz

# <u>Paths</u>

In [2]:
# PDF folder
#path_pdf = '/rwi/users/schettiath/SOP TO/01 Raw/01 PDFs/'

# Root of the raw-data tree; every stage folder below sits directly under it
RAW_ROOT = '/rwi/users/schettiath/SOP TO/01 Raw/'

# Batch sub-folder shared by all four stage folders. The trailing slash is required
# because file names are appended to these paths by plain string concatenation.
# Earlier runs used:
#   'Completed (Summary Created 2022-10-12)/'  -> originally used for pdfs to docxs
#   ''                                         -> used for raw docxs
#                                                 (with path_docx = RAW_ROOT + '00 Raw docx/')
BATCH_DIR = 'Completed (Summary Created 2022-12-07)/'  # Used for POL 162, 168 & 229

# docx files with headers and footers
path_docx = RAW_ROOT + '02 docx with Header Footer/' + BATCH_DIR

# docx files without headers and footers
path_docx_no_head_foot = RAW_ROOT + '03 docx No Header Footer/' + BATCH_DIR

# Unclean txts
path_unclean_txt = RAW_ROOT + '04 Unclean txt/' + BATCH_DIR

# Clean txts
path_clean_txt = RAW_ROOT + '05 Clean txt/' + BATCH_DIR

# <u>Remove Headers and Footers from docx</u>

In [3]:
# Collect the bare file name of every '.docx' found directly inside 'path_docx'
names = list(map(os.path.basename, glob.glob(path_docx + '*.docx')))

# Save a copy of each document with its headers & footers removed
for name in names:
    doc = docx.Document(path_docx + name)
    for section in doc.sections:
        section.different_first_page_header_footer = False
        # True on both parts; use False for a part that should be kept
        for part in (section.header, section.footer):
            part.is_linked_to_previous = True
    doc.save(path_docx_no_head_foot + name)

# <u>docx to txt</u>

In [8]:
# Stop words: the 'Stop_Words' column of the custom stop-word file, as a list
stop_words = list(
    pd.read_csv('/rwi/users/schettiath/MA Insights/custom_stopword_list_nltk_spacy_scikit_wordcloud_gensim.csv')['Stop_Words']
)

# Division names together with their acronyms
div_names_acronyms = pd.read_csv('/rwi/users/schettiath/Master Definition List/Astellas_Division_Names_Acronyms.csv')

# Master Definition List (MDL) abbreviations; rows containing a NaN are dropped
mdl = pd.read_csv('/rwi/users/schettiath/Master Definition List/Master Definition List.csv').dropna()
print('='*50, 'Initial length of MDL:', len(mdl))

# Names, with '.docx' removed, of all '.docx' documents inside 'path_docx_no_head_foot'
names = [
    os.path.basename(docx_path).replace('.docx', '')
    for docx_path in glob.glob(path_docx_no_head_foot + '*.docx')
]

# Loop through files
for name in names:
    print(name)
    doc = docx.Document(path_docx_no_head_foot + name + '.docx')

    # Convert each paragraph to text
    para_text_list = [para.text.strip() for para in doc.paragraphs]

    # Extract text from tables: one string per table, built row by row from the
    # non-empty cells, with newlines inside a cell turned into spaces.
    # The rows are joined in plain Python. The earlier DataFrame/.str round trip raised
    # on single-column tables (squeeze() reduced the row to a scalar), and its bare
    # `except` fallback could then hand a NaN to ' '.join.
    extracted_table_strings = []
    for table in doc.tables:
        row_strings = []
        for row in table.rows:
            cell_texts = [cell.text.replace('\n', ' ') for cell in row.cells if cell.text]
            row_strings.append(' '.join(cell_texts))
        extracted_table_strings.append(' '.join(row_strings).strip())
    
    # Extract the block types of the entire document
    def iter_block_items(parent):
        """
        Walk the direct children of *parent* in document order and yield a
        Paragraph for every paragraph element and a Table for every table element.
        *parent* must be a Document (its body is walked) or a _Cell (its cell
        element is walked); any other type raises ValueError. Children that are
        neither paragraphs nor tables are skipped.
        """
        if not isinstance(parent, (Document, _Cell)):
            raise ValueError("something's not right")

        container = parent.element.body if isinstance(parent, Document) else parent._tc

        for node in container.iterchildren():
            if isinstance(node, CT_P):
                yield Paragraph(node, parent)
                continue
            if isinstance(node, CT_Tbl):
                yield Table(node, parent)

    # Type name of every top-level block, in document order
    block_types = []
    for block in iter_block_items(doc):
        block_types.append(str(type(block)))

    # Positions of the paragraph blocks and of the table blocks
    text_indices = []
    table_indices = []
    for position, type_name in enumerate(block_types):
        if type_name == "<class 'docx.text.paragraph.Paragraph'>":
            text_indices.append(position)
        elif type_name == "<class 'docx.table.Table'>":
            table_indices.append(position)

    # Start from the type names and overwrite each slot with the matching paragraph
    # or table text, so both kinds end up interleaved in document order
    para_text_table_list = list(block_types)
    for position, paragraph_text in zip(text_indices, para_text_list):
        para_text_table_list[position] = paragraph_text
    for position, table_text in zip(table_indices, extracted_table_strings):
        para_text_table_list[position] = table_text

    # Make some substitutions & strip.
    # All patterns are raw strings: the regex text is unchanged, but '\.', '\s' and '\('
    # no longer depend on Python passing unknown string escapes through (which warns).
    # Every string goes through the same ordered steps:
    #   1. blank out dot leaders + page numbers, 'NOTE:', newlines/tabs, bullets and
    #      the TABLE OF CONTENTS / SIGNATURE PAGE titles; lower-case 'AND'; squeeze spaces
    #   2. temporarily rewrite 'i.e.' / 'e.g.' as 'ie.' / 'eg.' so step 3 leaves them alone
    #   3. blank out list numbering ('A.', 'IV.', '1.2.3', '1.', ...), 'N/A' and 'NA'
    #   4. restore 'i.e.' / 'e.g.' and add the comma after them
    #   5. normalise the ICMJE spelling and squeeze spaces again
    ie_eg_protect_steps = [
        (r'^i\.e\.', 'ie.'), (r'\si\.e\.', ' ie.'), (r'\(i\.e\.', '(ie.'),
        (r'^e\.g\.', 'eg.'), (r'\se\.g\.', ' eg.'), (r'\(e\.g\.', '(eg.'),
    ]
    ie_eg_restore_steps = [
        (r'^ie\.', 'i.e.'), (r'\sie\.', ' i.e.'), (r'\(ie\.', '(i.e.'),
        (r'^eg\.', 'e.g.'), (r'\seg\.', ' e.g.'), (r'\(eg\.', '(e.g.'),
        (r'i\.e\.\s', 'i.e., '), (r'e\.g\.\s', 'e.g., '),
    ]
    numbering_pattern = r'^[A-Za-z]\.|^[IVXL]+\.|^[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}|^[0-9]{1,2}\.[0-9]{1,2}|^[0-9]{1,2}\.|\s[A-Za-z]\.|\s[IVXL]+\.|\s[0-9]{1,2}\.|[IVXL]+:|N/A|NA'

    cleaned_texts = []
    for para_text_tab in para_text_table_list:
        para_text_tab = re.sub(r'\.+[0-9]{1,2}|NOTE:|\n|\t|•|TABLE\sOF\sCONTENTS|SIGNATURE\sPAGE', ' ', para_text_tab)
        para_text_tab = re.sub('AND', 'and', para_text_tab)
        # Strip here so the '^' anchored patterns below see the first real character
        para_text_tab = re.sub(r'\s+', ' ', para_text_tab).strip()

        for pattern, replacement in ie_eg_protect_steps:
            para_text_tab = re.sub(pattern, replacement, para_text_tab)

        para_text_tab = re.sub(numbering_pattern, ' ', para_text_tab)

        for pattern, replacement in ie_eg_restore_steps:
            para_text_tab = re.sub(pattern, replacement, para_text_tab)

        para_text_tab = re.sub('I[CMJ]{3}E', 'ICMJE', para_text_tab)
        para_text_tab = re.sub(r'\s+', ' ', para_text_tab)

        # The length test runs before the final strip (as it always has), so a string
        # that is a single space is kept and ends up as ''
        if len(para_text_tab) > 0:
            cleaned_texts.append(para_text_tab.strip())
    para_text_table_list = cleaned_texts
    
    # Remove spaces inside the parenthesis but on the edges of the string,
    # e.g. '( abc )' -> '(abc)'.
    # The substitution is done per match with a replacement function. Previously each
    # matched text was reused as a regex pattern, which turned its parentheses into a
    # group: '( )' removed every space, '( a )' rewrote every ' a ', regex
    # metacharacters inside the parentheses were interpreted, and replacements
    # collected from one paragraph were applied to all the others.
    para_text_table_list_2 = [
        re.sub(r'\((.+?)\)', lambda found: '(' + found.group(1).strip() + ')', para_text_tab)
        for para_text_tab in para_text_table_list
    ]

    # For texts holding a parenthesised group, put a space in front of every '(' and
    # behind every ')'; afterwards squeeze runs of whitespace to one space and strip
    para_text_table_list_3 = []
    for text in para_text_table_list_2:
        if re.search(r'\(.+?\)', text):
            text = text.replace('(', ' (').replace(')', ') ')
        para_text_table_list_3.append(re.sub(r'\s+', ' ', text).strip())
    
    # Spell out the 'y (ies)' shorthand: 'party (ies)' becomes 'party or parties'
    para_text_table_list_4 = []
    for text in para_text_table_list_3:
        if re.search(r'y\s\(ies\)', text):
            # Glue the suffix onto its word so both land in the same token
            tokens = re.sub(r'y\s\(ies\)', 'y(ies)', text).split()
            for position, token in enumerate(tokens):
                if re.search(r'y\(ies\)', token):
                    stem = re.findall(r'([A-Za-z]+)y\(ies\)', token)[0]
                    tokens[position] = stem + 'y or ' + stem + 'ies'
            text = ' '.join(tokens)
        para_text_table_list_4.append(text)
    
    # Spell out the ' (s)' shorthand: 'document (s)' becomes 'document or documents'
    para_text_table_list_5 = []
    for text in para_text_table_list_4:
        if re.search(r'[A-Za-z]\s\(s\)', text):
            # Glue the suffix onto its word so both land in the same token
            tokens = re.sub(r'\s\(s\)', '(s)', text).split()
            for position, token in enumerate(tokens):
                if re.search(r'[A-Za-z]\(s\)', token):
                    stem = re.findall(r'([A-Za-z]+)\(s\)', token)[0]
                    tokens[position] = stem + ' or ' + stem + 's'
            text = ' '.join(tokens)
        para_text_table_list_5.append(text)
    
    # Summary statistics on para_text_table_list_5.
    # Tokens containing a section-title word are ignored. Every other token with at
    # least two capital letters is recorded; it additionally counts as an abbreviation
    # candidate when it is all capitals, or has lower-case letters and a
    # capitals/lower-case ratio above 0.25, or has no lower-case letters and more than
    # 40% capitals.
    section_title_words = 'PURPOSE|SCOPE|DEFINITION|INTRODUCTION|OVERVIEW|BACKGROUND|GENERAL|PRINC[A-Z]+|ROLES|PROCEDURE|DOCUMENT|HISTORY|REFERENCES|ATTACHMENTS'
    all_elem_with_2_cap_letters = []
    initial_abbrevs = []
    for text in para_text_table_list_5:
        for token in text.split():
            if re.search(section_title_words, token):
                continue
            n_caps = len(re.findall('[A-Z]', token))
            if n_caps < 2:
                continue
            all_elem_with_2_cap_letters.append(token)
            n_lower = len(re.findall('[a-z]', token))
            caps_share = n_caps/len(token)
            if caps_share >= 1:
                is_candidate = True
            elif n_lower > 0:
                is_candidate = n_caps/n_lower > 0.25
            else:
                is_candidate = caps_share > 0.4
            if is_candidate:
                initial_abbrevs.append(token)
    print('='*50, 'all_elem_with_2_cap_letters:', len(all_elem_with_2_cap_letters))
    for token in all_elem_with_2_cap_letters:
        print(token)
    print('#'*50, 'DONE')
    print('='*50, 'initial_abbrevs:', len(initial_abbrevs))
    for token in initial_abbrevs:
        print(token)
    print('#'*50, 'DONE')
    
    # 2022-12-05: Evaluate if this step is needed or if it can be deleted
    # Drop '(...)' tokens whose content is purely numeric or contains an '@'
    para_text_table_list_6 = []
    for text in para_text_table_list_5:
        if re.search(r'\([^\s]+\)', text):
            kept_tokens = []
            for token in text.split():
                if re.search(r'\([^\s]+\)', token):
                    inner = re.findall(r'\(([^\s]+)\)', token)[0]
                    if inner.isnumeric() or re.search('@', inner):
                        continue
                kept_tokens.append(token)
            text = ' '.join(kept_tokens)
        para_text_table_list_6.append(text)

    # Summary statistics on para_text_table_list_6.
    # Tokens containing a section-title word are ignored. A remaining token with at
    # least two capital letters counts as an abbreviation candidate when it is all
    # capitals, or has lower-case letters and a capitals/lower-case ratio above 0.25,
    # or has no lower-case letters and more than 40% capitals.
    section_title_words = 'PURPOSE|SCOPE|DEFINITION|INTRODUCTION|OVERVIEW|BACKGROUND|GENERAL|PRINC[A-Z]+|ROLES|PROCEDURE|DOCUMENT|HISTORY|REFERENCES|ATTACHMENTS'
    abbrevs_list = []
    for text in para_text_table_list_6:
        for token in text.split():
            if re.search(section_title_words, token):
                continue
            n_caps = len(re.findall('[A-Z]', token))
            if n_caps < 2:
                continue
            n_lower = len(re.findall('[a-z]', token))
            caps_share = n_caps/len(token)
            if caps_share >= 1:
                is_candidate = True
            elif n_lower > 0:
                is_candidate = n_caps/n_lower > 0.25
            else:
                is_candidate = caps_share > 0.4
            if is_candidate:
                abbrevs_list.append(token)
    print('='*50, 'abbrevs_minus_nums_minus_at:', len(abbrevs_list))
    for token in abbrevs_list:
        print(token)
    print('#'*50, 'DONE')
        
    # Replace matching acronyms within the active doc with their long forms
    # For every token of the form '(XYZ)', the words in front of it are compared with the
    # capital letters/digits of the acronym; when every letter starts the matching word,
    # the long form is written where the parenthesised token was and the original words
    # are removed. Results:
    #   complete_acronyms_list / complete_terms_list - acronyms found and their long forms
    #   para_text_table_list_7                       - the rewritten texts
    complete_acronyms_list = []
    complete_terms_list = []
    para_text_table_list_7 = []
    for para_text_tab in para_text_table_list_6:  # Test to confirm that all parentheses containing anything but spaces only house abbreviations
        if re.search('\([^\s]+\)', para_text_tab):
            temp_split_string = para_text_tab.split()
            acronym_idx_and_replacement = []  # (token index of '(XYZ)', long form) pairs
            to_delete_idx_list = []  # token indices of the long-form words to drop
            for idx, elem in enumerate(temp_split_string):
                if re.search('\([^\s]+\)', elem):
                    acronym = re.findall('\(([^\s]+)\)', elem)[0]
                    # The number of capital letters/digits sets how many preceding words are inspected
                    cap_letters_nums_list = re.findall('[A-Z0-9]', acronym)
                    len_cap_letters_nums_list = len(cap_letters_nums_list)
                    acronym_words_idx = []
                    acronym_words = []
                    # First attempt: the N tokens directly in front of the acronym.
                    # NOTE(review): idx - num is negative when fewer than N tokens precede the
                    # acronym, and Python then indexes from the end of the list - confirm intended.
                    # NOTE(review): with no capital letter/digit in the acronym this loop does not
                    # run, both lists stay empty and the length check below passes with an empty
                    # long form - confirm intended.
                    for num in range(len_cap_letters_nums_list, 0, -1):
                        idx_2 = idx - num
                        acronym_words_idx.append(idx_2)
                        acronym_words.append(temp_split_string[idx_2])
                    # One True per letter/digit that starts its word (letters compared
                    # case-insensitively, digits as-is); failures add nothing
                    check_list = []
                    for letter_or_num, term in zip(cap_letters_nums_list, acronym_words):
                        if not re.search('[0-9]', letter_or_num):
                            letter_or_num = letter_or_num.lower()
                            lower_term = term.lower()
                            reg_exp = '^' + letter_or_num
                            if re.search(reg_exp, lower_term):
                                check_list.append(True)
                        else:
                            reg_exp = '^' + letter_or_num
                            if re.search(reg_exp, term):
                                check_list.append(True)
                    # Equal lengths means every inspected word matched its letter
                    if len(acronym_words) == len(check_list):
                        acronym_idx_and_replacement.append((idx, ' '.join(acronym_words)))
                        to_delete_idx_list = to_delete_idx_list + acronym_words_idx
                        complete_acronyms_list.append(acronym)
                    else:
                        # Second attempt: same test on the last N tokens before the acronym
                        # once tokens found in stop_words are left out
                        temp_split_string_subset = temp_split_string[:idx]
                        temp_split_string_subset_2 = [string_elem for string_elem in temp_split_string_subset if string_elem not in stop_words]
                        acronym_words = temp_split_string_subset_2[-len_cap_letters_nums_list:]
                        check_list = []
                        for letter_or_num, term in zip(cap_letters_nums_list, acronym_words):
                            if not re.search('[0-9]', letter_or_num):
                                letter_or_num = letter_or_num.lower()
                                lower_term = term.lower()
                                reg_exp = '^' + letter_or_num
                                if re.search(reg_exp, lower_term):
                                    check_list.append(True)
                            else:
                                reg_exp = '^' + letter_or_num
                                if re.search(reg_exp, term):
                                    check_list.append(True)
                        if len(acronym_words) == len(check_list):
                            acronym_idx_and_replacement.append((idx, ' '.join(acronym_words)))                                
                            # Map each matched word back to its token index: searching the reversed
                            # prefix finds the occurrence closest to the acronym, and the next three
                            # steps turn that reversed position into a forward index
                            temp_split_string_subset.reverse()
                            acronym_words_idx = [temp_split_string_subset.index(term) for term in acronym_words]
                            acronym_words_idx = [ac_w_idx + 1 for ac_w_idx in acronym_words_idx]
                            acronym_words_idx = [ac_w_idx*-1 for ac_w_idx in acronym_words_idx]
                            acronym_words_idx = [ac_w_idx + len(temp_split_string_subset) for ac_w_idx in acronym_words_idx]
                            # Delete the whole span from the first to the last matched word, so the
                            # tokens in between are removed as well
                            acronym_words_idx = [val for val in range(acronym_words_idx[0], acronym_words_idx[-1] + 1)]
                            to_delete_idx_list = to_delete_idx_list + acronym_words_idx
                            complete_acronyms_list.append(acronym)
            if len(acronym_idx_and_replacement) > 0:
                # Put each long form where its '(XYZ)' token was, then drop the original words
                for tup in acronym_idx_and_replacement:
                    complete_terms_list.append(tup[1])
                    temp_split_string[tup[0]] = tup[1]
                temp_split_string = list(np.delete(temp_split_string, to_delete_idx_list))
            para_text_tab = ' '.join(temp_split_string)
        para_text_table_list_7.append(para_text_tab)  
    # Replace the remaining stand-alone uses of every in-document acronym with its long form.
    # An acronym counts as stand-alone when it is not preceded by a letter and not followed
    # by a letter other than 's' (so the plural 'XYZs' is expanded and keeps its 's').
    # The boundaries are lookarounds, so only the acronym itself is replaced. The earlier
    # pattern matched the neighbouring characters too and re.sub deleted them
    # ('the XYZ is' -> 'the<long form>is'); it also could not match a text consisting of
    # the acronym alone. The acronym is escaped because it may hold regex metacharacters,
    # and the long form is returned from a function so that it is inserted literally.
    acronym_patterns = [
        (re.compile('(?<![A-Za-z])' + re.escape(acronym) + '(?![A-Za-rt-z])'), term)
        for acronym, term in zip(complete_acronyms_list, complete_terms_list)
    ]
    para_text_table_list_8 = []
    for para_text_tab in para_text_table_list_7:
        for pattern, term in acronym_patterns:
            para_text_tab = pattern.sub(lambda _found: term, para_text_tab)
        para_text_table_list_8.append(para_text_tab)
    print('='*50, 'acronym_and_replacement')
    for x, y in zip(complete_acronyms_list, complete_terms_list):
        print(x, '-----', y)
    
    # Add in document acronyms to MDL
    # Acronyms the MDL already holds are only reported; they are still added below.
    mdl_acronyms = list(mdl['Acronym/Abbreviation'])
    for x in complete_acronyms_list:
        if x in mdl_acronyms:
            print('='*50, 'Duplicate Acronym')
            print(x)
            #raise Exception('Acronym already exists in the MDL')
    # Build all new rows first and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame for every single row.
    new_mdl_rows = [
        {'Name': y, 'Acronym/Abbreviation': x}
        for x, y in zip(complete_acronyms_list, complete_terms_list)
    ]
    if new_mdl_rows:
        mdl = pd.concat([mdl, pd.DataFrame(new_mdl_rows)], ignore_index=True, sort=False)
    print('='*50, 'New length of MDL:', len(mdl))
    
    # Summary statistics on para_text_table_list_8.
    # Tokens containing a section-title word are ignored. A remaining token with at
    # least two capital letters counts as an abbreviation candidate when it is all
    # capitals, or has lower-case letters and a capitals/lower-case ratio above 0.25,
    # or has no lower-case letters and more than 40% capitals.
    section_title_words = 'PURPOSE|SCOPE|DEFINITION|INTRODUCTION|OVERVIEW|BACKGROUND|GENERAL|PRINC[A-Z]+|ROLES|PROCEDURE|DOCUMENT|HISTORY|REFERENCES|ATTACHMENTS'
    abbrevs_list = []
    for text in para_text_table_list_8:
        for token in text.split():
            if re.search(section_title_words, token):
                continue
            n_caps = len(re.findall('[A-Z]', token))
            if n_caps < 2:
                continue
            n_lower = len(re.findall('[a-z]', token))
            caps_share = n_caps/len(token)
            if caps_share >= 1:
                is_candidate = True
            elif n_lower > 0:
                is_candidate = n_caps/n_lower > 0.25
            else:
                is_candidate = caps_share > 0.4
            if is_candidate:
                abbrevs_list.append(token)
    print('='*50, 'abbrevs_minus_nums_minus_at_minus_in_doc_acronyms:', len(abbrevs_list))
    for token in abbrevs_list:
        print(token)
    print('#'*50, 'DONE')
    
    # Expand Division acronyms, token by token, to the Division name.
    # An acronym is skipped when the document defined it itself (complete_acronyms_list).
    # It is expanded when the token equals it, or when it sits in the token without a
    # letter before it and without a letter other than 's' after it.
    div_acronyms = list(div_names_acronyms['Acronym'])
    div_names = list(div_names_acronyms['Division'])
    para_text_table_list_9 = []
    for text in para_text_table_list_8:
        tokens = text.split()
        for position, token in enumerate(tokens):
            expanded = token
            for acronym, term in zip(div_acronyms, div_names):
                if not (re.search(acronym, expanded) and acronym not in complete_acronyms_list):
                    continue
                regex = '^' + acronym + '[^A-Za-rt-z]|[^A-Za-z]' + acronym + '$|[^A-Za-z]' + acronym + '[^A-Za-rt-z]'
                if acronym == expanded or re.search(regex, expanded):
                    # Later acronyms are tested against the already expanded token
                    expanded = re.sub(acronym, term, expanded)
                    tokens[position] = expanded
        para_text_table_list_9.append(' '.join(tokens))
    
    # Summary statistics on para_text_table_list_9.
    # Any text holding a token with 'DDR' is printed together with its index (debug aid).
    # Tokens containing a section-title word are ignored. A remaining token with at
    # least two capital letters counts as an abbreviation candidate when it is all
    # capitals, or has lower-case letters and a capitals/lower-case ratio above 0.25,
    # or has no lower-case letters and more than 40% capitals.
    section_title_words = 'PURPOSE|SCOPE|DEFINITION|INTRODUCTION|OVERVIEW|BACKGROUND|GENERAL|PRINC[A-Z]+|ROLES|PROCEDURE|DOCUMENT|HISTORY|REFERENCES|ATTACHMENTS'
    abbrevs_list = []
    for text in para_text_table_list_9:
        for token in text.split():
            if re.search('DDR', token):
                print('='*50, 'DDR', 'index #', para_text_table_list_9.index(text))
                print(text)
            if re.search(section_title_words, token):
                continue
            n_caps = len(re.findall('[A-Z]', token))
            if n_caps < 2:
                continue
            n_lower = len(re.findall('[a-z]', token))
            caps_share = n_caps/len(token)
            if caps_share >= 1:
                is_candidate = True
            elif n_lower > 0:
                is_candidate = n_caps/n_lower > 0.25
            else:
                is_candidate = caps_share > 0.4
            if is_candidate:
                abbrevs_list.append(token)
    print('='*50, 'abbrevs_minus_nums_minus_at_minus_in_doc_acronyms_minus_divs:', len(abbrevs_list))
    for token in abbrevs_list:
        print(token)
    print('#'*50, 'DONE')
    
    # Expand MDL acronyms, token by token, to the MDL name.
    # Skipped: a fixed exclusion list, acronyms the document defined itself and the
    # Division acronyms. An acronym is expanded when the token equals it, or when it sits
    # in the token without a letter before it and without a letter other than 's' after it.
    #mdl.sort_values(by=['Acronym/Abbreviation'], ascending=False, key=lambda x: len(x))  # Confirm this works once pandas==1.1.0
    mdl_terms_list = list(mdl['Name'])
    mdl_acronyms_list = list(mdl['Acronym/Abbreviation'])
    acronyms_to_exclude = ['CDA', 'EC', 'ESI', 'GMS'] + complete_acronyms_list + div_acronyms
    para_text_table_list_10 = []
    for text in para_text_table_list_9:
        tokens = text.split()
        for position, token in enumerate(tokens):
            expanded = token
            for acronym, term in zip(mdl_acronyms_list, mdl_terms_list):
                if acronym in acronyms_to_exclude:
                    continue
                regex = '^' + acronym + '[^A-Za-rt-z]|[^A-Za-z]' + acronym + '$|[^A-Za-z]' + acronym + '[^A-Za-rt-z]'
                if acronym == expanded or re.search(regex, expanded):
                    # Later acronyms are tested against the already expanded token
                    expanded = re.sub(acronym, term, expanded)
                    tokens[position] = expanded
        para_text_table_list_10.append(' '.join(tokens))
        
    # Summary statistics on para_text_table_list_10.
    # Tokens containing a section-title word are ignored. A remaining token with at
    # least two capital letters counts as an abbreviation candidate when it is all
    # capitals, or has lower-case letters and a capitals/lower-case ratio above 0.25,
    # or has no lower-case letters and more than 40% capitals.
    section_title_words = 'PURPOSE|SCOPE|DEFINITION|INTRODUCTION|OVERVIEW|BACKGROUND|GENERAL|PRINC[A-Z]+|ROLES|PROCEDURE|DOCUMENT|HISTORY|REFERENCES|ATTACHMENTS'
    abbrevs_list = []
    for text in para_text_table_list_10:
        for token in text.split():
            if re.search(section_title_words, token):
                continue
            n_caps = len(re.findall('[A-Z]', token))
            if n_caps < 2:
                continue
            n_lower = len(re.findall('[a-z]', token))
            caps_share = n_caps/len(token)
            if caps_share >= 1:
                is_candidate = True
            elif n_lower > 0:
                is_candidate = n_caps/n_lower > 0.25
            else:
                is_candidate = caps_share > 0.4
            if is_candidate:
                abbrevs_list.append(token)
    print('='*50, 'abbrevs_minus_nums_minus_at_minus_in_doc_acronyms_minus_divs_minus_mdl:', len(abbrevs_list))
    for token in abbrevs_list:
        print(token)
    print('#'*50, 'DONE')

    # Remove the 'Doc. Title' repeated underscore pattern
    # Every text holding 'Doc. Title:' or a run of six underscores is marked. When an
    # underscore-only hit lies exactly two positions after the previously marked text,
    # the text between the two is marked as well.
    to_delete_idx_list = []
    for idx, para_text_tab in enumerate(para_text_table_list_10):
        if re.search(r'Doc\.\sTitle:|______', para_text_tab):
            to_delete_idx_list.append(idx)
            if len(to_delete_idx_list) > 1:
                idx_2 = idx - 2
                if not re.search(r'Doc\.\sTitle:', para_text_tab) and to_delete_idx_list[-2] == idx_2:
                    to_delete_idx_list.insert(-1, idx - 1)
    # Filter in plain Python: no NumPy round trip and no bare `except` that would hide
    # an unexpected failure and silently keep the unfiltered list
    positions_to_drop = set(to_delete_idx_list)
    para_text_table_list_10 = [
        para_text_tab
        for position, para_text_tab in enumerate(para_text_table_list_10)
        if position not in positions_to_drop
    ]
    
    # Collect the all-capitals section headers, first occurrence only, in document order.
    # A text qualifies when it starts with 'ATTACHMENTS', or when it has no lower-case
    # letter, does not end in '.', '?' or '!', and both starts and ends with two capitals.
    rslt = []
    for text in para_text_table_list_10:
        if re.search('^ATTACHMENTS', text):
            is_caps_head = True
        else:
            is_caps_head = (
                not re.search(r'[\.\?\!]$|[a-z]', text)
                and bool(re.search('^[A-Z]{2}', text))
                and bool(re.search('[A-Z]{2}$', text))
            )
        if is_caps_head and text not in rslt:
            rslt.append(text)
    caps_sec_heads_list = rslt

    # Clean up all caps section
    caps_sec_heads_list_2 = []
    for head in caps_sec_heads_list:
        idx = para_text_table_list_10.index(head)
        if re.search('ATTACHMENTS', head):
            head = 'ATTACHMENTS'
        if re.search('PROCEDURE', head):
            head = 'PROCEDURES'
        if re.search('ROLES', head):
            head = 'ROLES AND RESPONSIBILITIES'
        head = re.sub("\'|\.", '', head)
        caps_sec_heads_list_2.append(head)
        para_text_table_list_10[idx] = head
        
    # Join & write the unclean string to file; will be used to measure token reduction percentage
    unclean_text_string = ' '.join(para_text_table_list_10)
#     with open(path_unclean_txt + name + '_uncl_ms' + '.txt', 'w', encoding='utf8') as f:
#         f.write(unclean_text_string)
#         f.close()
    
    # Remove selected sections if they exist
    # DEFINITIONS: drop everything from its header up to (not including) the next
    # all-caps header. When DEFINITIONS is the last header there is no next one, so the
    # section is dropped through the end of the text (previously an IndexError).
    if 'DEFINITIONS' in caps_sec_heads_list_2 and 'DEFINITIONS' in para_text_table_list_10:
        definitions_from = para_text_table_list_10.index('DEFINITIONS')
        next_head_pos = caps_sec_heads_list_2.index('DEFINITIONS') + 1
        if next_head_pos < len(caps_sec_heads_list_2):
            definitions_to = para_text_table_list_10.index(caps_sec_heads_list_2[next_head_pos])
        else:
            definitions_to = len(para_text_table_list_10)
        del para_text_table_list_10[definitions_from:definitions_to]

    # ATTACHMENTS / REFERENCES / DOCUMENT HISTORY: drop from the header to the end.
    # Each header is looked up in the text list as it is now, because an earlier
    # truncation may already have removed a later header (previously a ValueError).
    for closing_head in ('ATTACHMENTS', 'REFERENCES', 'DOCUMENT HISTORY'):
        if closing_head in caps_sec_heads_list_2 and closing_head in para_text_table_list_10:
            del para_text_table_list_10[para_text_table_list_10.index(closing_head):]
    
    # Update index & headers list: keep only the headers that are still in the text list
    # after the section removal, together with their current positions.
    # An explicit membership test replaces the bare `except: pass`, which would also
    # have hidden any unrelated error.
    idx_caps_sec_heads = []
    caps_sec_heads_list_3 = []
    for head in caps_sec_heads_list_2:
        if head in para_text_table_list_10:
            idx_caps_sec_heads.append(para_text_table_list_10.index(head))
            caps_sec_heads_list_3.append(head)

    # Add roman numerals & tag all caps headers with hashtags ('###<numeral>. <header>###');
    # numbering starts at 1 and follows the order of the header list
    for number, (position, head) in enumerate(zip(idx_caps_sec_heads, caps_sec_heads_list_3), start=1):
        para_text_table_list_10[position] = '###' + int_to_roman(number) + '. ' + head + '###'
    #print()    


    ##################################
    # Start of docx to html conversion
    ##################################


    # Read in docx again for html conversion; the context manager closes the
    # file handle once mammoth has produced the html (it was previously left open)
    with open(path_docx_no_head_foot + name + '.docx', 'rb') as doc:
        # Convert to html
        html = mammoth.convert_to_html(doc)

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is installed
    # and may warn; left unchanged because switching parsers could alter the markup
    soup = BeautifulSoup(html.value)
    
    # Extract headers that begin with a capital letter or number
    if len(soup.find_all('h2')) > 0:
        # Flatten the prettified html (drop every newline together with the indentation
        # that follows it) and split it on <h1>; the chunk before the first <h1> is discarded
        soup_text = soup.prettify()
        soup_text = re.sub(r'\n +', '', soup_text)
        soup_text = soup_text.split('<h1>')
        soup_text = soup_text[1:]

        num_period_headers_tup_list_final = []
        cap_letter_period_headers_tup_list_final = []
        for sec in soup_text:
            # Only <h1> sections that contain at least one <h2> contribute headers
            if '<h2>' not in sec:
                continue
            sub_sects = sec.split('<h2>')[1:]
            cap_letter_period_headers_list = []
            for sub_sec in sub_sects:
                # Level 2 header: the text directly in front of the first closing </h2>.
                # NOTE(review): raises IndexError if no plain text precedes </h2> - confirm inputs
                cap_letter_period_header = re.findall('([^<>]+)</h2>', sub_sec)[0]
                if re.search(r'^[A-Z]\.', cap_letter_period_header):
                    # Drop an existing "X." prefix; letters are re-assigned below
                    cap_letter_period_header = cap_letter_period_header[2:].strip()
                cap_letter_period_headers_list.append(cap_letter_period_header)
                cap_letter_period_headers_list = [re.sub('&amp;', '&', head) for head in cap_letter_period_headers_list]

                # Level 3 candidates: bold-only list items or <h3> texts. The ('', '')
                # sentinel guarantees that matches[0] exists even when nothing was found.
                matches = re.findall('<ol><li>.{0,70}<strong>([^<>]+)</strong></li>|([^<>]+)</h3>', sub_sec) + [('', '')]
                # Collapse each (group 1, group 2) tuple into the captured text
                matches = [''.join(match) for match in matches if len(match[0]) == 0 or len(match[1]) == 0]
                # Discard the whole candidate list when the first candidate is numbered
                # but its number is not "1."
                if re.search(r'^[0-9]{1,2}\.', matches[0]) and not re.search(r'^1\.', matches[0]):
                    matches = ['']
                match_lens = [len(match) for match in matches]
                if sum(match_lens) > 0:
                    # Document-specific insertion of one header for POL-162
                    if 'POL-162' in name and 'Working with the Media' in matches:
                        idx = matches.index('Working with the Media')
                        matches.insert(idx + 1, 'Payment for Space in a Media Publication/Outlet and Journal Feature Articles')
                    # Strip an existing "N." prefix (only those entries are also whitespace-stripped)
                    matches = [re.sub(r'^[0-9]{1,2}\.', '', match).strip() if re.search(r'^[0-9]{1,2}\.', match) else match for match in matches]
                    matches = [match for match in matches if len(match) > 0]
                    matches = [re.sub('&amp;', '&', match) for match in matches]
                    # Pair every header with its re-numbered "1. ", "2. ", ... form
                    num_period_headers_tup_list = [(match, str(idx + 1) + '. ' + match) for idx, match in enumerate(matches)]
                    num_period_headers_tup_list_final = num_period_headers_tup_list_final + num_period_headers_tup_list
            # Pair every level 2 header with its "A. ", "B. ", ... form (chr(ord('@') + 1) == 'A')
            cap_letter_period_headers_tup_list = [(head, chr(ord('@') + (idx + 1)) + '. ' + head) for idx, head in enumerate(cap_letter_period_headers_list)]
            cap_letter_period_headers_tup_list_final = cap_letter_period_headers_tup_list_final + cap_letter_period_headers_tup_list

        # Lettered headers first, numbered headers after them.
        # (Leftover debug loops that only printed blank lines were removed here.)
        cap_num_period_headers_tup_list = cap_letter_period_headers_tup_list_final + num_period_headers_tup_list_final
    else:
        # Extract headers: the text of every <strong> found inside an <li> of any <ol>.
        # Nested lists can yield the same element more than once; duplicates are
        # removed further down.
        ol_list = soup.find_all('ol')
        li_list = [li for ol in ol_list for li in ol.find_all('li')]
        strong_list = [strong for li in li_list for strong in li.find_all('strong')]
        pure_text_headers = [elem.get_text().strip() for elem in strong_list]

        # Clean headers: drop a leading "A." / "a." or "1." / "12." prefix.
        # Only the leading prefix is removed. The previous split(prefix)[1] cut the
        # header short whenever the prefix text occurred again later on
        # (e.g. '1. Section 21. Foo' became 'Section 2').
        clean_headers = []
        for elem in pure_text_headers:
            prefix = re.match(r'[A-Za-z]\.|[0-9]{1,2}\.', elem)
            if prefix:
                elem = elem[prefix.end():].strip()
            clean_headers.append(elem)

        # Remove duplicates, keeping the first occurrence of each header in order
        clean_headers = list(dict.fromkeys(clean_headers))

        # Remove all caps level 1 headers & add capital letter period pattern to remaining headers.
        # A header starting with five capital letters is dropped and restarts the lettering at "A".
        i = 0
        cap_letter_period_headers_tup_list = []
        for elem in clean_headers:
            if re.search('^[A-Z]{5}', elem):
                i = 0
                continue
            i += 1
            # chr(ord('@') + 1) == 'A', so i = 1, 2, ... gives 'A', 'B', ...
            cap_letter_period_headers_tup_list.append((elem, chr(ord('@') + i) + '. ' + elem))

        # Concatenate (this branch produces no numbered headers)
        num_period_headers_tup_list = []
        cap_num_period_headers_tup_list = cap_letter_period_headers_tup_list + num_period_headers_tup_list


    ##################################
    # End of docx to html conversion
    ##################################

    
    # Locate the level 2 & 3 headers in the paragraph list and swap in their tagged form
    if cap_num_period_headers_tup_list:
        # One (position, '###<tagged header>###') pair for every paragraph whose fuzzy
        # ratio against the plain header text is above 90
        fuzzy_hits = [
            (para_pos, '###' + tagged_head + '###')
            for plain_head, tagged_head in cap_num_period_headers_tup_list
            for para_pos, para_text_tab in enumerate(para_text_table_list_10)
            if fuzz.ratio(plain_head, para_text_tab) > 90
        ]
        idx_cap_num_periods_headers_list = [para_pos for para_pos, tagged in fuzzy_hits]
        cap_num_period_headers_tup_list_2 = [tagged for para_pos, tagged in fuzzy_hits]

        # Abort when the number of hits differs from the number of headers searched for
        if len(cap_num_period_headers_tup_list_2) != len(cap_num_period_headers_tup_list):
            raise Exception('Unequal number of level 3 section headers')

        # Overwrite every matched paragraph with its tagged header
        for para_pos, tagged in zip(idx_cap_num_periods_headers_list, cap_num_period_headers_tup_list_2):
            para_text_table_list_10[para_pos] = tagged
    
    # Join list into string, remove abbreviations
    clean_text_string = ' '.join(para_text_table_list_10)
    
    # Test to find patterns of any abbreviations that were missed
    test_split_string = clean_text_string.split()
    remaining_terms_with_2_cap_letters = []
    for elem in test_split_string:
        matches = re.findall('[A-Z]', elem)
        if len(matches) > 1:
            remaining_terms_with_2_cap_letters.append(elem)
    print('#'*50, 'remaining terms w/at least 2 capital letters:', len(remaining_terms_with_2_cap_letters))
    for elem in remaining_terms_with_2_cap_letters:
        print(elem)

    # Write the clean string to file
#     with open(path_clean_txt + name + '_cl_ms' + '.txt', 'w', encoding='utf8') as f:
#         f.write(clean_text_string)
#         f.close()

# Report the final number of entries in the MDL, then save it as csv without the index column
print('=' * 50, 'Final length of MDL:', len(mdl))
mdl.to_csv(
    '/rwi/users/schettiath/SOP TO/02 Resources/Master Definition List with POL SOP In-Document Acronyms.csv',
    index=False,
)

POL-162
(POL)
POL
(IFPMA)
(EFPIA)
Codes/Regulations/Laws.
POL
(SOP)
(WPD)
POL
(HCPs)
POL
SOP-1921
CP-GL-SO-POL-0034
POL
STL-141
Pan-European
(PECT)
(LC)
UK)
POL
MUST
VISION
Pan-European
(PECT)
PECT
PECT
(CSR)
(CSR)
RandD
CSR
(API)
API
(LC)
Publication/Outlet
POL-168
Codes/Regulations/Laws
Pan-European
POL
VI,
Meetings/Events
POL-229
(POs)
POs
SOP-1255
EMEA
POs.
(KEEs)
(KEEs)
KEEs
POL-229
KEEs
Online/Digital
SOP-1771
EMEA
SOP
SOP-1772
EMEA
SOP
POL-293
EMEA
LinkedIn,
YouTube,
SOP-1921
CP-GL-SO-POL-0034
EMEA
SOP-1921
SOPs
(SOP-1129
SOP-256)
POL
CP-GL-SO-POL-0034
CP-GL-SO-STL-0003-EN
POL-293
EMEA
SOP-2154
EMEA
POL-229
POL-168
SOP-1255
EMEA
SOP-1771
EMEA
SOP
SOP-1772
EMEA
SOP
SOP-1921
SOP-2046
EMEA
################################################## DONE
(POL)
POL
(IFPMA)
(EFPIA)
POL
(SOP)
(WPD)
POL
(HCPs)
POL
CP-GL-SO-POL-0034
POL
STL-141
(PECT)
(LC)
UK)
POL
MUST
VISION
(PECT)
PECT
PECT
(CSR)
(CSR)
RandD
CSR
(API)
API
(LC)
POL-168
POL
VI,
POL-229
(POs)
POs
EMEA
POs.
(KEEs)
(KEEs)
KEEs
POL-2

In [6]:
# Print every entry of the paragraph list, each one below a 50-character separator line
for para_item in para_text_table_list_10:
    print('=' * 50, para_item, sep='\n')

###I. PURPOSE###
The purpose of this Policy (policy) is to describe the principles and minimum standards required when communicating with internal and external stakeholders within the Established Markets region. This policy should be used in conjunction with relevant policies and procedures outlined in the References section at the end of this document.
This policy has been prepared in order to meet the relevant requirements of the International Federation of Pharmaceutical Manufacturers & Associations (IFPMA) and the European Federation Pharmaceutical Industries Associations . All steps must only occur in accordance with Astellas Policies and Procedures and applicable Codes/Regulations/Laws. Where stricter local requirements apply, these always take precedence, and must be adhered to in all circumstances.
The principles within this policy may be used to help develop a country-specific communications Standard Operating Procedure or Working Practice Document .
###II. SCOPE###
This polic