In [1]:
import fitz
import string
import pandas as pd
from difflib import SequenceMatcher

In [2]:
# Relative path of the source PDF.
# NOTE(review): presumably the document handed to ExtractionTool — confirm
# against the cell that instantiates it.
filepath_full = "../Data/REP-EDC-2020_Fusion_Final.pdf"

In [51]:
class ExtractionTool:
    """Extract organisation and foundation records from a PDF.

    Usage: build the tool with the path of the PDF, call
    ``openPDFasTextDict`` to load the per-page text dictionaries into
    ``self.text_dict``, then call ``extractFromTextDict`` to parse them.
    ``extractFromTextDict`` reads ``self.text_dict``, which is only set by
    ``openPDFasTextDict`` (or by the caller directly).
    """

    # Translation table deleting every character of ``string.punctuation``
    # (ASCII punctuation only); built once instead of on every call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, filepath):
        """Remember the path of the PDF; the file is not opened here."""
        self.filepath = filepath

    def openPDFasTextDict(self):
        """
        Load every page of the PDF as a ``get_text("dict")`` structure.

        The list (one dictionary per page, in document order) is stored in
        ``self.text_dict``; nothing is returned.
        """
        with fitz.open(self.filepath) as doc:
            pages = [page.get_text("dict", sort=False) for page in doc]
        self.text_dict = pages

    @staticmethod
    def removePuncandSpace(text):
        """
        Delete ASCII punctuation, then strip leading/trailing whitespace.

        Whitespace *inside* the text is kept (``" Site web: "`` becomes
        ``"Site web"``).  Used by extractFromTextDict to turn field labels
        into dictionary keys.
        """
        return text.translate(ExtractionTool._PUNCTUATION_TABLE).strip()

    def extractFromTextDict(self, bannedStrings):
        """
        Walk every text span of ``self.text_dict`` and build the records.

        Spans are classified purely by their font name and size:

        * ``Helvetica-Bold``, int(size) == 11: organisation header.  Text
          that parses as an integer starts a new organisation; any other
          text becomes the ``Name`` of the current organisation.
        * ``Helvetica``, round(size) == 9: organisation address.
        * ``ArialNarrow``, int(size) == 8: organisation field name.
        * ``Helvetica-Bold``, round(size) == 8: organisation field text.
        * ``ArialNarrow``, round(size) == 7, text starting with
          "L'entreprise possède un": start of a foundation section, which
          lasts until the next organisation header.

        Parameters
        ----------
        bannedStrings : container of str
            A span is ignored when its stripped text is ``in`` this
            container.

        Returns
        -------
        tuple(list of dict, list of dict)
            ``(org_list, foundation_list)``.  Every record holds ``id`` and
            ``isFoundation`` plus whatever ``Name``, ``Address`` and field
            keys were found.  A foundation's ``id`` is the number of the
            organisation it follows, or ``None`` when no organisation
            number has been seen yet.
        """
        org_list = []
        foundation_list = []
        org_id = -1
        foundation_id = -1
        # Defined up front so a foundation marker met before any
        # organisation number cannot hit an unbound local.
        org_number = None
        charitable_foundation = False
        start_foundation = False
        # Line index (within its block) of the current foundation's name;
        # None until that name has been seen.
        name_line = None

        for page in self.text_dict:
            for block in page["blocks"]:
                # Image blocks in the "dict" output carry no "lines" key;
                # treat them as empty instead of raising KeyError.
                for line_idx, line in enumerate(block.get("lines", [])):
                    for span in line["spans"]:
                        text = span['text']

                        # Remove empty text
                        if text.isspace():
                            continue
                        # Skip if trash text
                        if text.strip() in bannedStrings:
                            continue

                        font = span['font']
                        size = float(span['size'])
                        size_floor = int(size)
                        size_round = round(size)

                        ### Organizations ####
                        if font == 'Helvetica-Bold' and size_floor == 11:
                            # Any organisation header ends a foundation section.
                            charitable_foundation = False
                            try:
                                int(text)
                            except ValueError:
                                # Not a number: name of the current
                                # organisation.  Ignored when there is no
                                # organisation to attach it to.
                                if org_id >= 0:
                                    org_list[org_id]['Name'] = text.strip()
                            else:
                                # A number: a new organisation starts here.
                                org_number = text.strip()
                                org_list.append({'id': org_number,
                                                 'isFoundation': 'No'})
                                org_id += 1

                        if not charitable_foundation:
                            # The three fonts below are mutually exclusive,
                            # so at most one branch runs per span.
                            if font == 'Helvetica' and size_round == 9:
                                # Address (round() is used because other
                                # Helvetica text has a size rounding to 8).
                                if org_id < 0:
                                    continue
                                org = org_list[org_id]
                                if 'Address' not in org:
                                    # First chunk is stored unstripped.
                                    org['Address'] = text
                                else:
                                    org['Address'] += text.strip()

                            elif font == 'ArialNarrow' and size_floor == 8:
                                # Field name
                                if org_id < 0:
                                    continue
                                org = org_list[org_id]
                                key = ExtractionTool.removePuncandSpace(text)
                                if key not in org and len(key) > 1:
                                    org[key] = ''
                                elif key in org:
                                    # Repeated field: "2 - key", "3 - key",
                                    # ... so that an earlier repeat is
                                    # never reset.
                                    occurrence = 2
                                    while f"{occurrence} - {key}" in org:
                                        occurrence += 1
                                    org[f"{occurrence} - {key}"] = ''

                            elif font == 'Helvetica-Bold' and size_round == 8:
                                # Field text
                                if org_id < 0:
                                    continue
                                org = org_list[org_id]
                                # Appended to the most recently created
                                # key, whichever it is (non-generalizable).
                                org[list(org)[-1]] += text

                        ### Foundations ####
                        if (font == 'ArialNarrow' and size_round == 7
                                and text.startswith("L'entreprise possède un")):
                            charitable_foundation = True
                            # Name and address come first in a foundation
                            # section; start_foundation stays True until
                            # the first field name is met.
                            start_foundation = True
                            # Forget the previous foundation's name line.
                            name_line = None
                            foundation_list.append({'id': org_number,
                                                    'isFoundation': 'Yes'})
                            foundation_id += 1

                        if charitable_foundation:
                            # charitable_foundation is only set together
                            # with an append above, so this index is valid.
                            foundation = foundation_list[foundation_id]

                            if start_foundation and font == 'Helvetica-Bold':
                                # Foundation name: first bold span of size >= 9.
                                if size_round >= 9 and 'Name' not in foundation:
                                    foundation['Name'] = text
                                    name_line = line_idx

                                # Address: bold spans of size >= 8 on a line
                                # after the name's line.
                                # NOTE(review): line indices restart in every
                                # block, so the comparison only holds within
                                # the name's block - confirm on real data.
                                if (size_round >= 8 and name_line is not None
                                        and line_idx > name_line):
                                    if 'Address' not in foundation:
                                        foundation['Address'] = text.strip()
                                    else:
                                        foundation['Address'] += ' ' + text.strip()

                            # Field name (checked outside of start_foundation)
                            if font in ('ArialNarrow', 'Helvetica') and size_floor == 8:
                                # First non-address text ends the name/address part.
                                start_foundation = False
                                key = ExtractionTool.removePuncandSpace(text)
                                if key not in foundation and len(key) > 1:
                                    foundation[key] = ''

                            # Field text, only once past the address
                            if (not start_foundation
                                    and font in ('ArialNarrow,Bold', 'Helvetica-Bold')
                                    and size_round == 8):
                                # Appended to the most recently created key.
                                foundation[list(foundation)[-1]] += text

        return org_list, foundation_list
