In [12]:
import fitz
import string
import pandas as pd
from difflib import SequenceMatcher

import re
import unidecode


In [2]:
# Input PDFs, given relative to the notebook's working directory.
# (The "-1-5" suffix presumably marks a five-page excerpt of the full file - TODO confirm.)
filepath_1to5 = "REP-EDC-2020_Fusion_Final-1-5.pdf"
filepath_full = "REP-EDC-2020_Fusion_Final.pdf"

In [3]:
def openPDFasTextDict(filepath):
    """
    Read every page of a PDF into PyMuPDF's "dict" text representation.

    Parameters
    ----------
    filepath : path of the PDF, handed to ``fitz.open``.

    Returns
    -------
    list
        One entry per page, in document order; each entry is the value
        returned by ``page.get_text("dict", sort=False)``.
    """
    # The context manager closes the document once all pages have been read.
    with fitz.open(filepath) as pdf:
        return [pdf_page.get_text("dict", sort=False) for pdf_page in pdf]

In [4]:
def removePuncandSpace(text):
    """
    Delete ASCII punctuation from a string and trim surrounding whitespace.

    Only the characters in ``string.punctuation`` are removed; other symbols
    (for example « » or typographic apostrophes) are kept. Whitespace inside
    the string is preserved, only leading/trailing whitespace is stripped.
    """
    # Translation table mapping every ASCII punctuation code point to "delete".
    drop_table = {ord(mark): None for mark in string.punctuation}
    without_punctuation = text.translate(drop_table)
    return without_punctuation.strip()

In [5]:
# Parse both PDFs (excerpt first, then the full document) into per-page text dicts.
textDict_1to5, textDict_full = (
    openPDFasTextDict(filepath=pdf_path)
    for pdf_path in (filepath_1to5, filepath_full)
)

In [6]:
def _next_duplicate_key(record, field_name):
    """Return the first unused key of the form '<i> - <field_name>', starting at i = 2."""
    occurrence = 2
    while f"{occurrence} - {field_name}" in record:
        occurrence += 1
    return f"{occurrence} - {field_name}"


def extractFromTextDict(text_dict, bannedStrings):
    """
    Build organisation and foundation records from per-page text dicts.

    Every span of every line of every block is visited in order, and the
    span's font name and size decide what its text means.

    Organisation spans:
      * 'Helvetica-Bold', size truncating to 11: an integer text starts a
        new organisation (stored under 'id'); any other text is the name of
        the current organisation.
      * 'Helvetica', size rounding to 9: organisation address text.
      * 'ArialNarrow', size truncating to 8: a field label, which becomes a
        new key. A repeated label becomes '2 - label', '3 - label', ...
      * 'Helvetica-Bold', size rounding to 8: a field value, appended to the
        most recently created key of the current organisation.

    Foundation spans:
      * 'ArialNarrow', size rounding to 7, text starting with
        "L'entreprise possède un": opens a foundation record that carries
        the id of the current organisation.
      * Inside a foundation, bold text of rounded size >= 9 is its name and
        bold size-8 text on a later line is its address. This holds until
        the first field label ('ArialNarrow' or 'Helvetica', size
        truncating to 8) is met.
      * After that, 'ArialNarrow,Bold' / 'Helvetica-Bold' size-8 text is
        appended to the most recently created key.

    Parameters
    ----------
    text_dict : list of page dicts, each with a "blocks" list whose entries
        hold "lines" -> "spans" -> {'text', 'font', 'size'}.
    bannedStrings : collection of strings; a span whose stripped text is in
        it is ignored (used for running headers and footers).

    Returns
    -------
    (org_list, foundation_list) : two lists of dicts, one dict per record.
    """
    org_list = []
    foundation_list = []
    org_id = -1
    foundation_id = -1
    # Defined up front so a foundation marker met before any organisation
    # yields an id of None instead of a NameError.
    org_number = None
    charitable_foundation = False
    start_foundation = False
    # Line index (within its block) of the last foundation name. Starts at -1
    # so bold size-8 text seen before any name counts as address text rather
    # than raising NameError.
    # NOTE(review): the value is not reset between foundations and is
    # block-relative - confirm this matches the PDF layout.
    lineToSkip = -1

    for page in text_dict:
        for block in page["blocks"]:
            # Blocks without a "lines" entry (PyMuPDF image blocks) carry no text.
            for count_line_list, line in enumerate(block.get("lines", [])):
                for span in line["spans"]:
                    text = span['text']

                    # Skip whitespace-only spans and known header/footer text.
                    if text.isspace():
                        continue
                    if text.strip() in bannedStrings:
                        continue

                    font = span['font']
                    size = float(span['size'])
                    # Both roundings are used on purpose: truncation and
                    # round() separate fonts whose sizes differ by a fraction.
                    size_floor = int(size)
                    size_round = round(size)

                    ### Organisations ###
                    if font == 'Helvetica-Bold' and size_floor == 11:
                        # Any organisation header ends the current foundation.
                        charitable_foundation = False
                        try:
                            int(text)
                        except ValueError:
                            # Non-numeric header text is the organisation name.
                            # Ignored when no organisation exists yet.
                            if org_id >= 0:
                                org_list[org_id]['Name'] = text.strip()
                        else:
                            # Numeric header text opens a new organisation.
                            org_number = text.strip()
                            org_list.append({'id' : org_number,
                                             'isFoundation' : 'No'})
                            org_id += 1

                    if not charitable_foundation:
                        if font == 'Helvetica' and size_round == 9:
                            # Address text
                            if org_id < 0:
                                continue
                            org_record = org_list[org_id]
                            if 'Address' not in org_record:
                                org_record['Address'] = text
                            else:
                                # Strip continuation pieces to avoid stray blanks
                                org_record['Address'] += text.strip()

                        elif font == 'ArialNarrow' and size_floor == 8:
                            # Field label
                            if org_id < 0:
                                continue
                            org_record = org_list[org_id]
                            field_name = removePuncandSpace(text)
                            if field_name in org_record:
                                # Repeated label: next free numbered key, so
                                # earlier values are never overwritten.
                                org_record[_next_duplicate_key(org_record, field_name)] = ''
                            elif len(field_name) > 1:
                                # Labels of one character or less are noise.
                                org_record[field_name] = ''

                        elif font == 'Helvetica-Bold' and size_round == 8:
                            # Field value: belongs to the most recently added key
                            if org_id < 0:
                                continue
                            org_record = org_list[org_id]
                            org_record[list(org_record)[-1]] += text

                    ### Foundations ###
                    if (font == 'ArialNarrow' and size_round == 7
                            and text.startswith("L'entreprise possède un")):
                        charitable_foundation = True
                        # Name and address come first, before any field label.
                        start_foundation = True
                        foundation_list.append({'id' : org_number,
                                                'isFoundation' : 'Yes'})
                        foundation_id += 1

                    if charitable_foundation:
                        # The flag is only set next to the append above, so
                        # foundation_id is always a valid index here.
                        foundation_record = foundation_list[foundation_id]

                        if start_foundation and font == 'Helvetica-Bold':
                            if size_round >= 9:
                                # Only the first name span is kept
                                if 'Name' not in foundation_record:
                                    foundation_record['Name'] = text
                                    lineToSkip = count_line_list
                            elif size_round == 8 and count_line_list > lineToSkip:
                                # Bold size-8 text below the name line is address text
                                if 'Address' not in foundation_record:
                                    foundation_record['Address'] = text.strip()
                                else:
                                    foundation_record['Address'] += ' ' + text.strip()

                        if font in ('ArialNarrow', 'Helvetica') and size_floor == 8:
                            # First field label ends the name/address header
                            start_foundation = False
                            field_name = removePuncandSpace(text)
                            if field_name not in foundation_record and len(field_name) > 1:
                                foundation_record[field_name] = ''

                        if not start_foundation:
                            if font in ('ArialNarrow,Bold', 'Helvetica-Bold') and size_round == 8:
                                # Field value: belongs to the most recently added key
                                foundation_record[list(foundation_record)[-1]] += text

    return org_list, foundation_list


In [7]:
# Running header / footer texts of the PDF; spans whose stripped text equals
# one of these are ignored during extraction.
bannedStrings = [
    'Entreprises donatrices et commanditaires du Québec',
    "« DDD » : Date de distribution des dons. « FAF » : date de fin d'année fiscale.  « Langue : B »  dans l’inscription d’une fondation indique que vous pouvez écrire en français ou en anglais.",
    "Tous droits réservés © 2020– Centre québécois de philanthropie",
]

In [8]:
# Run the extraction on the full document; yields (organisations, foundations).
extractedOrgList_full, extractedFoundationList_full = extractFromTextDict(
    text_dict=textDict_full,
    bannedStrings=bannedStrings,
)

In [25]:
# One row per extracted record; keys missing from a record become NaN cells.
dfOrg_full = pd.DataFrame(data=extractedOrgList_full)
dfFoundation_full = pd.DataFrame(data=extractedFoundationList_full)

### Cleaning column names and merging equivalent fields

In [26]:
def concat_cols(df_original, df_cols, df_new):
    """
    Add merged columns to ``df_new`` according to a grouping table.

    Parameters
    ----------
    df_original : DataFrame holding the source columns.
    df_cols : DataFrame with a 'new_col' column (target column name) and a
        'colnames' column (list of source column names for that target).
    df_new : DataFrame that receives the new columns; it is modified in
        place and also returned.

    For a target with several sources, missing values are replaced by '' and
    the row values are joined with '; '. A target with a single source is a
    plain copy of that column.
    """
    for target_name, source_names in zip(df_cols['new_col'], df_cols['colnames']):
        if len(source_names) > 1:
            filled_sources = df_original[source_names].fillna('')
            df_new[target_name] = filled_sources.agg('; '.join, axis=1)
        else:
            only_source = source_names[0]
            df_new[target_name] = df_original[only_source]

    return df_new


def clean_colnames(string):
    """
    Normalise a column label.

    Characters other than spaces, newlines, ASCII letters and digits, the
    accented Latin letters in the ranges À-Ö, Ø-ö, ø-ÿ and '/' are removed.
    The remainder is upper-cased and passed through ``unidecode.unidecode``.
    """
    kept_characters = re.sub(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ/]+', '', string)
    return unidecode.unidecode(kept_characters.upper())

def remove_extra_sep_tokens(df_original, df_cols, pattern = '; '):
    """
    Tidy the separators left in merged columns by empty source values.

    Parameters
    ----------
    df_original : DataFrame containing the merged columns; it is not modified.
    df_cols : grouping table with 'new_col' (column name) and 'colnames'
        (list of source names). Only rows whose list has more than one entry
        are processed - those are the columns built by joining with ``pattern``.
    pattern : separator used when the columns were joined.

    Returns
    -------
    DataFrame
        A copy of ``df_original`` in which, for every merged column, any run
        of two or more consecutive separators is collapsed to one, and the
        characters of ``pattern`` are stripped from both ends of each string.
        Non-string cells are left untouched.
    """
    # Column-wise length test; unlike a row-wise apply it also works on an empty table.
    is_merged = df_cols['colnames'].map(len) > 1
    cols_concatted = df_cols.loc[is_merged, 'new_col'].tolist()

    # The separator is escaped so it always matches literally, whatever
    # characters it contains; {2,} collapses runs of any length in one pass.
    repeated_separator = f"(?:{re.escape(pattern)}){{2,}}"

    df_new = df_original.copy()

    for col in cols_concatted:
        # A callable replacement keeps backslashes in `pattern` from being
        # read as regex escapes.
        collapsed = df_new[col].str.replace(
            repeated_separator, lambda _match: pattern, regex=True
        )
        # str.strip(pattern) removes any leading/trailing characters that
        # occur in `pattern` (for '; ' that is ';' and ' ').
        df_new[col] = collapsed.map(
            lambda x: x.strip(pattern) if isinstance(x, str) else x
        )

    return df_new


In [27]:
# Replace the organisation column labels with their cleaned form.
colnames = dfOrg_full.columns
clean_cols = list(map(clean_colnames, colnames))
dfOrg_full.columns = clean_cols

# One-off export of the cleaned labels, used to label them in Excel:
# pd.DataFrame({'colnames':clean_cols}).to_csv('Colnames.csv', index=None)

In [30]:
# Grouping table: for each target column ('new_col'), the list of cleaned
# source columns ('colnames') that feed it.
df_groupings = (
    pd.read_csv('Colnames_orgs_std.csv')
    .groupby('new_col')['colnames']
    .apply(list)
    .reset_index()
)

# Start from the ID column, add the merged columns, then tidy separators.
df_orgs = remove_extra_sep_tokens(
    concat_cols(dfOrg_full, df_groupings, dfOrg_full[['ID']].copy()),
    df_groupings,
)

In [33]:
# Foundations: replace the column labels with their cleaned form,
# exactly as was done for the organisations.
colnames = dfFoundation_full.columns
clean_cols = list(map(clean_colnames, colnames))
dfFoundation_full.columns = clean_cols

# One-off export of the cleaned foundation labels:
#pd.DataFrame({'colnames':clean_cols}).to_csv('Colnames_foundations.csv', index=None)


In [35]:
# Grouping table for foundations: target column ('new_col') -> list of
# cleaned source columns ('colnames').
df_groupings = (
    pd.read_csv('Colnames_foundations_std.csv')
    .groupby('new_col')['colnames']
    .apply(list)
    .reset_index()
)

# Start from the ID column, add the merged columns, then tidy separators.
df_fonds = remove_extra_sep_tokens(
    concat_cols(dfFoundation_full, df_groupings, dfFoundation_full[['ID']].copy()),
    df_groupings,
)

In [40]:
# Write both tables as Excel workbooks, without the index column.
# The "dataset_merge" folder is expected to exist already.
df_orgs.to_excel(excel_writer="dataset_merge/Organizations.xlsx", index=None)
df_fonds.to_excel(excel_writer="dataset_merge/Foundations.xlsx", index=None)