In [2]:
import fitz
import string
import pandas as pd
from difflib import SequenceMatcher
import re
import unidecode

In [3]:
filepath_full = '../Data/REP-EDC-2020_Fusion_Final.pdf'

In [4]:
def openPDFasTextDict(filepath):
    """
    Opens PDF as XML dict
    """
    text_dict = []
    with fitz.open(filepath) as doc:
        for page in doc:
            text_dict.append(page.get_text("dict", sort=False))
    return text_dict

In [5]:
def removePuncandSpace(text):
    """
    Removes punctuation and spaces from a string
    """
    return text.translate(str.maketrans('', '', string.punctuation)).strip()

In [6]:
def getFontInfo(text_dict):
    company_name_acquired = False
    address_acquired = False
    field_acquired = False
    text_acquired = False
    
    output = {'Company' : {'font': None,
                              'size': None},
                 'Address' : {'font': None,
                             'size': None},
                 'Field' : {'font': None,
                             'size': None},
                 'Text' : {'font': None,
                             'size': None}
                 }
    
    while True:
        if not company_name_acquired:
            sample_company_name = re.sub(r"(^[^\w]+)|([^\w]+$)", "", input("Please copy-paste a company name"))
        if not address_acquired:
            sample_address = re.sub(r"(^[^\w]+)|([^\w]+$)", "",input("Please copy-paste an address"))
        if not field_acquired:
            sample_field = re.sub(r"(^[^\w]+)|([^\w]+$)", "",input("Please copy-paste a field name"))
        if not text_acquired:
            sample_text = re.sub(r"(^[^\w]+)|([^\w]+$)", "",input("Please copy-paste the text after a field name"))

        

        for count_page, page in enumerate(text_dict):
            for count_block_list, block_list in enumerate(page["blocks"]):
                for count_line_list, line_list in enumerate(block_list["lines"]):
                    for count_spans_list, spans_list in enumerate(line_list["spans"]):
                        
                        pdf_text = re.sub(r"(^[^\w]+)|([^\w]+$)", "", spans_list['text'])
                        
                        #Company Name
                        if pdf_text == sample_company_name:
                            output['Company']['font'] = spans_list['font']
                            output['Company']['size'] = spans_list['size']

                        #Company Address
                        if pdf_text == sample_address:
                            output['Address']['font'] = spans_list['font']
                            output['Address']['size'] = spans_list['size']

                        #Field name
                        if pdf_text == sample_field:
                            output['Field']['font'] = spans_list['font']
                            output['Field']['size'] = spans_list['size']

                        #Field text
                        if pdf_text == sample_text:
                            output['Text']['font'] = spans_list['font']
                            output['Text']['size'] = spans_list['size']
                        
        #Check if have all required data, if so break out of while True
        if None not in [value for values in output.values() for value in values.values()]:
            break
            
        #Company name
        if None in [values for values in output['Company'].values()]:
            print("Failed to get data for company name.")
        else:
            company_name_acquired = True
        
        #Address
        if None in [values for values in output['Address'].values()]:
            print("Failed to get data for address.")
        else:
            address_acquired = True
        
        #Field
        if None in [values for values in output['Field'].values()]:
            print("Failed to get data for field.")
        else:
            field_acquired = True
        
        #Text
        if None in [values for values in output['Text'].values()]:
            print("Failed to get data for text.")
        else:
            text_acquired = True
            
        print('\n')   
        print(output)
        
    return output

# Extraction

In [9]:
textDict_full = openPDFasTextDict(filepath = filepath_full)

In [None]:
fontnsize_data = getFontInfo(textDict_full)

In [None]:
fontnsize_data

In [129]:
def extractFromTextDict(text_dict, bannedStrings, fontsize_data):
    ##
    # Get an idea of looping through text
    # For every page
    # Extract additional information about the text as well: font and font size
    # Store in list of dictionaries
    ##

    org_list = []
    foundation_list = []
    org_id = -1
    foundation_id = -1
    charitable_foundation = False

    for count_page, page in enumerate(text_dict):
        for count_block_list, block_list in enumerate(page["blocks"]):
            for count_line_list, line_list in enumerate(block_list["lines"]):
                for count_spans_list, spans_list in enumerate(line_list["spans"]):

                    #Remove empty text
                    if spans_list['text'].isspace():
                        continue
                    #Skip if trash text
                    if spans_list['text'].strip() in bannedStrings:
                        continue
                    
                    ### Organizations ####
                    #Check if font & size are that of org number or new org
                    if (spans_list['font'] == fontsize_data['Company']['font']) & (int(float(spans_list['size'])) == int(fontsize_data['Company']['size'])):
                        charitable_foundation = False
                        #Check if start of new org
                        try :
                            #Throws ValueError if name of org
                            int(spans_list['text'])
                        except ValueError:
                            #Only triggers when name of org
                            org_list[org_id]['Name'] = spans_list['text'].strip()                   
                        else:
                            #If not name of org then org number
                            if (spans_list['font'] == fontsize_data['Company']['font']) & (int(float(spans_list['size'])) == int(fontsize_data['Company']['size'])):
                                org_number = spans_list['text'].strip()
                                print(org_number)
                                org_list.append({'id' : org_number,
                                                 'isFoundation' : 'No'})
                                org_id += 1

                    #Check if not in charitable organisation
                    if not charitable_foundation:

                        #Check if font & size are that of org address
                        #Uses round to filter more text: other text has size that rounds to 8
                        if (spans_list['font'] == fontsize_data['Address']['font']) & (round(float(spans_list['size'])) == int(fontsize_data['Address']['size'])):
                            #Catch if no orgs created
                            if org_id < 0:
                                continue
                            #If key Address doesn't already exist, create it
                            if 'Address' not in org_list[org_id].keys():
                                org_list[org_id]['Address'] = ''
                                org_list[org_id]['Address'] += spans_list['text']
                            else:
                                #Strip here to avoid unnecessary blank space
                                #Maybe handle this later?
                                org_list[org_id]['Address'] += spans_list['text'].strip()

                        #Check if font & size are that of field name
                        if (spans_list['font'] == fontsize_data['Field']['font']) & (int(float(spans_list['size'])) == int(fontsize_data['Field']['size'])):
                            #Catch if no orgs created
                            if org_id < 0:
                                continue
                            #If key field doesn't already exist, create it. Checks if length string > 1 to remove bad text
                            if (removePuncandSpace(spans_list['text']) not in org_list[org_id].keys()) & (len(removePuncandSpace(spans_list['text'])) > 1):
                                org_list[org_id][removePuncandSpace(spans_list['text'])] = ''
                            #If field already exists, create new field with convention i - Name where i is number of fields with the same name +1
                            elif (removePuncandSpace(spans_list['text']) in org_list[org_id].keys()):
                                num_instances = list(org_list[org_id].keys()).count(removePuncandSpace(spans_list['text']))
                                org_list[org_id][f"{num_instances + 1} - {removePuncandSpace(spans_list['text'])}"] = ''

                        #Check if font & size are that of field text
                        if (spans_list['font'] == fontsize_data['Text']['font']) & (round(float(spans_list['size'])) == int(fontsize_data['Text']['size'])):
                            #Catch if no orgs created
                            if org_id < 0:
                                continue

                            #Place in last dict key: will always be something there, non-generalizable method
                            org_list[org_id][list( org_list[org_id])[-1]] += spans_list['text']
                            
                    ### Foundations ####
                    #Check if text indicates charitable foundation
                    if (spans_list['font'] == 'ArialNarrow') & (round(float(spans_list['size'])) == 7) & (spans_list['text'][:23] == "L'entreprise possède un"): 

                        charitable_foundation = True
                        
                        #Foundations always start with lines of Helvetica Bold.  
                        #Use that as a trigger with the boolean var start_foundation
                        start_foundation = True
                        foundation_list.append({'id' : org_number,
                                         'isFoundation' : 'Yes'})
                        foundation_id +=1

                    #Check if are in charitable foundation
                    if charitable_foundation:
                        #Trigger for name and address to differentiate from other text
                        if start_foundation:
                            #Check if font & size are foundation name
                            if (spans_list['font'] == fontsize_data['Company']['font']) & (round(float(spans_list['size'])) >= fontsize_data['Company']['size']):
                                 #If key Name doesn't already exist, create it
                                if 'Name' not in foundation_list[foundation_id].keys():
                                    foundation_list[foundation_id]['Name'] = ''
                                    foundation_list[foundation_id]['Name'] += spans_list['text']
                                    
                                    lineToSkip = count_line_list
                                #else:
                                    #Strip here to avoid unnecessary blank space
                                    #foundation_list[foundation_id]['Name'] += spans_list['text'].strip()

                            #Check if font & size are address
                            if (spans_list['font'] == fontsize_data['Address']['font']) & ((round(float(spans_list['size'])) >= fontsize_data['Address']['size'])):
                                #Check if are on different line than Name, meaning are on Address line
                                if count_line_list > lineToSkip:
                                     #If key Address doesn't already exist, create it
                                    if 'Address' not in foundation_list[foundation_id].keys():
                                        foundation_list[foundation_id]['Address'] = ''
                                        foundation_list[foundation_id]['Address'] += spans_list['text'].strip()
                                    else:
                                        foundation_list[foundation_id]['Address'] += ' ' +spans_list['text'].strip()



                        #Check if font & size are that of field name
                        #Outside of if start_foundation
                        if ((spans_list['font'] == fontsize_data['Field']['font']) & (int(float(spans_list['size'])) == fontsize_data['Field']['size'])):
                            #Trigger on first catch of non-address text
                            start_foundation = False
                            
                            #Catch if no orgs created
                            if foundation_id < 0:
                                continue
                            #If key field doesn't already exist, create it. Checks if length string > 1 to remove bad text
                            if (removePuncandSpace(spans_list['text']) not in foundation_list[foundation_id].keys()) & (len(removePuncandSpace(spans_list['text'])) > 1):
                                foundation_list[foundation_id][removePuncandSpace(spans_list['text'])] = ''
                        
                        #Check if outside of adress
                        if not start_foundation:
                            #Check if font & size are that of field text
                            if ((spans_list['font'] == fontsize_data['Text']['font']) & (round(float(spans_list['size'])) == fontsize_data['Text']['size'])):
                                #Catch if no orgs created
                                if foundation_id < 0:
                                    continue

                                #Place in last dict key: will always be something there, non-generalizable method
                                foundation_list[foundation_id][list( foundation_list[foundation_id])[-1]] += spans_list['text']
                
    return org_list, foundation_list


In [130]:
bannedStrings = ['Entreprises donatrices et commanditaires du Québec',
                "« DDD » : Date de distribution des dons. « FAF » : date de fin d'année fiscale.  « Langue : B »  dans l’inscription d’une fondation indique que vous pouvez écrire en français ou en anglais.",
                "Tous droits réservés © 2020– Centre québécois de philanthropie"]

In [131]:
extractedOrgList_full, extractedFoundationList_full = extractFromTextDict(textDict_full, bannedStrings, fontnsize_data)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


UnboundLocalError: local variable 'lineToSkip' referenced before assignment

In [132]:
extractedOrgList_full

[]

In [112]:
dfOrg_full = pd.DataFrame(extractedOrgList_full)
dfFoundation_full = pd.DataFrame(extractedFoundationList_full)

In [113]:
dfOrg_full.head()

In [96]:
dfFoundation_full.head(10)

Unnamed: 0,id,isFoundation,Name,Address,Langue,Catégorie,Contact,Tél,Échelledons,Total annuel,...,Projets privilégiés,Avis,Courriel,Web,ou,Limites géographiques,Téléc,Poste,Total actif,Limites géog
0,6,Yes,"AbbVie,","8401, rte Trans-Canadienne Saint-Laurent QC H4...",F,Fondation corporaative,"Direction, appui à la communauté",514-906-9700,25 000 à 250 000 $,5 050 000 $,...,Bourses aux particuliers (bourses d’études MII...,Communiquer par : courrier. Limites géographiq...,,,,,,,,
1,32,Yes,"Air-Canada,","7373, boul. de la Côte-Vertu Saint-Laurent QC ...",F,Fondation corporative,"Mme Micheline Villeneuve, Gérante de la Fondat...",514-422-5973,,,...,Différents programmes servent à différentes ca...,Les demandes de soutien doivent être faites en...,foundation-fondation@aircanada.ca,https://www.aircanada.com/fondation,https://www.aircanada.com/fr/about/community/f...,le Canada.,,,,
2,42,Yes,Alcoa Foundation,"390 Park Ave, 9th Floor New York NY 10022 USA",A,Fondation corporative,"Ms Esra Ozer, President",412-553-4545,,"17,800,000 $",...,Amélioration de la qualité de vie. Éducation. ...,Communiquer par : courrier ou fax. Décisions r...,,www.alcoafoundation.com,http://www.alcoa.com/global/en/community/found...,,412-553-4498,,,
3,48,Yes,Fondation Memoria,"1115, rue Laurier O. Outremont QC H2V2L3",B,Fondation corporative,"Madame Jeannette Rioux, Secrétaire",514-277-7778,1 000 $,1 000 $,...,Bourses aux particuliers (d'études ou perfecti...,Communiquer par : courrier ou fax. Limites géo...,,,,,514-908-1354,,,
4,65,Yes,Allstate Foundation of Canada,"27 Allstate Parkway Ave., Suite 100 Markham ON...",B,Fondation corporative,"Mr Jeff Wickware, Executive Secretary & Treas...",905-475-4413,100 à 117 860 $,264 350 $,...,Déficit budgétaire. Dons jumelés. Fonds de con...,Communiquer par : courriel. Décisions prises a...,foundation@allstate.ca,www.allstate.ca,,,905-415-4899,,,
5,76,Yes,The American Express Foundation in Canada,C/O Amex Bank of Canada P.O. Box 3204 Stn F...,A,Fondation corporative,Vice-pésident et Secrétaire,800-866-2639,,590 000 $,...,Campagne annuelle. Dons jumelés (avec les empl...,Communiquer par : courrier ou courriel. Décisi...,amexcanadafoundation@aexp.com,https://www.americanexpress.com/ca/en/ content...,,,,,,
6,117,Yes,"Avon pour les femmes du Canada,","5500, rte Transcanadienne Pointe-Claire QC H9R1B6",B,Fondation corporative,"Madame Leslie Cox, Administratrice",514-630-8312,6 300 à 250 000 $,1 701 335 $,...,Bourses d'études (écoles académiques; bourse d...,Communiquer par : courrier ou fax. Joindre la ...,,www.avon.ca,www.ca.avon.com,,514-630-5439,,,
7,121,Yes,"CIBC Children's Foundation,","25 King St. W., 30 th Floor Commerce Court Nor...",A,Fondation corporative,"Mr Richard Nesbitt, Director",416-861-8023,350 à 200 000 $,3 914 950 $,...,Éducation. Fondations hospitalières. Fonds d'é...,Communiquer par : courrier seulement. Formulai...,mailbox.miracleday@cibc.com,www.cibc.com/miracleday,,,416-861-3757,,,
8,127,Yes,TD Friends of the Environment Foundation,"77 King Street West, 10th Floor TD North Tower...",B,Fondation corporative,"Mrs Mary Desjardins, Executive Director",416-308-5372,2500 à 437 140 $,4 297 600 $,...,Activité bénéfice annuelle (Boîtes de collecte...,"Communiquer par : courrier, courriel ou fax. F...",tdfef@td.com,www.fef.td.com,,,416-308-6426,,,
9,176,Yes,Fondation de bienfaisance des employés de BMO,"119, rue St-Jacques Montréal QC H2Y1L6",,,Coordonnateur des dons,514-877-7373,,,...,,Au Québec le fonds opère sous le nom de Fontai...,,,,le Canada.,,,,


In [97]:
dfOrg_full.columns

Index(['id', 'isFoundation', 'Name', 'Address', 'Secteur industriel',
       'Langue de comm', 'DDD', 'FAF', 'N° de télCie', 'n° de tél', 'Site Web',
       'Domaine dintérêt', 'Limites géog', 'Note', 'Nombre demployés',
       'Contribution', 'Contact', 'Tél', 'Poste', 'Courriel', 'Avis',
       'N° de faxCie', '2e n° de tél', 'Courriel  Cie', 'Fax',
       '2e contact pour', '2 - Tél', '2 - Fax', 'Filiale de', '2 - Courriel',
       'Nbre de succ', 'Princip filiales', '2 - Note', 'Princip Filiales',
       'Principale filiale', 'Filiales princip', 'Principales filiales',
       'Principfiliales', '2 - Contribution', '2 - Contact', '2 - Poste'],
      dtype='object')

In [98]:
dfFoundation_full.columns

Index(['id', 'isFoundation', 'Name', 'Address', 'Langue', 'Catégorie',
       'Contact', 'Tél', 'Échelledons', 'Total annuel', 'Actif', 'Date approb',
       'Date fin d’ann', 'Domaines dintérêt', 'Projets privilégiés', 'Avis',
       'Courriel', 'Web', 'ou', 'Limites géographiques', 'Téléc', 'Poste',
       'Total actif', 'Limites géog'],
      dtype='object')

In [99]:
#Checking null values
dfOrg_full.isna().sum()

id                         0
isFoundation               0
Name                       0
Address                    0
Secteur industriel         0
Langue de comm             3
DDD                       19
FAF                      519
N° de télCie               6
n° de tél               1223
Site Web                  10
Domaine dintérêt           6
Limites géog              12
Note                     597
Nombre demployés         339
Contribution               0
Contact                    6
Tél                       38
Poste                   1158
Courriel                 318
Avis                      34
N° de faxCie             307
2e n° de tél             669
Courriel  Cie            547
Fax                      416
2e contact pour         1315
2 - Tél                 1333
2 - Fax                 1338
Filiale de              1009
2 - Courriel            1328
Nbre de succ             835
Princip filiales        1138
2 - Note                1212
Princip Filiales        1341
Principale fil

In [100]:
dfFoundation_full.isna().sum()

id                        0
isFoundation              0
Name                      3
Address                   1
Langue                    4
Catégorie                10
Contact                  10
Tél                      11
Échelledons              29
Total annuel             17
Actif                    24
Date approb              16
Date fin d’ann           16
Domaines dintérêt         4
Projets privilégiés       8
Avis                     18
Courriel                 33
Web                      24
ou                       73
Limites géographiques    72
Téléc                    24
Poste                    78
Total actif              80
Limites géog             80
dtype: int64

In [101]:
dfFoundation_full[dfFoundation_full.Name.isna() == True]

Unnamed: 0,id,isFoundation,Name,Address,Langue,Catégorie,Contact,Tél,Échelledons,Total annuel,...,Projets privilégiés,Avis,Courriel,Web,ou,Limites géographiques,Téléc,Poste,Total actif,Limites géog
19,281,Yes,,,,,,,,,...,,,,,,,,,,
72,1186,Yes,,"Voir la Fondation St-Hubert, plus haut.",,,,,,,...,,,,,,,,,,
76,1282,Yes,,La Fondation Unilever a été révoquée volontair...,,,,,,,...,,,,,,,,,,


In [102]:
#Drop the foundations where name is null
dfFoundation_full = dfFoundation_full[dfFoundation_full.Name.isna() == False]

In [103]:
dfFoundation_full[dfFoundation_full.Address.isna() == True]

Unnamed: 0,id,isFoundation,Name,Address,Langue,Catégorie,Contact,Tél,Échelledons,Total annuel,...,Projets privilégiés,Avis,Courriel,Web,ou,Limites géographiques,Téléc,Poste,Total actif,Limites géog


### Cleaning column names and merging equivalent fields

In [104]:
def concat_cols(df_original, df_cols, df_new):

    for _, x in df_cols.iterrows():
        new_col_name = x['new_col']
        cols_to_concat = x['colnames']

        if len(cols_to_concat) > 1:
            df_new[new_col_name] =  df_original[cols_to_concat].fillna('').agg('; '.join, axis=1)
        else:
            df_new[new_col_name] = df_original[cols_to_concat[0]]
    
    return df_new


def clean_colnames(string):
    string_c = re.sub(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ/]+', '', string).upper()
    string_c = unidecode.unidecode(string_c)
    return string_c

def remove_extra_sep_tokens(df_original, df_cols, pattern = '; '):

    cols_concatted = df_cols[ df_cols.apply(lambda x: len(x['colnames']),axis=1) >1].new_col.tolist()
    sep_list = [pattern*i for i in range(2,10)]
    df_new = df_original.copy()
    
    for col in cols_concatted:
        for str1 in sep_list[::-1]:
            df_new.loc[:,col] = df_new.loc[:,col].str.replace(str1, pattern)

    df_new[cols_concatted] = df_new[cols_concatted].applymap(lambda x: x.strip(pattern) if isinstance(x, str) else x)

    return df_new


In [109]:
colnames = dfOrg_full.columns

clean_cols = [clean_colnames(i) for i in colnames]

dfOrg_full.columns = clean_cols

# pd.DataFrame({'colnames':clean_cols}).to_csv('Colnames.csv', index=None) 
# to label them in excel

In [111]:
df_groupings = pd.read_csv('Colnames_orgs_std.csv')

df_groupings = df_groupings.groupby('new_col')['colnames'].apply(list).reset_index()

df_orgs = dfOrg_full[['ID']].copy()

df_orgs = concat_cols(dfOrg_full, df_groupings, df_orgs)

df_orgs = remove_extra_sep_tokens(df_orgs, df_groupings)

In [112]:
# same process but for foundations
colnames = dfFoundation_full.columns

clean_cols = [clean_colnames(i) for i in colnames]

dfFoundation_full.columns = clean_cols

#pd.DataFrame({'colnames':clean_cols}).to_csv('Colnames_foundations.csv', index=None)


In [114]:
df_groupings = pd.read_csv('Colnames_foundations_std.csv')

df_groupings = df_groupings.groupby('new_col')['colnames'].apply(list).reset_index()

df_fonds = dfFoundation_full[['ID']].copy()

df_fonds = concat_cols(dfFoundation_full, df_groupings, df_fonds)

df_fonds = remove_extra_sep_tokens(df_fonds, df_groupings)

### Save

In [115]:
####
# Save to csv
###
dfOrg_full.to_excel("../Data/Organizations.xlsx")
dfFoundation_full.to_excel("../Data/Foundations.xlsx")