In [1]:
import fitz
import string
import pandas as pd
from difflib import SequenceMatcher
import re
import unidecode

In [2]:
# Relative path of the PDF that the extraction cells below open.
filepath_full = '../Data/REP-EDC-2020_Fusion_Final.pdf'

In [3]:
def openPDFasTextDict(filepath):
    """
    Read every page of a PDF into PyMuPDF's "dict" text representation.

    Parameters
    ----------
    filepath : path of the PDF, handed to ``fitz.open``.

    Returns
    -------
    list
        One entry per page, in document order; each entry is whatever
        ``page.get_text("dict", sort=False)`` returns for that page.
    """
    with fitz.open(filepath) as pdf_document:
        # Build the whole list before the document is closed by the context manager.
        return [pdf_page.get_text("dict", sort=False) for pdf_page in pdf_document]

In [4]:
def removePuncandSpace(text):
    """
    Delete ASCII punctuation from a string and trim its surrounding whitespace.

    Every character of ``string.punctuation`` is removed wherever it occurs.
    Whitespace is only stripped from the two ends, so spaces between words
    are kept.
    """
    # A translation table mapping a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    without_punctuation = text.translate(deletion_table)
    return without_punctuation.strip()

In [5]:
def isListEmpty(inList):
    """
    Tell whether a (possibly nested) list holds no non-list element at all.

    Returns True for ``[]`` and for lists made only of empty lists at any
    depth (``[[], [[]]]``). Returns False for anything that is not a list
    and for any list containing at least one non-list item.
    """
    if not isinstance(inList, list):
        return False
    for item in inList:
        # One non-empty member is enough to make the whole structure non-empty.
        if not isListEmpty(item):
            return False
    return True

In [6]:
# Each category is described by:
# (output key, first prompt, follow-up prompt, message printed when nothing matched)
_COMPANY_FONT_CATEGORIES = [
    ('CompanyName',
     "Please copy-paste a line of company name",
     "Please copy-paste another line of company name",
     "Failed to get data for company name."),
    ('CompanyAddress',
     "Please copy-paste a line of company address",
     "Please copy-paste another line of company address",
     "Failed to get data for company address."),
    ('CompanyField',
     "Please copy-paste a company field name",
     "Please copy-paste another company field name",
     "Failed to get data for company field name."),
    ('CompanyText',
     "Please copy-paste the text after the company field name",
     "Please copy-paste another line of text after the company field name",
     "Failed to get data for company text."),
]

_FOUNDATION_FONT_CATEGORIES = [
    ('FoundationSeparator',
     "Please copy-paste the line that indicates the start of a foundation",
     "Please copy-paste another foundation separator",
     "Failed to get data for line that indicates the start of a foundation."),
    ('FoundationName',
     "Please copy-paste a foundation name",
     "Please copy-paste another foundation name",
     "Failed to get data for foundation name."),
    ('FoundationAddress',
     "Please copy-paste a foundation address",
     "Please copy-paste another foundation address",
     "Failed to get data for foundation address."),
    ('FoundationField',
     "Please copy-paste a field name unique to foundations",
     "Please copy-paste another foundation field name",
     "Failed to get data for foundation field."),
    ('FoundationText',
     "Please copy-paste the text after the foundation field name",
     "Please copy-paste more text following a foundation field name",
     "Failed to get data for foundation text."),
]


def _stripNonWordEdges(text):
    """Remove leading and trailing non-word characters (spaces, punctuation) from ``text``."""
    return re.sub(r"(^[^\w]+)|([^\w]+$)", "", text)


def _collectSamples(first_prompt, more_prompt):
    """Ask for one sample line, then for optional extra ones; return them all edge-stripped."""
    samples = [_stripNonWordEdges(input(first_prompt))]
    # Anything other than N (case-insensitive) means "yes, one more sample".
    while input("Do you want to copy-paste more in order to ensure more data is captured? Y/N").strip().upper() != 'N':
        # Extra samples get the same normalisation as the first one and as the
        # PDF text, otherwise a stray space or colon would prevent any match.
        samples.append(_stripNonWordEdges(input(more_prompt)))
    return samples


def _recordSpanStyle(entry, span):
    """Append the span's attributes to ``entry`` unless each one is already listed."""
    # Same rule as before: every attribute is looked up in its own list
    # (font in fonts, size in sizes, ...), not as a (font, size) pair.
    if all(span[attribute] in seen for attribute, seen in entry.items()):
        return
    for attribute, seen in entry.items():
        seen.append(span[attribute])


def getFontInfo(text_dict):
    """
    Interactively learn which fonts and sizes mark each kind of text in the PDF.

    The user pastes sample lines for every category (company name, address,
    field name, field text and, optionally, the matching foundation
    categories). Every span of ``text_dict`` whose edge-stripped text equals
    one of the samples contributes its font and size to that category. The
    function keeps re-asking for the categories that produced no match until
    every category has at least one font and size.

    Parameters
    ----------
    text_dict : list of per-page dicts with a ``"blocks"`` list, as returned
        by ``openPDFasTextDict``.

    Returns
    -------
    dict
        ``{category: {'font': [...], 'size': [...]}}``. Fonts and sizes are
        appended together, so the lists are index-aligned. The
        ``'FoundationSeparator'`` entry also carries a ``'text'`` list with
        the raw matching span text. Foundation categories are only present
        when the user answered Y to the first question.
    """
    # Keep asking until the answer is usable; the original left the flag
    # undefined (UnboundLocalError) for anything but an exact 'Y' or 'N'.
    while True:
        foundation_present = input("Are there any charitable foundations present in your document? Y/N").strip().upper()
        if foundation_present in ('Y', 'N'):
            break
    foundation_check = foundation_present == 'Y'

    categories = list(_COMPANY_FONT_CATEGORIES)
    if foundation_check:
        categories += _FOUNDATION_FONT_CATEGORIES

    output = {}
    for key, _, _, _ in categories:
        output[key] = {'font': [], 'size': []}
        if key == 'FoundationSeparator':
            output[key]['text'] = []

    samples = {}
    acquired = set()

    while True:
        # Only (re-)ask for the categories that have not matched anything yet.
        for key, first_prompt, more_prompt, _ in categories:
            if key not in acquired:
                samples[key] = _collectSamples(first_prompt, more_prompt)

        for page in text_dict:
            for block in page["blocks"]:
                # Image blocks carry no "lines" entry; there is nothing to read in them.
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        pdf_text = _stripNonWordEdges(span['text'])
                        # Every category is checked independently, so an
                        # already-known style in one category can no longer
                        # hide the span from the categories that follow.
                        for key, _, _, _ in categories:
                            if pdf_text in samples[key]:
                                _recordSpanStyle(output[key], span)

        # The per-category lists are flat, so plain emptiness is the right test.
        missing = [
            (key, failure_message)
            for key, _, _, failure_message in categories
            if any(len(values) == 0 for values in output[key].values())
        ]

        # Check if have all required data, if so break out of while True
        if not missing:
            break

        missing_keys = {key for key, _ in missing}
        for key, _, _, failure_message in categories:
            if key in missing_keys:
                print(failure_message)
            else:
                acquired.add(key)

        print('\n')
        print(output)

    return output

In [7]:
def extractTextFromTextDict(text_dict, bannedStrings):
    """
    Flatten the per-page text dicts into one list of styled text spans.

    Parameters
    ----------
    text_dict : list of per-page dicts with a ``"blocks"`` list, as returned
        by ``openPDFasTextDict``.
    bannedStrings : container of strings; a span is dropped when its text,
        stripped of surrounding whitespace, is in this container.

    Returns
    -------
    list of dict
        One dict per kept span, in reading order, with the keys ``'text'``
        (raw span text), ``'size'`` (float), ``'font'`` and ``'line_number'``
        (index of the span's line inside its block, restarting at 0 for
        every block).
    """
    output_list = []
    for page in text_dict:
        for block in page["blocks"]:
            # Image blocks have no "lines" entry; skip them instead of raising KeyError.
            for line_number, line in enumerate(block.get("lines", [])):
                for span in line["spans"]:
                    stripped_text = span['text'].strip()

                    # Remove empty text. ''.isspace() is False, so test the
                    # stripped text to catch both "" and whitespace-only spans.
                    if not stripped_text:
                        continue
                    # Skip if trash text
                    if stripped_text in bannedStrings:
                        continue

                    output_list.append({'text': span['text'],
                                        'size': float(span['size']),
                                        'font': span['font'],
                                        'line_number': line_number})
    return output_list


# Extraction

In [8]:
# Parse the PDF at filepath_full into a list of per-page text dicts.
textDict_full = openPDFasTextDict(filepath = filepath_full)

In [9]:
# Interactive step: prompts for sample lines and returns the fonts/sizes they are printed in.
fontnsize_data = getFontInfo(textDict_full)

Are there any charitable foundations present in your document? Y/NY
Please copy-paste a line of company name3M Canada inc
Do you want to copy-paste more in order to ensure more data is captured? Y/NN
Please copy-paste a line of company address7290, rue Frederick Banting
Do you want to copy-paste more in order to ensure more data is captured? Y/NN
Please copy-paste a company field nameSecteur industriel
Do you want to copy-paste more in order to ensure more data is captured? Y/NN
Please copy-paste the text after the company field name 31 décembre
Do you want to copy-paste more in order to ensure more data is captured? Y/NN
Please copy-paste the line that indicates the start of a foundationL'entreprise possède une fondation corporative : 
Do you want to copy-paste more in order to ensure more data is captured? Y/NN
Please copy-paste a foundation nameAbbVie
Do you want to copy-paste more in order to ensure more data is captured? Y/NN
Please copy-paste a foundation address8401, rte Trans-C

In [10]:
# Display the fonts and sizes captured for each category.
fontnsize_data

{'CompanyName': {'font': ['Helvetica-Bold'], 'size': [11.029430389404297]},
 'CompanyAddress': {'font': ['Helvetica'], 'size': [8.991293907165527]},
 'CompanyField': {'font': ['ArialNarrow'], 'size': [8.032254219055176]},
 'CompanyText': {'font': ['Helvetica-Bold', 'ArialNarrow,Bold'],
  'size': [8.032254219055176, 8.032254219055176]},
 'FoundationSeparator': {'font': ['ArialNarrow'],
  'size': [6.9532151222229],
  'text': ["L'entreprise possède une fondation corporative :"]},
 'FoundationName': {'font': ['Helvetica-Bold', 'Helvetica-Bold'],
  'size': [11.029430389404297, 8.991293907165527]},
 'FoundationAddress': {'font': ['Helvetica-Bold'],
  'size': [8.032254219055176]},
 'FoundationField': {'font': ['Helvetica', 'ArialNarrow'],
  'size': [8.032254219055176, 8.032254219055176]},
 'FoundationText': {'font': ['Helvetica-Bold'], 'size': [8.032254219055176]}}

In [11]:
# Span texts that extractTextFromTextDict drops (compared after stripping
# surrounding whitespace). Presumably recurring page header/footer lines - TODO confirm.
bannedStrings = ['Entreprises donatrices et commanditaires du Québec',
                "« DDD » : Date de distribution des dons. « FAF » : date de fin d'année fiscale.  « Langue : B »  dans l’inscription d’une fondation indique que vous pouvez écrire en français ou en anglais.",
                "Tous droits réservés © 2020– Centre québécois de philanthropie"]

In [12]:
# Flatten the page dicts into a list of styled spans, minus the banned strings.
extracted_text = extractTextFromTextDict(textDict_full, bannedStrings)

In [13]:
def _countFieldInstances(record, field_name):
    """Count the keys of ``record`` that are ``field_name`` or a numbered copy ("2 - name", ...)."""
    suffix = f" - {field_name}"
    return sum(
        1 for key in record
        if key == field_name
        or (key.endswith(suffix) and key[:-len(suffix)].isdigit())
    )


def convertExtracttoTable(extracted_text, fontsize_data):
    """
    Turn the flat list of styled spans into one record per organisation and
    one record per charitable foundation.

    Each span is classified by comparing its font and size with the
    categories in ``fontsize_data``. The statement order matters: a span
    may match several categories and is processed by each of them in turn.

    Parameters
    ----------
    extracted_text : list of dicts with the keys ``'text'``, ``'size'``,
        ``'font'`` and ``'line_number'``, as built by ``extractTextFromTextDict``.
    fontsize_data : dict as returned by ``getFontInfo``. The four
        ``Company*`` categories are required. When ``'FoundationSeparator'``
        is absent, foundation parsing is skipped and the second returned
        list is empty.

    Returns
    -------
    (org_list, foundation_list) : two lists of dicts. Every record starts
        with ``'id'`` (the organisation number text) and ``'isFoundation'``
        (``'No'`` / ``'Yes'``). Further keys are added in reading order:
        ``'Name'``, ``'Address'`` and one key per field name. A field name
        that repeats inside an organisation is stored as ``"2 - name"``,
        ``"3 - name"`` and so on.
    """
    # Hoist the per-category lookups: fontsize_data does not change in the loop.
    def _fonts(category):
        return fontsize_data[category]['font']

    def _truncated_sizes(category):
        return [int(elem) for elem in fontsize_data[category]['size']]

    def _rounded_sizes(category):
        return [round(elem) for elem in fontsize_data[category]['size']]

    name_fonts, name_sizes = _fonts('CompanyName'), _truncated_sizes('CompanyName')
    # Uses round to filter more text: other text has size that rounds to 8
    address_fonts, address_sizes = _fonts('CompanyAddress'), _rounded_sizes('CompanyAddress')
    field_fonts, field_sizes = _fonts('CompanyField'), _truncated_sizes('CompanyField')
    text_fonts, text_sizes = _fonts('CompanyText'), _rounded_sizes('CompanyText')

    # getFontInfo omits every Foundation* key when the document has no
    # foundations; the original raised KeyError in that case.
    has_foundations = 'FoundationSeparator' in fontsize_data
    if has_foundations:
        separator_fonts = _fonts('FoundationSeparator')
        separator_sizes = _rounded_sizes('FoundationSeparator')
        separator_texts = [removePuncandSpace(elem)
                           for elem in fontsize_data['FoundationSeparator']['text']]
        fname_fonts, fname_sizes = _fonts('FoundationName'), _rounded_sizes('FoundationName')
        faddress_fonts, faddress_sizes = _fonts('FoundationAddress'), _rounded_sizes('FoundationAddress')
        ffield_fonts, ffield_sizes = _fonts('FoundationField'), _truncated_sizes('FoundationField')
        ftext_fonts, ftext_sizes = _fonts('FoundationText'), _rounded_sizes('FoundationText')

    # Initialize needed vars
    org_list = []
    foundation_list = []
    org_number = 0
    org_id = -1
    foundation_id = -1
    charitable_foundation = False
    start_foundation = False
    lineToSkip = -1

    # Loop through the extracted list-dict structure
    for line in extracted_text:
        font = line['font']
        size = line['size']
        text = line['text']

        #### Organizations #####
        # Org number and org name share the same font & size.
        if font in name_fonts and int(size) in name_sizes:
            charitable_foundation = False
            try:
                # Throws ValueError if name of org
                int(text)
            except ValueError:
                # Catch if no orgs created
                if org_id < 0:
                    continue
                org_list[org_id]['Name'] = text.strip()
            else:
                # A purely numeric line starts a new organisation.
                org_number = text.strip()
                org_list.append({'id': org_number,
                                 'isFoundation': 'No'})
                org_id += 1

        # Org address lines are only read outside of a foundation section.
        if not charitable_foundation:
            if font in address_fonts and round(size) in address_sizes:
                if org_id < 0:
                    continue
                org = org_list[org_id]
                if 'Address' not in org:
                    org['Address'] = text
                else:
                    # Strip here to avoid unnecessary blank space
                    org['Address'] += text.strip()

        # Field name
        if font in field_fonts and int(size) in field_sizes:
            if org_id < 0:
                continue
            org = org_list[org_id]
            field_name = removePuncandSpace(text)
            # Length > 1 filters out stray one-character spans.
            if field_name not in org and len(field_name) > 1:
                org[field_name] = ''
            elif field_name in org:
                # Repeated field: store it as "<i> - name" where i counts the
                # copies already present, so a third occurrence becomes
                # "3 - name" instead of clearing "2 - name".
                org[f"{_countFieldInstances(org, field_name) + 1} - {field_name}"] = ''

        # Field text
        if font in text_fonts and round(size) in text_sizes:
            if org_id < 0:
                continue
            org = org_list[org_id]
            # Text belongs to the most recently created key of the record.
            org[list(org)[-1]] += text

        ### Foundations ####
        if not has_foundations:
            continue

        clean_text = removePuncandSpace(text)
        styled_like_separator = font in separator_fonts and round(size) in separator_sizes
        # Empty strings are substrings of everything, so they are excluded:
        # a punctuation-only span must not open a foundation.
        line_within_separator = bool(clean_text) and any(
            clean_text in separator for separator in separator_texts)
        separator_within_line = any(
            separator and separator in clean_text for separator in separator_texts)

        # NOTE(review): the original mixed `&` and `or`, which Python groups as
        # (font & size & line-in-separator) or separator-in-line. That grouping
        # is kept here; confirm whether the font/size test was meant to guard
        # both alternatives.
        if (styled_like_separator and line_within_separator) or separator_within_line:
            charitable_foundation = True

            # Name and address come first in a foundation; start_foundation
            # stays True until the first foundation field name is met.
            start_foundation = True
            # Forget the name line of the previous foundation.
            lineToSkip = -1
            foundation_list.append({'id': org_number,
                                    'isFoundation': 'Yes'})
            foundation_id += 1

        # charitable_foundation is only ever set together with an append, so
        # foundation_list[foundation_id] exists whenever it is True.
        if charitable_foundation:
            foundation = foundation_list[foundation_id]

            if start_foundation:
                # Foundation name: only the first matching span is kept.
                if font in fname_fonts and any(round(size) >= elem for elem in fname_sizes):
                    if 'Name' not in foundation:
                        foundation['Name'] = text
                        lineToSkip = line['line_number']

                # Foundation address: must sit on a later line than the name.
                if font in faddress_fonts and any(round(size) >= elem for elem in faddress_sizes):
                    if line['line_number'] > lineToSkip:
                        if 'Address' not in foundation:
                            foundation['Address'] = text.strip()
                        else:
                            foundation['Address'] += ' ' + text.strip()

            # Field name
            if font in ffield_fonts and int(size) in ffield_sizes:
                # Trigger on first catch of non-address text
                start_foundation = False
                field_name = removePuncandSpace(text)
                if field_name not in foundation and len(field_name) > 1:
                    foundation[field_name] = ''

            # Field text is only read once the name/address part is over.
            if not start_foundation:
                if font in ftext_fonts and round(size) in ftext_sizes:
                    # Text belongs to the most recently created key of the record.
                    foundation[list(foundation)[-1]] += text

    return org_list, foundation_list
        


In [14]:
# Build the organisation records and the foundation records from the extracted spans.
testingorg, testingfoundation = convertExtracttoTable(extracted_text, fontnsize_data)

In [15]:
# Tabulate the organisation records (one row per record) and display the frame.
dforg_test = pd.DataFrame(testingorg)
dforg_test

Unnamed: 0,id,isFoundation,Name,Address,Secteur industriel,Langue de comm,DDD,FAF,N° de télCie,n° de tél,...,2 - Poste,Principfiliales,ou,2 - Contribution,2 - Langue,2 - Web,2 - Date approb,2 - Domaines dintérêt,2 - Projets privilégiés,2 - Limites géog
0,1,No,3M Canada inc.,"7290, rue Frederick Banting Saint-Laurent QC ...",Équipement et services industriels; Fabricatio...,Français,En tout temps,31 décembre,514-336-5252,800-265-1840,...,,,,,,,,,,
1,2,No,A & D Prévost,"305, 12e Avenue Richelieu QC J3L3T2","Matériaux de construction, manufacturiers; Fab...",Français,En tout temps,31 décembre,450-658-8771,,...,,,,,,,,,,
2,3,No,AAR Aicraft - Services Trois-Rivières,"3750, chemin de l'aéroport Trois-Rivières QC...","Offre des services d'entretien, de réparation ...",Français,En tout temps,31 décembre,819-377-4500,,...,,,,,,,,,,
3,4,No,ABB (Albert Bob Bob),800 boul. Hymes Saint-Laurent QC H4S0B5,Équipement et services industriels; Fabriquer ...,Français,En tout temps,,514-856-6222,,...,,,,,,,,,,
4,5,No,ABB Canada,800 boul. Hymes Saint-Laurent QC H4S0B5,Commerce de gros et détail; Grossistes-distrib...,Français,En tout temps,31 décembre,438-843-6000,888-856-6266,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1337,1338,No,Xérox Canada,"3400, boul. de Maisonneuve O., bur. 900 Montr...","Commerce de gros et détail; Équipement, systèm...",Français,En tout temps,31 décembre,514-939-3769,,...,,,,,,,,,,
1338,1339,No,Yamaha Moteur du Canada Ltée,"1301, rue Ampère Boucherville QC J4B5Z5",Produits récréatifs; Fabricant d'une gamme de ...,Français,Avril à juin,31 mars,450-641-2602,,...,,,,,,,,,,
1339,1340,No,"Yum Yum Enr., Les croustilles","40, rue du Moulin Warwick QC J0A1M0",Alimentation; Fabricant de croustilles et grig...,Français,En tout temps,,819-358-3600,,...,,,,,,,,,,
1340,1341,No,(CEZinc),"860, boul. Gérard-Cadieux Salaberry-de-Valley...",Produits d'acier et de métal; Producteur de zi...,Français,En tout temps,31 décembre,450-373-9144,,...,,,,,,,,,,


In [16]:
# Tabulate the foundation records (one row per record) and display the frame.
dffoundation_test = pd.DataFrame(testingfoundation)
dffoundation_test

Unnamed: 0,id,isFoundation,Name,Address,Langue,Catégorie,Contact,Tél,Échelledons,Total annuel,...,Projets privilégiés,Avis,Courriel,Web,ou,Limites géographiques,Téléc,Poste,Total actif,Limites géog
0,6,Yes,"AbbVie,","8401, rte Trans-Canadienne Saint-Laurent QC H4...",F,Fondation corporaative,"Direction, appui à la communauté",514-906-9700,25 000 à 250 000 $,5 050 000 $,...,,,,,,,,,,
1,32,Yes,"Air-Canada,","7373, boul. de la Côte-Vertu Saint-Laurent QC ...",F,Fondation corporative,"Mme Micheline Villeneuve, Gérante de la Fondat...",514-422-5973,,,...,,,foundation-fondation@aircanada.ca,https://www.aircanada.com/fondation,https://www.aircanada.com/fr/about/community/f...,,,,,
2,42,Yes,Alcoa Foundation,"390 Park Ave, 9th Floor New York NY 10022 USA",A,,"Ms Esra Ozer, President",412-553-4545,,"17,800,000 $",...,,,,www.alcoafoundation.com,http://www.alcoa.com/global/en/community/found...,,412-553-4498,,,
3,48,Yes,Fondation Memoria,"1115, rue Laurier O. Outremont QC H2V2L3",B,,"Madame Jeannette Rioux, Secrétaire",514-277-7778,1 000 $,1 000 $,...,,,,,,,514-908-1354,,,
4,65,Yes,Allstate Foundation of Canada,"27 Allstate Parkway Ave., Suite 100 Markham ON...",B,,"Mr Jeff Wickware, Executive Secretary & Treas...",905-475-4413,100 à 117 860 $,264 350 $,...,,,foundation@allstate.ca,www.allstate.ca,,,905-415-4899,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,1278,Yes,"Utramar Canada,","1155, boul. René-Lévesque O., bur. 3200 Montré...",B,,"Monsieur Christian Houle, Président",514-499-6494,10 à 531 400 $,1 173 800 :$,...,,,,http://www.ultramarcst.ca -ou- https://www.ul...,,,,,,
71,1282,Yes,,La Fondation Unilever a été révoquée volontair...,,,,,,,...,,,,,,,,,,
72,1286,Yes,"UPS Foundation,","c/o UPS Canada Ltd 6285 Northam Drive, Suite 4...",A,,"Mr Eduardo Martinez, President",800-742-5877,,93 500 000 $,...,,,community@ups.com,www.community.ups.com,http://www.community.ups.com/UPS+Foundation,,,,,
73,1302,Yes,"Velan Foundation,","7007, ch. Côte-de-Liesse Saint-Laurent QC H4T1G2",A,,"Mrs Olga Velan, Secretary",514-748-7743,300 à 300 000 $,611 565 :$,...,,,,,,,514-748-9593,,,


# Merging Columns with Similar Names


In [17]:
def similar(a, b):
    """Return the difflib similarity ratio (0.0 to 1.0) between sequences ``a`` and ``b``."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [18]:
# Compares every column name with every other one (quadratic in the number of columns).
def similarityColnames(df, threshold, verbose = True):
    """
    Group the column names of ``df`` that look alike.

    Parameters
    ----------
    df : object exposing an iterable ``columns`` attribute of names.
    threshold : minimum ``similar`` score for two names to be grouped.
    verbose : when True, print one numbered line per matching ordered pair.

    Returns
    -------
    list of list
        One alphabetically sorted group per column that has at least one
        look-alike; identical groups are reported once, in order of first
        appearance.
    """
    pair_counter = 0
    groups = []
    for anchor in df.columns:
        group = [anchor]
        for candidate in df.columns:
            if anchor == candidate:
                continue

            score = similar(anchor, candidate)
            if score >= threshold:
                pair_counter += 1
                group.append(candidate)
                if verbose:
                    print(f"{pair_counter}) {anchor} - {candidate}: {str(score)}")
        groups.append(group)

    # Keep only the columns that matched something, each group sorted alphabetically.
    sorted_groups = [sorted(group) for group in groups if len(group) > 1]

    # Drop repeated groups while keeping the first occurrence of each.
    unique_groups = []
    for group in sorted_groups:
        if group not in unique_groups:
            unique_groups.append(group)
    return unique_groups

In [19]:
# Group the column names of dforg_test whose similarity score is at least 0.8.
# verbose defaults to True, so every matching pair is printed with its score.
similarityscores = similarityColnames(dforg_test, 0.8)

1) n° de tél - 2e n° de tél: 0.8571428571428571
2) Domaine dintérêt - Domaines dintérêt: 0.9696969696969697
3) Domaine dintérêt - 2 - Domaines dintérêt: 0.8648648648648649
4) Limites géog - 2 - Limites géog: 0.8571428571428571
5) Contribution - 2 - Contribution: 0.8571428571428571
6) Courriel - 2 - Courriel: 0.8
7) 2e n° de tél - n° de tél: 0.8571428571428571
8) 2 - Courriel - Courriel: 0.8
9) Domaines dintérêt - Domaine dintérêt: 0.9696969696969697
10) Domaines dintérêt - 2 - Domaines dintérêt: 0.8947368421052632
11) Projets privilégiés - 2 - Projets privilégiés: 0.9047619047619048
12) Princip filiales - Princip Filiales: 0.9375
13) Princip filiales - Principale filiale: 0.8823529411764706
14) Princip filiales - Principales filiales: 0.8888888888888888
15) Princip filiales - Principfiliales: 0.967741935483871
16) 2 - Note - 2 - Poste: 0.8235294117647058
17) Date approb - 2 - Date approb: 0.8461538461538461
18) Princip Filiales - Princip filiales: 0.9375
19) Princip Filiales - Principa

In [20]:
# Display the groups of similar column names returned by similarityColnames.
similarityscores

[['2e n° de tél', 'n° de tél'],
 ['2 - Domaines dintérêt', 'Domaine dintérêt', 'Domaines dintérêt'],
 ['2 - Limites géog', 'Limites géog'],
 ['2 - Contribution', 'Contribution'],
 ['2 - Courriel', 'Courriel'],
 ['2 - Projets privilégiés', 'Projets privilégiés'],
 ['Princip Filiales',
  'Princip filiales',
  'Principale filiale',
  'Principales filiales',
  'Principfiliales'],
 ['2 - Note', '2 - Poste'],
 ['2 - Date approb', 'Date approb']]

In [24]:
def _mergeColumnGroup(df, group, target, drop):
    """
    Concatenates the string columns named in group into the column target
    (missing values count as empty strings) and returns the dataframe.
    When drop is True the source columns are removed, except target itself.
    """
    #Ordered, duplicate-free list so no column is concatenated twice
    names = list(dict.fromkeys(group))

    merged = df[names[0]].fillna('')
    for name in names[1:]:
        merged = merged + df[name].fillna('')
    df[target] = merged

    if drop:
        #Never drop the column that now holds the merged values
        df = df.drop([name for name in names if name != target], axis=1)
    return df


def _askChoice(prompt, choices, error):
    """
    Prompts with input() until the answer is one of choices, printing error
    after each invalid answer. Returns the accepted answer.
    """
    while True:
        answer = input(prompt)
        if answer in choices:
            return answer
        print(error)


def concatSimilarStringColumns(df, scores, threshold, drop = True, user_input = True):
    """ 
    Merges groups of similarly named string columns by concatenating their values
    Uses the output from similarityColnames
    Can be set to use user input or not. If no input from user, will merge every set of columns using first name in list
    Can be set to drop merged columns or not
    ------------
    df: dataframe holding the columns named in scores
    scores: list of lists of column names, as returned by similarityColnames
    threshold: unused, kept so existing calls keep working
    drop: if True, the columns merged into another column are dropped
    user_input: if True, asks (through input()) whether to merge each group and under which name
    ------------
    Returns a new dataframe; the dataframe passed in is left untouched.
    ------------
    NOTE: this is a naive concatenation: it does not check whether several columns
    of a group hold a value on the same row (such values are glued together).
    NOTE: if the new name provided by the user is the same as one of the old names,
    that column receives the merged values and is kept.
    ------------
    """
    #Work on a copy so the caller's dataframe is never left half-merged
    df = df.copy()

    for group in scores:
        #Nothing to merge for an empty group
        if not group:
            continue

        #Default: merge on the first name in the list
        target = group[0]

        if user_input:
            #Get user input for if they want to merge columns in similarity list or not
            mergeResponse = _askChoice(
                f"Do you want to merge the columns in this list: {group}? Y/N\n",
                ('Y','N'),
                "\nERROR: Please enter one of Y or N")

            #Go to next if merge not desired
            if mergeResponse == 'N':
                continue

            #Get user input for desired name of column
            nameResponse = _askChoice(
                "If you want to keep the first name in this list, press 1. Else, press 2.",
                ('1','2'),
                "\nERROR: Please enter one of 1 or 2")

            #If want to input a new name, merge into a column with inputed name
            if nameResponse == '2':
                target = input("Input desired name for column.")

        df = _mergeColumnGroup(df, group, target, drop)

    return df

In [25]:
# Interactive merge: with user_input = True the function asks Y/N for every group
# in similarityscores; drop = True removes the columns that were merged away.
df_merged = concatSimilarStringColumns(dforg_test, similarityscores, 0.8, drop = True, user_input = True)

Do you want to merge the columns in this list: ['2e n° de tél', 'n° de tél']? Y/N
N
Do you want to merge the columns in this list: ['2 - Domaines dintérêt', 'Domaine dintérêt', 'Domaines dintérêt']? Y/N
N
Do you want to merge the columns in this list: ['2 - Limites géog', 'Limites géog']? Y/N
N
Do you want to merge the columns in this list: ['2 - Contribution', 'Contribution']? Y/N
N
Do you want to merge the columns in this list: ['2 - Courriel', 'Courriel']? Y/N
N
Do you want to merge the columns in this list: ['2 - Projets privilégiés', 'Projets privilégiés']? Y/N
N
Do you want to merge the columns in this list: ['Princip Filiales', 'Princip filiales', 'Principale filiale', 'Principales filiales', 'Principfiliales']? Y/N
N
Do you want to merge the columns in this list: ['2 - Note', '2 - Poste']? Y/N
N
Do you want to merge the columns in this list: ['2 - Date approb', 'Date approb']? Y/N
N


### Save

In [115]:
####
# Save to Excel (.xlsx) workbooks
# NOTE(review): dfOrg_full and dfFoundation_full are not defined in this part of
# the notebook; confirm they exist when running top to bottom on a fresh kernel.
###
dfOrg_full.to_excel("../Data/Organizations.xlsx")
dfFoundation_full.to_excel("../Data/Foundations.xlsx")