In [69]:
import fitz
import string
import pandas as pd
from difflib import SequenceMatcher
import re
import unidecode

In [70]:
filepath_full = '../Data/REP-EDC-2020_Fusion_Final.pdf'

In [71]:
def openPDFasTextDict(filepath):
    """
    Opens PDF as XML dict
    """
    text_dict = []
    with fitz.open(filepath) as doc:
        for page in doc:
            text_dict.append(page.get_text("dict", sort=False))
    return text_dict

In [72]:
def removePuncandSpace(text):
    """
    Removes punctuation and spaces from a string
    """
    return text.translate(str.maketrans('', '', string.punctuation)).strip()

In [73]:
textDict_full = openPDFasTextDict(filepath = filepath_full)

In [74]:
def extractFromTextDict(text_dict, bannedStrings):
    ##
    # Get an idea of looping through text
    # For every page
    # Extract additional information about the text as well: font and font size
    # Store in list of dictionaries
    ##

    org_list = []
    foundation_list = []
    org_id = -1
    foundation_id = -1
    charitable_foundation = False

    for count_page, page in enumerate(text_dict):
        for count_block_list, block_list in enumerate(page["blocks"]):
            for count_line_list, line_list in enumerate(block_list["lines"]):
                for count_spans_list, spans_list in enumerate(line_list["spans"]):

                    #Remove empty text
                    if spans_list['text'].isspace():
                        continue
                    #Skip if trash text
                    if spans_list['text'].strip() in bannedStrings:
                        continue
                    
                    
                    ### Organizations ####
                    #Check if font & size are that of org number or new org
                    if (spans_list['font'] == 'Helvetica-Bold') & (int(float(spans_list['size'])) == 11):
                        charitable_foundation = False
                        #Check if start of new org
                        try :
                            #Throws ValueError if name of org
                            int(spans_list['text'])
                        except ValueError:
                            #Only triggers when name of org
                            org_list[org_id]['Name'] = spans_list['text'].strip()                   
                        else:
                            #If not name of org then org number
                            if (spans_list['font'] == 'Helvetica-Bold') & (int(float(spans_list['size'])) == 11):
                                org_number = spans_list['text'].strip()
                                org_list.append({'id' : org_number,
                                                 'isFoundation' : 'No'})
                                org_id += 1

                    #Check if not in charitable organisation
                    if not charitable_foundation:

                        #Check if font & size are that of org address
                        #Uses round to filter more text: other text has size that rounds to 8
                        if (spans_list['font'] == 'Helvetica') & (round(float(spans_list['size'])) == 9):
                            #Catch if no orgs created
                            if org_id < 0:
                                continue
                            #If key Address doesn't already exist, create it
                            if 'Address' not in org_list[org_id].keys():
                                org_list[org_id]['Address'] = ''
                                org_list[org_id]['Address'] += spans_list['text']
                            else:
                                #Strip here to avoid unnecessary blank space
                                #Maybe handle this later?
                                org_list[org_id]['Address'] += spans_list['text'].strip()

                        #Check if font & size are that of field name
                        if (spans_list['font'] == 'ArialNarrow') & (int(float(spans_list['size'])) == 8):
                            #Catch if no orgs created
                            if org_id < 0:
                                continue
                            #If key field doesn't already exist, create it. Checks if length string > 1 to remove bad text
                            if (removePuncandSpace(spans_list['text']) not in org_list[org_id].keys()) & (len(removePuncandSpace(spans_list['text'])) > 1):
                                org_list[org_id][removePuncandSpace(spans_list['text'])] = ''
                            #If field already exists, create new field with convention i - Name where i is number of fields with the same name +1
                            elif (removePuncandSpace(spans_list['text']) in org_list[org_id].keys()):
                                num_instances = list(org_list[org_id].keys()).count(removePuncandSpace(spans_list['text']))
                                org_list[org_id][f"{num_instances + 1} - {removePuncandSpace(spans_list['text'])}"] = ''

                        #Check if font & size are that of field text
                        if (spans_list['font'] == 'Helvetica-Bold') & (round(float(spans_list['size'])) == 8):
                            #Catch if no orgs created
                            if org_id < 0:
                                continue

                            #Place in last dict key: will always be something there, non-generalizable method
                            org_list[org_id][list( org_list[org_id])[-1]] += spans_list['text']

                    ### Foundations ####
                    #Check if text indicates charitable foundation
                    if (spans_list['font'] == 'ArialNarrow') & (round(float(spans_list['size'])) == 7) & (spans_list['text'][:23] == "L'entreprise possède un"): 

                        charitable_foundation = True
                        
                        #Foundations always start with lines of Helvetica Bold.  
                        #Use that as a trigger with the boolean var start_foundation
                        start_foundation = True
                        foundation_list.append({'id' : org_number,
                                         'isFoundation' : 'Yes'})
                        foundation_id +=1

                    #Check if are in charitable foundation
                    if charitable_foundation:
                        #Trigger for name and address to differentiate from other text
                        if start_foundation:
                            #Check if font & size are foundation name
                            if (spans_list['font'] == 'Helvetica-Bold') & (round(float(spans_list['size'])) >= 9):
                                 #If key Name doesn't already exist, create it
                                if 'Name' not in foundation_list[foundation_id].keys():
                                    foundation_list[foundation_id]['Name'] = ''
                                    foundation_list[foundation_id]['Name'] += spans_list['text']
                                    
                                    lineToSkip = count_line_list
                                #else:
                                    #Strip here to avoid unnecessary blank space
                                    #foundation_list[foundation_id]['Name'] += spans_list['text'].strip()

                            #Check if font & size are address
                            if (spans_list['font'] == 'Helvetica-Bold') & ((round(float(spans_list['size'])) == 8) | (round(float(spans_list['size'])) >= 9)):
                                #Check if are on different line than Name, meaning are on Address line
                                if count_line_list > lineToSkip:
                                     #If key Address doesn't already exist, create it
                                    if 'Address' not in foundation_list[foundation_id].keys():
                                        foundation_list[foundation_id]['Address'] = ''
                                        foundation_list[foundation_id]['Address'] += spans_list['text'].strip()
                                    else:
                                        foundation_list[foundation_id]['Address'] += ' ' +spans_list['text'].strip()



                        #Check if font & size are that of field name
                        #Outside of if start_foundation
                        if ((spans_list['font'] == 'ArialNarrow') or (spans_list['font'] == 'Helvetica')) & (int(float(spans_list['size'])) == 8):
                            #Trigger on first catch of non-address text
                            start_foundation = False
                            
                            #Catch if no orgs created
                            if foundation_id < 0:
                                continue
                            #If key field doesn't already exist, create it. Checks if length string > 1 to remove bad text
                            if (removePuncandSpace(spans_list['text']) not in foundation_list[foundation_id].keys()) & (len(removePuncandSpace(spans_list['text'])) > 1):
                                foundation_list[foundation_id][removePuncandSpace(spans_list['text'])] = ''
                        
                        #Check if outside of adress
                        if not start_foundation:
                            #Check if font & size are that of field text
                            if ((spans_list['font'] == 'ArialNarrow,Bold') or (spans_list['font'] == 'Helvetica-Bold')) & (round(float(spans_list['size'])) == 8):
                                #Catch if no orgs created
                                if foundation_id < 0:
                                    continue

                                #Place in last dict key: will always be something there, non-generalizable method
                                foundation_list[foundation_id][list( foundation_list[foundation_id])[-1]] += spans_list['text']

    return org_list, foundation_list


In [75]:
bannedStrings = ['Entreprises donatrices et commanditaires du Québec',
                "« DDD » : Date de distribution des dons. « FAF » : date de fin d'année fiscale.  « Langue : B »  dans l’inscription d’une fondation indique que vous pouvez écrire en français ou en anglais.",
                "Tous droits réservés © 2020– Centre québécois de philanthropie"]

In [76]:
extractedOrgList_full, extractedFoundationList_full = extractFromTextDict(textDict_full, bannedStrings)

In [77]:
dfOrg_full = pd.DataFrame(extractedOrgList_full)
dfFoundation_full = pd.DataFrame(extractedFoundationList_full)

In [78]:
dfOrg_full.columns

Index(['id', 'isFoundation', 'Name', 'Address', 'Secteur industriel',
       'Langue de comm', 'DDD', 'FAF', 'N° de télCie', 'n° de tél', 'Site Web',
       'Domaine dintérêt', 'Limites géog', 'Note', 'Nombre demployés',
       'Contribution', 'Contact', 'Tél', 'Poste', 'Courriel', 'Avis',
       'N° de faxCie', '2e n° de tél', 'Courriel  Cie', 'Fax',
       '2e contact pour', '2 - Tél', '2 - Fax', 'Filiale de', '2 - Courriel',
       'Nbre de succ', 'Princip filiales', '2 - Note', 'Princip Filiales',
       'Principale filiale', 'Filiales princip', 'Principales filiales',
       'Principfiliales', '2 - Contribution', '2 - Contact', '2 - Poste'],
      dtype='object')

In [79]:
dfFoundation_full.columns

Index(['id', 'isFoundation', 'Name', 'Address', 'Langue', 'Catégorie',
       'Contact', 'Tél', 'Échelledons', 'Total annuel', 'Actif', 'Date approb',
       'Date fin d’ann', 'Domaines dintérêt', 'Projets privilégiés', 'Avis',
       'Courriel', 'Web', 'ou', 'Limites géographiques', 'Téléc', 'Poste',
       'Total actif', 'Limites géog'],
      dtype='object')

In [80]:
#Checking null values
dfOrg_full.isna().sum()

id                         0
isFoundation               0
Name                       0
Address                    0
Secteur industriel         0
Langue de comm             3
DDD                       19
FAF                      519
N° de télCie               6
n° de tél               1223
Site Web                  10
Domaine dintérêt           6
Limites géog              12
Note                     597
Nombre demployés         339
Contribution               0
Contact                    6
Tél                       38
Poste                   1158
Courriel                 318
Avis                      34
N° de faxCie             307
2e n° de tél             669
Courriel  Cie            547
Fax                      416
2e contact pour         1315
2 - Tél                 1333
2 - Fax                 1338
Filiale de              1009
2 - Courriel            1328
Nbre de succ             835
Princip filiales        1138
2 - Note                1212
Princip Filiales        1341
Principale fil

In [81]:
dfFoundation_full.isna().sum()

id                        0
isFoundation              0
Name                      3
Address                   1
Langue                    4
Catégorie                10
Contact                  10
Tél                      11
Échelledons              29
Total annuel             17
Actif                    24
Date approb              16
Date fin d’ann           16
Domaines dintérêt         4
Projets privilégiés       8
Avis                     18
Courriel                 33
Web                      24
ou                       73
Limites géographiques    72
Téléc                    24
Poste                    78
Total actif              80
Limites géog             80
dtype: int64

In [82]:
dfFoundation_full[dfFoundation_full.Name.isna() == True]

Unnamed: 0,id,isFoundation,Name,Address,Langue,Catégorie,Contact,Tél,Échelledons,Total annuel,...,Projets privilégiés,Avis,Courriel,Web,ou,Limites géographiques,Téléc,Poste,Total actif,Limites géog
19,281,Yes,,,,,,,,,...,,,,,,,,,,
72,1186,Yes,,"Voir la Fondation St-Hubert, plus haut.",,,,,,,...,,,,,,,,,,
76,1282,Yes,,La Fondation Unilever a été révoquée volontair...,,,,,,,...,,,,,,,,,,


In [83]:
#Drop the foundations where name is null
dfFoundation_full = dfFoundation_full[dfFoundation_full.Name.isna() == False]

In [84]:
dfFoundation_full[dfFoundation_full.Address.isna() == True]

Unnamed: 0,id,isFoundation,Name,Address,Langue,Catégorie,Contact,Tél,Échelledons,Total annuel,...,Projets privilégiés,Avis,Courriel,Web,ou,Limites géographiques,Téléc,Poste,Total actif,Limites géog


### Cleaning column names and merging equivalent fields

In [95]:
def concat_cols(df_original, df_cols, df_new):

    for _, x in df_cols.iterrows():
        new_col_name = x['new_col']
        cols_to_concat = x['colnames']

        if len(cols_to_concat) > 1:
            df_new[new_col_name] =  df_original[cols_to_concat].fillna('').agg('; '.join, axis=1)
        else:
            df_new[new_col_name] = df_original[cols_to_concat[0]]
    
    return df_new


def clean_colnames(string):
    string_c = re.sub(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ/]+', '', string).upper()
    string_c = unidecode.unidecode(string_c)
    #string_c = string_c.replace(' ','_')
    return string_c

def remove_extra_sep_tokens(df_original, df_cols, pattern = '; '):

    cols_concatted = df_cols[ df_cols.apply(lambda x: len(x['colnames']),axis=1) >1].new_col.tolist()
    sep_list = [pattern*i for i in range(2,10)]
    df_new = df_original.copy()
    
    for col in cols_concatted:
        for str1 in sep_list[::-1]:
            df_new.loc[:,col] = df_new.loc[:,col].str.replace(str1, pattern)

    df_new[cols_concatted] = df_new[cols_concatted].applymap(lambda x: x.strip(pattern) if isinstance(x, str) else x)

    return df_new


In [96]:
colnames = dfOrg_full.columns

clean_cols = [clean_colnames(i) for i in colnames]

dfOrg_full.columns = clean_cols

# pd.DataFrame({'colnames':clean_cols}).to_csv('Colnames.csv', index=None) 
# to label them in excel

In [97]:
df_groupings = pd.read_csv('Colnames_orgs_std.csv')

df_groupings = df_groupings.groupby('new_col')['colnames'].apply(list).reset_index()

df_orgs = dfOrg_full[['ID']].copy()

df_orgs = concat_cols(dfOrg_full, df_groupings, df_orgs)

df_orgs = remove_extra_sep_tokens(df_orgs, df_groupings)

In [98]:
df_orgs.columns

Index(['ID', '2E CONTACT POUR', 'ADDRESS', 'AVIS', 'CONTACT', 'CONTRIBUTION',
       'COURRIEL', 'DDD', 'DOMAINE DINTERET', 'FAF', 'FAX', 'FILIALE DE',
       'ISFOUNDATION', 'LANGUE', 'LIMITES GEOG', 'N DE TEL', 'NAME',
       'NBRE DE SUCC', 'NOMBRE DEMPLOYES', 'NOTE', 'POSTE', 'PRINCIP FILIALES',
       'SECTEUR INDUSTRIEL', 'SITE WEB'],
      dtype='object')

In [99]:
# same process but for foundations
colnames = dfFoundation_full.columns

clean_cols = [clean_colnames(i) for i in colnames]

dfFoundation_full.columns = clean_cols

#pd.DataFrame({'colnames':clean_cols}).to_csv('Colnames_foundations.csv', index=None)


In [100]:
df_groupings = pd.read_csv('Colnames_foundations_std.csv')

df_groupings = df_groupings.groupby('new_col')['colnames'].apply(list).reset_index()

df_fonds = dfFoundation_full[['ID']].copy()

df_fonds = concat_cols(dfFoundation_full, df_groupings, df_fonds)

df_fonds = remove_extra_sep_tokens(df_fonds, df_groupings)

### Save

In [103]:
print(df_orgs.shape)
print(df_fonds.shape)

(1342, 24)
(78, 22)


In [104]:
####
# Save to csv
###
df_orgs.to_excel("../Data/Organizations.xlsx")
df_fonds.to_excel("../Data/Foundations.xlsx")

# Merging for Similar Column Names
# - Deprecated -

In [90]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [91]:
#Very inefficient but it probably works :)
def similarityColnames(df, threshold, verbose = True):
    i = 0
    output = []
    for j, col1 in enumerate(df.columns):
        output.append([col1])
        for col2 in df.columns:
            if col1 == col2:
                continue

            if similar(col1, col2) >= threshold:
                i += 1
                output[j].append(col2)
                if verbose:
                    print(f"{i}) {col1} - {col2}: {str(similar(col1,col2))}")
    
    #Remove 1 element lists and sort alphabetically
    output = [sorted(nested) for nested in output if len(nested)>1]
    #Remove duplicates
    cleaned_output = []
    for elem in output:
        if elem not in cleaned_output:
            cleaned_output.append(elem)
    return cleaned_output

In [93]:
similarityscores = similarityColnames(dfOrg_full, 0.8)

1) n° de tél - 2e n° de tél: 0.8571428571428571
2) Domaine dintérêt - Domaines dintérêt: 0.9696969696969697
3) Contribution - 2 - Contribution: 0.8571428571428571
4) Courriel - 2 - Courriel: 0.8
5) 2e n° de tél - n° de tél: 0.8571428571428571
6) 2 - Courriel - Courriel: 0.8
7) Princip filiales - Princip Filiales: 0.9375
8) Princip filiales - Principale filiale: 0.8823529411764706
9) Princip filiales - Principales filiales: 0.8888888888888888
10) Princip filiales - Principfiliales: 0.967741935483871
11) 2 - Note - 2 - Poste: 0.8235294117647058
12) Princip Filiales - Princip filiales: 0.9375
13) Princip Filiales - Principale filiale: 0.8235294117647058
14) Princip Filiales - Principales filiales: 0.8333333333333334
15) Princip Filiales - Principfiliales: 0.9032258064516129
16) Principale filiale - Princip filiales: 0.8823529411764706
17) Principale filiale - Princip Filiales: 0.8235294117647058
18) Principale filiale - Principales filiales: 0.9473684210526315
19) Principale filiale - Pri

In [94]:
similarityscores

[['2e n° de tél', 'n° de tél'],
 ['Domaine dintérêt', 'Domaines dintérêt'],
 ['2 - Contribution', 'Contribution'],
 ['2 - Courriel', 'Courriel'],
 ['Princip Filiales',
  'Princip filiales',
  'Principale filiale',
  'Principales filiales',
  'Principfiliales'],
 ['2 - Note', '2 - Poste']]

In [95]:
#Dumb way of concating similar columns with a threshold: doesn't check if there are values in both columns
def concatSimilarStringColumns(df, scores, threshold, drop = True, user_input = True):
    """ 
    Uses the output from similarityColnames
    Can be set to use user input or not. If no input from user, will merge every set of columns using first name in list
    Can be set to drop merged columns or not
    ------------
    NOTE: edge case exists where new name provided by user is same as old name. 
    ------------
    """
    
    #If user doesn't want to input anything
    if not user_input:
        while True:
            i = 0

            #Merge        
            #If want to keep first name in list, merge on col of that name
            df[scores[i][0]] = df[scores[i][0]].fillna('')
            for name in scores[i]:
                if name != scores[i][0]:
                    df[scores[i][0]] += df[name].fillna('')


            #Then need to drop the merged columns
            if drop:
                df = df.drop(scores[i][1:], axis=1)

            #Once done with merge, up counter
            i+=1

            #Once done with every column, finish
            if i == len(scores):
                break
                
    #If user_input
    else:
        i = 0
        while True:
            #Get user input for if they want to merge columns in similarity list or not
            mergeResponse = input(f"Do you want to merge the columns in this list: {scores[i]}? Y/N\n")
            if mergeResponse not in ('Y','N'):
                print("\nERROR: Please enter one of Y or N")
                continue

            #Go to next if merge not desired
            if mergeResponse == 'N':
                i+=1
                continue

            #Merge
            else:
                #Get user input for desired name of column
                while True:
                    nameResponse = input("If you want to keep the first name in this list, press 1. Else, press 2.")
                    if nameResponse not in ('1','2'):
                        print("\nERROR: Please enter one of 1 or 2")
                        continue
                    else:
                        break

                #If want to keep first name in list, merge on col of that name
                if nameResponse == '1':
                    df[scores[i][0]] = df[scores[i][0]].fillna('')
                    for name in scores[i]:
                        if name != scores[i][0]:
                            df[scores[i][0]] += df[name].fillna('')


                    #Then need to drop the merged columns
                    if drop:
                        df = df.drop(scores[i][1:], axis=1)
                #If want to input a new name, merge by creating new column with inputed name
                else:
                    newName = input("Input desired name for column.")

                    df[newName] = df[scores[i][0]].fillna('')
                    for name in scores[i]:
                        df[newName] += df[name].fillna('')

                    #Then need to drop the merged columns
                    if drop:
                        df = df.drop(scores[i], axis=1)


                #Once done with merge, up counter
                i+=1

            #Once done with every column, finish
            if i == len(scores):
                    break

    return df

In [96]:
df_merged = concatSimilarStringColumns(dfOrg_full, similarityscores, 0.8, drop = True, user_input = True)

Do you want to merge the columns in this list: ['2e n° de tél', 'n° de tél']? Y/N
N
Do you want to merge the columns in this list: ['Domaine dintérêt', 'Domaines dintérêt']? Y/N
Y
If you want to keep the first name in this list, press 1. Else, press 2.1
Do you want to merge the columns in this list: ['2 - Contribution', 'Contribution']? Y/N
N
Do you want to merge the columns in this list: ['2 - Courriel', 'Courriel']? Y/N
N
Do you want to merge the columns in this list: ['Princip Filiales', 'Princip filiales', 'Principale filiale', 'Principales filiales', 'Principfiliales']? Y/N
Y
If you want to keep the first name in this list, press 1. Else, press 2.2
Input desired name for column.Principales Filiales
Do you want to merge the columns in this list: ['2 - Note', '2 - Poste']? Y/N
N


IndexError: list index out of range