In [1]:
import fitz
import string
import pandas as pd
from difflib import SequenceMatcher

In [2]:
# Input PDFs, given as paths relative to the notebook's working directory.
# NOTE(review): going by the file names, the first is presumably a pages 1-5 excerpt of the second - confirm.
filepath_1to5 = r'REP-EDC-2020_Fusion_Final-1-5.pdf'
filepath_full = r'REP-EDC-2020_Fusion_Final.pdf'

In [3]:
def openPDFasTextDict(filepath):
    """
    Reads the PDF at `filepath` and returns a list holding one text dictionary
    per page, as produced by page.get_text("dict", sort=False).
    The document is closed again once every page has been read.
    """
    with fitz.open(filepath) as pdf:
        page_dicts = [pdf_page.get_text("dict", sort=False) for pdf_page in pdf]
    return page_dicts

In [4]:
def removePuncandSpace(text):
    """
    Deletes every character of string.punctuation from the text, then trims
    leading and trailing whitespace. Whitespace inside the text is kept.
    """
    #Mapping a code point to None makes str.translate delete that character
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    without_punctuation = text.translate(deletion_table)
    return without_punctuation.strip()

In [12]:
# Read both PDFs into lists of per-page text dictionaries
textDict_1to5 = openPDFasTextDict(filepath = filepath_1to5)
textDict_full = openPDFasTextDict(filepath = filepath_full)

In [81]:
def _appendSpanText(record, key, text, separator=''):
    """
    Stores span text under `key`: the first span is kept verbatim, later spans
    are stripped and appended after `separator`.
    """
    if key not in record:
        record[key] = text
    else:
        #Strip here to avoid unnecessary blank space
        record[key] += separator + text.strip()


def _registerFieldName(record, label):
    """
    Adds an empty field to `record`, keyed by the label with punctuation removed.
    Labels already present, or of at most one character once cleaned, are ignored.
    """
    field_name = removePuncandSpace(label)
    if field_name not in record and len(field_name) > 1:
        record[field_name] = ''


def _appendToLastField(record, text):
    """
    Appends text to the most recently created key of `record`.
    """
    #Non-generalizable: relies on dict insertion order, so the value lands in
    #whichever key was created last (normally the field name read just before)
    last_key = list(record)[-1]
    record[last_key] += text


def extractFromTextDict(text_dict):
    """
    Walks every page / block / line / span of a list of per-page text dicts and
    builds one record (dict) per organisation and per corporate foundation,
    classifying each span by its font name and font size.

    Parameters
    ----------
    text_dict : list of dict
        One dict per page; each needs a "blocks" list whose text blocks hold
        "lines", whose lines hold "spans" with 'text', 'font' and 'size' keys.

    Returns
    -------
    list of dict
        Records with at least 'id' and 'isFoundation' ('No' / 'Yes'), plus
        'Name', 'Address' and one key per field label met for that record.
        A foundation record reuses the 'id' of the organisation it follows.

    Notes
    -----
    Spans met before the first organisation number are ignored: there is no
    record to attach them to.
    """
    foundation_marker = "L'entreprise possède une fondation corporative :"

    org_list = []
    org_number = None
    charitable_foundation = False

    for page in text_dict:
        for block in page["blocks"]:
            #Blocks without a "lines" key (e.g. image blocks) carry no text
            for line in block.get("lines", []):
                for span in line["spans"]:
                    text = span['text']

                    #Remove empty text
                    if text.isspace():
                        continue

                    font = span['font']
                    size = float(span['size'])
                    #Both truncation and rounding are used on purpose: rounding
                    #filters more text, as other text has sizes that round to 8
                    size_truncated = int(size)
                    size_rounded = round(size)

                    #Font & size of an org number or an org name
                    if font == 'Helvetica-Bold' and size_truncated == 11:
                        charitable_foundation = False
                        try:
                            #Throws ValueError if name of org
                            int(text)
                        except ValueError:
                            #A name can only be stored once an org exists
                            if org_list:
                                org_list[-1]['Name'] = text.strip()
                        else:
                            #Integer text: start of a new org
                            org_number = text.strip()
                            org_list.append({'id' : org_number,
                                             'isFoundation' : 'No'})

                    #Catch if no orgs created: nothing to attach the span to
                    if not org_list:
                        continue

                    if not charitable_foundation:
                        record = org_list[-1]
                        #Org address
                        if font == 'Helvetica' and size_rounded == 9:
                            _appendSpanText(record, 'Address', text)
                        #Field name
                        if font == 'ArialNarrow' and size_truncated == 8:
                            _registerFieldName(record, text)
                        #Field text
                        if font == 'Helvetica-Bold' and size_rounded == 8:
                            _appendToLastField(record, text)

                    ### Foundations ####
                    #Check if text indicates charitable foundation
                    if text == foundation_marker:
                        charitable_foundation = True
                        org_list.append({'id' : org_number,
                                         'isFoundation' : 'Yes'})

                    if charitable_foundation:
                        record = org_list[-1]
                        #Foundation name
                        if font == 'Helvetica-Bold' and size_rounded >= 9:
                            _appendSpanText(record, 'Name', text)
                        #Foundation address: parts are joined with ', '
                        if font == 'Helvetica-Bold' and size_truncated == 8:
                            _appendSpanText(record, 'Address', text, separator=', ')
                        #Field name
                        if font == 'ArialNarrow' and size_truncated == 8:
                            _registerFieldName(record, text)
                        #Field text
                        if font == 'ArialNarrow,Bold' and size_rounded == 8:
                            _appendToLastField(record, text)
    return org_list


In [82]:
# Build the list of record dicts from the 1-5 text dicts
extractedList_1to5 = extractFromTextDict(textDict_1to5)

In [83]:
# Build the list of record dicts from the full document's text dicts
extractedList_full = extractFromTextDict(textDict_full)

In [84]:
# One row per extracted record; the union of the record keys becomes the columns
df_1to5 = pd.DataFrame(extractedList_1to5)

In [85]:
# One row per extracted record; the union of the record keys becomes the columns
df_full = pd.DataFrame(extractedList_full)

In [86]:
df_1to5.head()

Unnamed: 0,id,isFoundation,Name,Address,Secteur industriel,Langue de comm,DDD,FAF,N° de télCie,n° de tél,...,N° de faxCie,2e n° de tél,Courriel Cie,Fax,2e contact pour,Filiale de,Domaines dintérêt,Projets privilégiés,Nbre de succ,Princip filiales
0,1,No,3M Canada inc.,"7290, rue Frederick Banting Saint-Laurent QC ...",Équipement et services industriels; Fabricatio...,Français,En tout temps,31 décembre,514-336-5252,800-265-1840,...,,,,,,,,,,
1,2,No,A & D Prévost,"305, 12e Avenue Richelieu QC J3L3T2","Matériaux de construction, manufacturiers; Fab...",Français,En tout temps,31 décembre,450-658-8771,,...,450-658-0077,800-361-4433,info@prevost-architectural.com,450-658-0077,"DonsMadame Marie-Josée Dery, Coordonnatrice Ca...",,,,,
2,3,No,AAR Aicraft - Services Trois-Rivières,"3750, chemin de l'aéroport Trois-Rivières QC...","Offre des services d'entretien, de réparation ...",Français,En tout temps,31 décembre,819-377-4500,,...,819- 668-8811,,comptabilite@aarcorp.com,,,,,,,
3,4,No,ABB (Albert Bob Bob),800 boul. Hymes Saint-Laurent QC H4S0B5,Équipement et services industriels; Fabriquer ...,Français,En tout temps,,514-856-6222,,...,514-856-6297,,,514-856-6297,,Baldor-Dodge-Reliance,,,,
4,5,No,ABB Canada,800 boul. Hymes Saint-Laurent QC H4S0B5,Commerce de gros et détail; Grossistes-distrib...,Français,En tout temps,31 décembre,438-843-6000,888-856-6266,...,514-856-6297,,contact.center@ca.abb.com,514-856-6297,,ABB Bomem,,,,


In [87]:
df_full.head()

Unnamed: 0,id,isFoundation,Name,Address,Secteur industriel,Langue de comm,DDD,FAF,N° de télCie,n° de tél,...,Date approb,Échelledons,Actif,Princip Filiales,Principale filiale,Filiales princip,Principales filiales,Total actif,Principfiliales,ou
0,1,No,3M Canada inc.,"7290, rue Frederick Banting Saint-Laurent QC ...",Équipement et services industriels; Fabricatio...,Français,En tout temps,31 décembre,514-336-5252,800-265-1840,...,,,,,,,,,,
1,2,No,A & D Prévost,"305, 12e Avenue Richelieu QC J3L3T2","Matériaux de construction, manufacturiers; Fab...",Français,En tout temps,31 décembre,450-658-8771,,...,,,,,,,,,,
2,3,No,AAR Aicraft - Services Trois-Rivières,"3750, chemin de l'aéroport Trois-Rivières QC...","Offre des services d'entretien, de réparation ...",Français,En tout temps,31 décembre,819-377-4500,,...,,,,,,,,,,
3,4,No,ABB (Albert Bob Bob),800 boul. Hymes Saint-Laurent QC H4S0B5,Équipement et services industriels; Fabriquer ...,Français,En tout temps,,514-856-6222,,...,,,,,,,,,,
4,5,No,ABB Canada,800 boul. Hymes Saint-Laurent QC H4S0B5,Commerce de gros et détail; Grossistes-distrib...,Français,En tout temps,31 décembre,438-843-6000,888-856-6266,...,,,,,,,,,,


In [88]:
def similar(a, b):
    """
    Returns the difflib SequenceMatcher similarity ratio of a and b
    (a float between 0 and 1; no junk filter is applied).
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [89]:
#Quadratic in the number of columns: every ordered pair of column names is compared
def similarityColnames(df, threshold, verbose = True):
    """
    Groups the column names of df that look alike.

    Every column name is compared against every other one with similar();
    names scoring at least `threshold` are collected together with the name
    they were compared to. When `verbose` is set, each match is printed with
    a running counter and its score.

    Returns a list of alphabetically sorted groups (lists of two or more
    names), without duplicate groups, in order of first appearance.
    """
    names = list(df.columns)
    match_count = 0
    groups = []

    for name in names:
        group = [name]
        for candidate in names:
            if candidate == name:
                continue

            score = similar(name, candidate)
            if score >= threshold:
                match_count += 1
                group.append(candidate)
                if verbose:
                    print(f"{match_count}) {name} - {candidate}: {score}")

        #Keep only names that matched something, sorted so duplicates compare equal
        if len(group) > 1:
            group.sort()
            if group not in groups:
                groups.append(group)

    return groups

In [90]:
# Group the column names of df_full whose similarity ratio is at least 0.8 (verbose by default, so each match is printed)
similarityscores = similarityColnames(df_full, 0.8)

1) n° de tél - 2e n° de tél: 0.8571428571428571
2) Domaine dintérêt - Domaines dintérêt: 0.9696969696969697
3) 2e n° de tél - n° de tél: 0.8571428571428571
4) Domaines dintérêt - Domaine dintérêt: 0.9696969696969697
5) Princip filiales - Princip Filiales: 0.9375
6) Princip filiales - Principale filiale: 0.8823529411764706
7) Princip filiales - Principales filiales: 0.8888888888888888
8) Princip filiales - Principfiliales: 0.967741935483871
9) Princip Filiales - Princip filiales: 0.9375
10) Princip Filiales - Principale filiale: 0.8235294117647058
11) Princip Filiales - Principales filiales: 0.8333333333333334
12) Princip Filiales - Principfiliales: 0.9032258064516129
13) Principale filiale - Princip filiales: 0.8823529411764706
14) Principale filiale - Princip Filiales: 0.8235294117647058
15) Principale filiale - Principales filiales: 0.9473684210526315
16) Principale filiale - Principfiliales: 0.8484848484848485
17) Principales filiales - Princip filiales: 0.8888888888888888
18) Princ

In [91]:
similarityscores

[['2e n° de tél', 'n° de tél'],
 ['Domaine dintérêt', 'Domaines dintérêt'],
 ['Princip Filiales',
  'Princip filiales',
  'Principale filiale',
  'Principales filiales',
  'Principfiliales']]

In [127]:
#Naive concatenation of similar columns: values are joined as-is, without checking whether several of the columns hold a value for the same row
def _promptChoice(prompt, choices, error_message):
    """
    Asks the user `prompt` until the answer is one of `choices`, printing
    `error_message` after every invalid answer. Returns the accepted answer.
    """
    while True:
        answer = input(prompt)
        if answer in choices:
            return answer
        print(error_message)


def _mergeColumnGroup(df, group, target, drop):
    """
    Concatenates the string columns named in `group` (missing values count as
    '') into column `target` and returns the resulting frame. With `drop`,
    every column of the group other than `target` is removed.
    """
    first = group[0]
    merged = df[first].fillna('')
    for name in group:
        #The first column seeds the result, so it must not be added again
        if name != first:
            merged = merged + df[name].fillna('')
    df[target] = merged

    if drop:
        #Never drop the column that now holds the merged values
        df = df.drop(columns=[name for name in group if name != target])
    return df


def concatSimilarStringColumns(df, scores, threshold, drop = True, user_input = True):
    """
    Merges groups of similar string columns by concatenating their values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to merge. It is not modified; a copy is
        worked on and returned.
    scores : list of list of str
        Groups of column names, as returned by similarityColnames.
    threshold : float
        Unused; kept so existing calls keep working.
    drop : bool
        If True, the columns merged into another column are dropped.
    user_input : bool
        If True, the user is asked for every group whether to merge it, and
        whether to keep the first name of the group or type a new name.
        If False, every group is merged into the first name of its list.

    Returns
    -------
    pandas.DataFrame
        The frame with the merged columns.

    Notes
    -----
    Values are concatenated without a separator, and no check is made for rows
    where several columns of a group hold a value.
    A new name equal to one of the group's names is allowed: that column
    receives the merged values and is kept.
    """
    #Work on a copy so the caller's frame keeps its original columns and values
    df = df.copy()

    for group in scores:
        target = group[0]

        if user_input:
            mergeResponse = _promptChoice(
                f"Do you want to merge the columns in this list: {group}? Y/N\n",
                ('Y','N'),
                "\nERROR: Please enter one of Y or N")
            #Go to next if merge not desired
            if mergeResponse == 'N':
                continue

            nameResponse = _promptChoice(
                "If you want to keep the first name in this list, press 1. Else, press 2.",
                ('1','2'),
                "\nERROR: Please enter one of 1 or 2")
            if nameResponse == '2':
                target = input("Input desired name for column.")

        df = _mergeColumnGroup(df, group, target, drop)

    return df

In [128]:
# Merge the similar column groups of df_full; with user_input=True each group is confirmed through input() prompts
df_merged = concatSimilarStringColumns(df_full, similarityscores, 0.8, drop = True, user_input = True)

Do you want to merge the columns in this list: ['2e n° de tél', 'n° de tél']? Y/N
N
Do you want to merge the columns in this list: ['Domaine dintérêt', 'Domaines dintérêt']? Y/N
Y
If you want to keep the first name in this list, press 1. Else, press 2.1
Do you want to merge the columns in this list: ['Princip Filiales', 'Princip filiales', 'Principale filiale', 'Principales filiales', 'Principfiliales']? Y/N
Y
If you want to keep the first name in this list, press 1. Else, press 2.N

ERROR: Please enter one of 1 or 2
If you want to keep the first name in this list, press 1. Else, press 2.FUck off 

ERROR: Please enter one of 1 or 2
If you want to keep the first name in this list, press 1. Else, press 2.2


KeyboardInterrupt: Interrupted by user