In [8]:
import fitz
import string
import pandas as pd
from difflib import SequenceMatcher

In [9]:
filepath_1to5 = r'REP-EDC-2020_Fusion_Final-1-5.pdf'
filepath_full = r'REP-EDC-2020_Fusion_Final.pdf'

In [10]:
def openPDFasTextDict(filepath):
    """
    Opens PDF as XML dict
    """
    text_dict = []
    with fitz.open(filepath) as doc:
        for page in doc:
            text_dict.append(page.get_text("dict", sort=False))
    return text_dict

In [11]:
def removePuncandSpace(text):
    """
    Removes punctuation and spaces from a string
    """
    return text.translate(str.maketrans('', '', string.punctuation)).strip()

In [12]:
removePuncandSpace("yo!")

'yo'

In [13]:
def extractFromTextDict(text_dict):
    """
    Extracts desired text from a text dictionary
    Returns list of dicts
    """
    org_list = []
    org_id = -1
    org_name = False

    for count_page, page in enumerate(text_dict):
        for count_block_list, block_list in enumerate(page["blocks"]):
            for count_line_list, line_list in enumerate(block_list["lines"]):
                for count_spans_list, spans_list in enumerate(line_list["spans"]):

                    #Remove empty text
                    if spans_list['text'].isspace():
                        continue

                    #Check if font & size are that of org number or new org
                    if (spans_list['font'] == 'Helvetica-Bold') & (int(float(spans_list['size'])) == 11):
                        #Check if start of new org
                        try :
                            #Throws ValueError if name of org
                            int(spans_list['text'])
                        except ValueError:
                            #Only triggers when name of org
                            org_list[org_id]['Name'] = spans_list['text'].strip()                   
                        else:
                            #If not name of org then org number
                            if (spans_list['font'] == 'Helvetica-Bold') & (int(float(spans_list['size'])) == 11):
                                org_list.append({'id' : spans_list['text'].strip()})   
                                org_id += 1

                    #Check if font & size are that of org address
                    #Uses round to filter more text: other text has size that rounds to 8
                    if (spans_list['font'] == 'Helvetica') & (round(float(spans_list['size'])) == 9):
                        #Catch if no orgs created
                        if org_id < 0:
                            continue
                        #If key Address doesn't already exist, create it
                        if 'Address' not in org_list[org_id].keys():
                            org_list[org_id]['Address'] = ''
                            org_list[org_id]['Address'] += spans_list['text']
                            org_address = True
                        else:
                            #Strip here to avoid unnecessary blank space
                            #Maybe handle this later?
                            org_list[org_id]['Address'] += spans_list['text'].strip()

                    #Check if font & size are that of field name
                    if (spans_list['font'] == 'ArialNarrow') & (int(float(spans_list['size'])) == 8):
                        #Catch if no orgs created
                        if org_id < 0:
                            continue
                        #If key field doesn't already exist, create it. Checks if length string > 1 to remove bad text
                        if (removePuncandSpace(spans_list['text']) not in org_list[org_id].keys()) & (len(removePuncandSpace(spans_list['text'])) > 1):
                            org_list[org_id][removePuncandSpace(spans_list['text'])] = ''

                    #Check if font & size are that of field text
                    if (spans_list['font'] == 'Helvetica-Bold') & (round(float(spans_list['size'])) == 8):
                        #Catch if no orgs created
                        if org_id < 0:
                            continue

                        #Place in last dict key: will always be something there, non-generalizable method
                        org_list[org_id][list( org_list[org_id])[-1]] += spans_list['text']

                   
    return org_list

In [14]:
textDict_1to5 = openPDFasTextDict(filepath = filepath_1to5)
textDict_full = openPDFasTextDict(filepath = filepath_full)

In [21]:
for row in textDict_full:
    print(row, '\n')

{'width': 612.0, 'height': 792.0, 'blocks': [{'number': 0, 'type': 0, 'bbox': (71.99996948242188, 36.033119201660156, 559.9302368164062, 54.59596252441406), 'lines': [{'spans': [{'size': 11.029430389404297, 'flags': 4, 'font': 'Impact', 'color': 0, 'ascender': 1.0087890625, 'descender': -0.2109375, 'text': ' ', 'origin': (71.99996948242188, 51.12030029296875), 'bbox': (71.99996948242188, 39.983272552490234, 73.9422607421875, 53.44905090332031)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (71.99996948242188, 39.983272552490234, 73.9422607421875, 53.44905090332031)}, {'spans': [{'size': 11.029430389404297, 'flags': 4, 'font': 'Impact', 'color': 0, 'ascender': 1.0087890625, 'descender': -0.2109375, 'text': '           ', 'origin': (167.39698791503906, 51.12030029296875), 'bbox': (167.39698791503906, 39.983272552490234, 188.62950134277344, 53.44905090332031)}, {'size': 15.944744110107422, 'flags': 4, 'font': 'GloucesterMTExtraCondens', 'color': 0, 'ascender': 0.9453125, 'descender': -0.217773

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{'width': 612.0, 'height': 792.0, 'blocks': [{'number': 0, 'type': 0, 'bbox': (71.99996948242188, 36.033119201660156, 559.9302368164062, 54.59596252441406), 'lines': [{'spans': [{'size': 11.029430389404297, 'flags': 4, 'font': 'Impact', 'color': 0, 'ascender': 1.0087890625, 'descender': -0.2109375, 'text': ' ', 'origin': (71.99996948242188, 51.12030029296875), 'bbox': (71.99996948242188, 39.983272552490234, 73.9422607421875, 53.44905090332031)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (71.99996948242188, 39.983272552490234, 73.9422607421875, 53.44905090332031)}, {'spans': [{'size': 11.029430389404297, 'flags': 4, 'font': 'Impact', 'color': 0, 'ascender': 1.0087890625, 'descender': -0.2109375, 'text': '           ', 'origin': (167.39698791503906, 51.12030029296875), 'bbox': (167.39698791503906, 39.983272552490234, 188.62950134277344, 53.44905090332031)}, {'size': 15.944744110107422, 'flags': 4, 'font': 'GloucesterMTExtraCondens', 'color': 0, 'ascender': 0.9453125, 'descender': -0.217773

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{'width': 612.0, 'height': 792.0, 'blocks': [{'number': 0, 'type': 0, 'bbox': (71.99996948242188, 36.033119201660156, 559.9302368164062, 54.59596252441406), 'lines': [{'spans': [{'size': 11.029430389404297, 'flags': 4, 'font': 'Impact', 'color': 0, 'ascender': 1.0087890625, 'descender': -0.2109375, 'text': ' ', 'origin': (71.99996948242188, 51.12030029296875), 'bbox': (71.99996948242188, 39.983272552490234, 73.9422607421875, 53.44905090332031)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (71.99996948242188, 39.983272552490234, 73.9422607421875, 53.44905090332031)}, {'spans': [{'size': 11.029430389404297, 'flags': 4, 'font': 'Impact', 'color': 0, 'ascender': 1.0087890625, 'descender': -0.2109375, 'text': '           ', 'origin': (167.39698791503906, 51.12030029296875), 'bbox': (167.39698791503906, 39.983272552490234, 188.62950134277344, 53.44905090332031)}, {'size': 15.944744110107422, 'flags': 4, 'font': 'GloucesterMTExtraCondens', 'color': 0, 'ascender': 0.9453125, 'descender': -0.217773

In [19]:
with open('textdict_full.txt', 'a') as file:
    for row in textDict_full:
        file.write(str(row))
        file.write('\n')

In [42]:
for count_page, page in enumerate(textDict_1to5):
    for count_block_list, block_list in enumerate(page["blocks"]):
        for count_line_list, line_list in enumerate(block_list["lines"]):
            for count_spans_list, spans_list in enumerate(line_list["spans"]):
                print(spans_list)

{'size': 11.029430389404297, 'flags': 4, 'font': 'Impact', 'color': 0, 'ascender': 1.0087890625, 'descender': -0.2109375, 'text': ' ', 'origin': (71.99996948242188, 51.12030029296875), 'bbox': (71.99996948242188, 39.983272552490234, 73.9422607421875, 53.44905090332031)}
{'size': 11.029430389404297, 'flags': 4, 'font': 'Impact', 'color': 0, 'ascender': 1.0087890625, 'descender': -0.2109375, 'text': '           ', 'origin': (167.39698791503906, 51.12030029296875), 'bbox': (167.39698791503906, 39.983272552490234, 188.62950134277344, 53.44905090332031)}
{'size': 15.944744110107422, 'flags': 4, 'font': 'GloucesterMTExtraCondens', 'color': 0, 'ascender': 0.9453125, 'descender': -0.2177734375, 'text': 'Entreprises donatrices et commanditaires du Québec', 'origin': (188.63992309570312, 51.12030029296875), 'bbox': (188.63992309570312, 36.033119201660156, 422.57421875, 54.59596252441406)}
{'size': 11.029430389404297, 'flags': 4, 'font': 'Impact', 'color': 0, 'ascender': 1.0087890625, 'descender'

{'size': 8.032254219055176, 'flags': 20, 'font': 'Helvetica-Bold', 'color': 0, 'ascender': 0.9359999895095825, 'descender': -0.2199999988079071, 'text': 'Privilégie la Semaine de la Santé  mentale.', 'origin': (110.75995635986328, 111.36029052734375), 'bbox': (110.75995635986328, 103.83485412597656, 271.63104248046875, 113.12908935546875)}
{'size': 8.991293907165527, 'flags': 4, 'font': 'Helvetica', 'color': 0, 'ascender': 0.9399999976158142, 'descender': -0.21799999475479126, 'text': ' ', 'origin': (271.67987060546875, 111.36029052734375), 'bbox': (271.67987060546875, 102.9002914428711, 274.1770324707031, 113.3222885131836)}
{'size': 8.032254219055176, 'flags': 4, 'font': 'Helvetica', 'color': 0, 'ascender': 0.9399999976158142, 'descender': -0.21799999475479126, 'text': ' ', 'origin': (274.0798645019531, 111.36029052734375), 'bbox': (274.0798645019531, 103.80269622802734, 276.3106689453125, 113.11300659179688)}
{'size': 8.032254219055176, 'flags': 4, 'font': 'ArialNarrow', 'color': 0,

{'size': 8.032254219055176, 'flags': 20, 'font': 'Helvetica-Bold', 'color': 0, 'ascender': 0.9359999895095825, 'descender': -0.2199999988079071, 'text': 'Canada', 'origin': (332.9999694824219, 97.560302734375), 'bbox': (332.9999694824219, 90.03486633300781, 361.94171142578125, 99.3291015625)}
{'size': 8.991293907165527, 'flags': 4, 'font': 'Helvetica', 'color': 0, 'ascender': 0.9399999976158142, 'descender': -0.21799999475479126, 'text': '  ', 'origin': (361.91986083984375, 97.560302734375), 'bbox': (361.91986083984375, 89.10030364990234, 366.9370422363281, 99.52230072021484)}
{'size': 8.032254219055176, 'flags': 4, 'font': 'ArialNarrow', 'color': 0, 'ascender': 0.935546875, 'descender': -0.2119140625, 'text': 'Tél.: ', 'origin': (332.9998474121094, 106.80029296875), 'bbox': (332.9998474121094, 99.27850341796875, 347.66790771484375, 108.50408172607422)}
{'size': 8.032254219055176, 'flags': 20, 'font': 'Helvetica-Bold', 'color': 0, 'ascender': 0.9359999895095825, 'descender': -0.2199999

In [8]:
extractedList_1to5 = extractFromTextDict(textDict_1to5)

In [9]:
extractedList_full = extractFromTextDict(textDict_full)

In [10]:
df_1to5 = pd.DataFrame(extractedList_1to5)

In [11]:
df_full = pd.DataFrame(extractedList_full)

In [13]:
df_full.shape

(1342, 47)

In [15]:
df_full.tail(5)

Unnamed: 0,id,Name,Address,Secteur industriel,Langue de comm,DDD,FAF,N° de télCie,n° de tél,Site Web,...,Date approb,Échelledons,Actif,Princip Filiales,Principale filiale,Filiales princip,Principales filiales,Total actif,Principfiliales,ou
1337,1338,Xérox Canada,"3400, boul. de Maisonneuve O., bur. 900 Montr...","Commerce de gros et détail; Équipement, systèm...",Français,En tout temps,31 décembre,514-939-3769,,https://www.xerox.ca,...,,,,,,,,,,
1338,1339,Yamaha Moteur du Canada Ltée,"1301, rue Ampère Boucherville QC J4B5Z5",Produits récréatifs; Fabricant d'une gamme de ...,Français,Avril à juin,31 mars,450-641-2602,,www.yamaha-motor.ca,...,,,,,,,,,,
1339,1340,"Yum Yum Enr., Les croustilles","40, rue du Moulin Warwick QC J0A1M0",Alimentation; Fabricant de croustilles et grig...,Français,En tout temps,,819-358-3600,,www.yum-yum.com,...,,,,,,,,,,
1340,1341,(CEZinc),"860, boul. Gérard-Cadieux Salaberry-de-Valley...",Produits d'acier et de métal; Producteur de zi...,Français,En tout temps,31 décembre,450-373-9144,,http://www.cezinc.com,...,,,,,,,,,,
1341,1342,Zurich Amérique du Nord Canada,"1100, boul. René-Lévesque O., bur. 1840 Montr...",Services financiers; Services financiers liés ...,Français,En tout temps,,514-393-7222,,www.zurichcanada.com/can/fr/home/welcome.htm,...,,,,,Groupe Zurich Financial Services,,,,,


In [17]:
df_full[df_full.ou.notna()]

Unnamed: 0,id,Name,Address,Secteur industriel,Langue de comm,DDD,FAF,N° de télCie,n° de tél,Site Web,...,Date approb,Échelledons,Actif,Princip Filiales,Principale filiale,Filiales princip,Principales filiales,Total actif,Principfiliales,ou
1019,1020,Provigo Distribution inc.,"400, ave Ste-Croix Saint-Laurent QC H4N3L4",Alimentation; Chaîne de magasins alimentaire.,Français,En tout temps,,514-383-3000,,www.provigo.com et https://www.loblaw.ca/,...,,,,,,,,,,


1342

In [31]:
df_full[df_full['Total actif'].notna()]['Total actif']

343    
Name: Total actif, dtype: object

In [33]:
df_full.iloc[343]

id                                                                     344
Name                                               Courchesne, Larose ltée
Address                         9761, boul. des Sciences  Anjou QC  H1J0A6
Secteur industriel       Alimentation; Importateur et distributeur de f...
Langue de comm                                                    Français
DDD                                                          En tout temps
FAF                                                            31 décembre
N° de télCie                                                  514-525-6381
n° de tél                                                              NaN
Site Web                                          www.courchesnelarose.com
Domaine dintérêt         Écoles, hôpitaux et santé. Le Groupe participe...
Limites géog                                                      locales.
Note                     Sur le site web sélectionner: À  propos, Respo...
Nombre demployés         

In [26]:
df_full.isna().sum().sort_values(ascending=False)

ou                       1341
Principfiliales          1341
Total actif              1341
Filiales princip         1341
Princip Filiales         1341
Principales filiales     1339
Limites géographiques    1335
2e contact pour          1315
Échelledons              1291
Web                      1286
Actif                    1286
Téléc                    1285
Total annuel             1279
Date approb              1278
Date fin d’ann           1278
Catégorie                1274
Langue                   1269
Projets privilégiés      1269
Domaines dintérêt        1266
Principale filiale       1254
n° de tél                1223
Poste                    1156
Princip filiales         1138
Filiale de               1009
Nbre de succ              835
2e n° de tél              669
Note                      597
Courriel  Cie             547
FAF                       519
Fax                       416
Nombre demployés          339
N° de faxCie              307
Courriel                  305
Tél       

In [35]:
df_full.to_clipboard()

In [34]:
df_full.Name.str.find('Mme Fernande Bernier')

0      -1
1      -1
2      -1
3      -1
4      -1
       ..
1337   -1
1338   -1
1339   -1
1340   -1
1341   -1
Name: Name, Length: 1342, dtype: int64

In [25]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [28]:
#Loop through dataframe column pairs
for col1 in df_full.columns:
    for col2 in df_full.columns:
        
        if col1 == col2:
            continue
        
        if similar(col1, col2) >= 0.8:
            print(col1 + " - " + col2 + ": " + str(similar(col1,col2)))

n° de tél - 2e n° de tél: 0.8571428571428571
Domaine dintérêt - Domaines dintérêt: 0.9696969696969697
2e n° de tél - n° de tél: 0.8571428571428571
Domaines dintérêt - Domaine dintérêt: 0.9696969696969697
Princip filiales - Princip Filiales: 0.9375
Princip filiales - Principale filiale: 0.8823529411764706
Princip filiales - Principales filiales: 0.8888888888888888
Princip filiales - Principfiliales: 0.967741935483871
Princip Filiales - Princip filiales: 0.9375
Princip Filiales - Principale filiale: 0.8235294117647058
Princip Filiales - Principales filiales: 0.8333333333333334
Princip Filiales - Principfiliales: 0.9032258064516129
Principale filiale - Princip filiales: 0.8823529411764706
Principale filiale - Princip Filiales: 0.8235294117647058
Principale filiale - Principales filiales: 0.9473684210526315
Principale filiale - Principfiliales: 0.8484848484848485
Principales filiales - Princip filiales: 0.8888888888888888
Principales filiales - Princip Filiales: 0.8333333333333334
Principa