In [3]:
import pandas as pd
import numpy as np
import gcsfs
import jellyfish
import recordlinkage as rl
from fuzzywuzzy import fuzz
import matching as m
import helpers as h
%load_ext autoreload
%autoreload 2

In [1]:
from gcsfs.core import GCSFileSystem

# Connection settings for the Google Cloud Storage client used by the load/save cells.
# NOTE(review): the credentials path is specific to the machine running the kernel — confirm before re-running elsewhere.
GCS_PROJECT = 'courseradataeng'
GCS_TOKEN_PATH = '/home/jupyter/.config/gcloud/application_default_credentials.json'

gcs = GCSFileSystem(GCS_PROJECT, token=GCS_TOKEN_PATH)

In [4]:
# Read the unified match table from the bucket into a DataFrame.
MATCHES_FULL_PATH = 'soleadify_sample_data/unified_matched_data/matches_full.csv'
with gcs.open(MATCHES_FULL_PATH) as matches_file:
    matches_full = pd.read_csv(matches_file, quotechar='"', escapechar='\\')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# List every column of the loaded frame; the `__facebook` / `__google` / `__website` suffix marks the source.
matches_full.columns

Index(['address__facebook', 'address__google', 'category__facebook',
       'category__google', 'category__website', 'city_clean__facebook',
       'city_clean__google', 'city_clean__website', 'country_clean__facebook',
       'country_clean__google', 'country_clean__website',
       'description__facebook', 'description__google', 'email__facebook',
       'index__facebook', 'index__google', 'index__website',
       'language__website', 'legal_name__website', 'link__facebook',
       'page_type__facebook', 'phone__facebook', 'phone__google',
       'phone__website', 'raw_address__google', 'raw_phone__google',
       'region_clean__facebook', 'region_clean__google',
       'region_clean__website', 'root_domain__facebook', 'root_domain__google',
       'root_domain__website', 'site_name_clean__facebook',
       'site_name_clean__google', 'site_name_clean__website',
       'zip_code__facebook', 'zip_code__google'],
      dtype='object')

**After unifying the 3 datasets into a single one, there are a lot of columns with redundant information across the 3 datasets.
Next, I will apply various strategies to unify these redundant columns into a single one.**

In [6]:
# Preview the first rows of the unified frame.
matches_full.head()

Unnamed: 0,address__facebook,address__google,category__facebook,category__google,category__website,city_clean__facebook,city_clean__google,city_clean__website,country_clean__facebook,country_clean__google,...,region_clean__google,region_clean__website,root_domain__facebook,root_domain__google,root_domain__website,site_name_clean__facebook,site_name_clean__google,site_name_clean__website,zip_code__facebook,zip_code__google
0,,"3/304 Manns Rd, West Gosford NSW 2250, Australia",,Promotional Merchandising,Work Clothing & Protection Equipment,,Gosford,Gosford,,Australia,...,New South Wales,New South Wales,,wrapcitysigns.com.au,wrapcitysigns.com.au,,Wrap City and Signs,Wrap City & Signs,,2250
1,,"Suite 228, 12, 111 Fourth Ave, St. Catharines,...",,Real Estate - Agents & Managers,Real Estate - Agents & Managers,,St. Catharines,St. Catharines,,Canada,...,Ontario,Ontario,,thereiteclub.com,thereiteclub.com,,The REITE Club,The REITE Club,,l2s 3p5
2,,"1926 Seventh Street Louth, St. Catharines, ON ...",,Plant Nurseries & Stores,Plant Nurseries & Stores,,St. Catharines,St. Catharines,,Canada,...,Ontario,Ontario,,pioneer-pff.com,pioneer-pff.com,,Pioneer Flower Farms Limited,Pioneer Flower Farms,,l2r 6p9
3,,"585 Carlton St, St. Catharines, ON L2M 4Y1, Ca...",,Funeral Services & Cemeteries,Funeral Services & Cemeteries,,St. Catharines,St. Catharines,,Canada,...,Ontario,Ontario,,dartefuneralhome.com,dartefuneralhome.com,,George Darte Funeral Chapel,George Darte Funeral Home,,l2m 4y1
4,,"10 Northrup Crescent, St. Catharines, ON L2M 7...",,Railroad Service Companies,Fences & Ornamental Metal Work,,St. Catharines,St. Catharines,,Canada,...,Ontario,Ontario,,amberstairs.ca,amberstairs.ca,,Amber Stairs & Railings Inc.,Amber Stairs & Railings,,l2m 7m4


In [7]:
# Sanity check: count rows that carry a Facebook link but no Facebook root domain.
rows_missing_root_domain = matches_full.query('link__facebook.notna() & root_domain__facebook.isna()')
link_report_template = ('There are {0} instances where the link from facebook is available \n'
                        ', but the root domain from facebook is not')
print(link_report_template.format(rows_missing_root_domain.shape[0]))



There are 0 instances where the link from facebook is available 
, but the root domain from facebook is not


In [8]:
def match_exact(row, column_name):
    """Return the value of ``column_name`` that at least two sources agree on.

    The three candidates are read from ``row[column_name + '__facebook']``,
    ``row[column_name + '__google']`` and ``row[column_name + '__website']``.
    Missing candidates (as detected by ``pd.isna``) never count as agreeing
    with anything.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pd.Series`` produced by ``DataFrame.apply``)
        Must contain the three suffixed columns for ``column_name``.
    column_name : str
        Column prefix without the source suffix, e.g. ``'city_clean'``.

    Returns
    -------
    object
        The agreed value, or ``np.nan`` when all candidates are missing or
        when no two non-missing candidates are equal.
    """
    sources = ('__facebook', '__google', '__website')
    present = [row[column_name + source] for source in sources
               if not pd.isna(row[column_name + source])]

    # Return the value that actually matched. Taking "the first non-missing
    # value" would wrongly pick the facebook value when only google and
    # website agree with each other.
    for position, candidate in enumerate(present):
        if any(candidate == other for other in present[position + 1:]):
            return candidate

    # No consensus (or nothing available): use NaN explicitly rather than an
    # implicit None so the resulting column has a single missing marker.
    return np.nan

# Collapse each per-source column triple into a single `<column>_final` column,
# then build a slimmer frame without the per-source originals.
columns_to_match = ['country_clean','region_clean','city_clean','root_domain']
origins = ['__facebook','__google','__website']
for column_name in columns_to_match:
    source_columns = [column_name + origin for origin in origins]
    matches_full[column_name + '_final'] = matches_full[source_columns].apply(
        match_exact, axis=1, args=(column_name,)
    )

columns_to_drop = [column_name + origin for column_name in columns_to_match for origin in origins]
matches_simplified = matches_full.drop(columns=columns_to_drop)

In [9]:
# The Facebook link is no longer needed once the root domain has been unified.
matches_simplified = matches_simplified.drop(columns=['link__facebook'])

In [10]:
# Check which columns remain after the exact-match unification.
matches_simplified.columns

Index(['address__facebook', 'address__google', 'category__facebook',
       'category__google', 'category__website', 'description__facebook',
       'description__google', 'email__facebook', 'index__facebook',
       'index__google', 'index__website', 'language__website',
       'legal_name__website', 'page_type__facebook', 'phone__facebook',
       'phone__google', 'phone__website', 'raw_address__google',
       'raw_phone__google', 'site_name_clean__facebook',
       'site_name_clean__google', 'site_name_clean__website',
       'zip_code__facebook', 'zip_code__google', 'country_clean_final',
       'region_clean_final', 'city_clean_final', 'root_domain_final'],
      dtype='object')

In [11]:
# Preview the simplified frame, including the new `*_final` columns.
matches_simplified.head()

Unnamed: 0,address__facebook,address__google,category__facebook,category__google,category__website,description__facebook,description__google,email__facebook,index__facebook,index__google,...,raw_phone__google,site_name_clean__facebook,site_name_clean__google,site_name_clean__website,zip_code__facebook,zip_code__google,country_clean_final,region_clean_final,city_clean_final,root_domain_final
0,,"3/304 Manns Rd, West Gosford NSW 2250, Australia",,Promotional Merchandising,Work Clothing & Protection Equipment,,5.0 (2) · Sticker manufacturer West Gosford NS...,,,105892.0,...,+61 401 333 314,,Wrap City and Signs,Wrap City & Signs,,2250,Australia,New South Wales,Gosford,wrapcitysigns.com.au
1,,"Suite 228, 12, 111 Fourth Ave, St. Catharines,...",,Real Estate - Agents & Managers,Real Estate - Agents & Managers,,"5.0 (6) · Real estate agency Suite 228, 12, 11...",,,2909.0,...,+1 613-601-9018,,The REITE Club,The REITE Club,,l2s 3p5,Canada,Ontario,St. Catharines,thereiteclub.com
2,,"1926 Seventh Street Louth, St. Catharines, ON ...",,Plant Nurseries & Stores,Plant Nurseries & Stores,,5.0 (8) · Wholesale florist 1926 Seventh Stree...,,,4638.0,...,+1 905-641-2221,,Pioneer Flower Farms Limited,Pioneer Flower Farms,,l2r 6p9,Canada,Ontario,St. Catharines,pioneer-pff.com
3,,"585 Carlton St, St. Catharines, ON L2M 4Y1, Ca...",,Funeral Services & Cemeteries,Funeral Services & Cemeteries,,4.1 (22) · Funeral home 585 Carlton St Open 24...,,,8550.0,...,+1 905-937-4444,,George Darte Funeral Chapel,George Darte Funeral Home,,l2m 4y1,Canada,Ontario,St. Catharines,dartefuneralhome.com
4,,"10 Northrup Crescent, St. Catharines, ON L2M 7...",,Railroad Service Companies,Fences & Ornamental Metal Work,,"4.5 (4) · Railing contractor St. Catharines, O...",,,10261.0,...,(800) 263-5684,,Amber Stairs & Railings Inc.,Amber Stairs & Railings,,l2m 7m4,Canada,Ontario,St. Catharines,amberstairs.ca


In [12]:
# Inspect rows where both the Google and the Facebook zip code are available.
matches_simplified.query('zip_code__google.notna() & zip_code__facebook.notna()').head(20)

Unnamed: 0,address__facebook,address__google,category__facebook,category__google,category__website,description__facebook,description__google,email__facebook,index__facebook,index__google,...,raw_phone__google,site_name_clean__facebook,site_name_clean__google,site_name_clean__website,zip_code__facebook,zip_code__google,country_clean_final,region_clean_final,city_clean_final,root_domain_final
997,"161 rue du levant, 30420, calvisson, france, l...","CABINET DU LEVANT, 161 Rue du levant, étage1 Z...",Clinics - Surgeons & Physicians|Clinics - Surg...,Alternative Therapy,,,"4.6 (7) · Osteopath CABINET DU LEVANT, 161 Rue...",,30906.0,145480.0,...,+33 6 60 10 58 96,Sylvain BRUN Ostéopathe D.O.,Sylvain Brun Ostéopathe,,30420,30420.0,France,Occitanie,Calvisson,cabinet-levant.fr
998,"4 rue marchands, 30420, calvisson, france, lan...","4 Rue des Marchands, 30420 Calvisson, France",,Catering & Delivery,,,"4.9 (14) · Caterer Calvisson, France +33 6 83 ...",,50068.0,166959.0,...,+33 6 83 34 63 15,Meney Traiteur,Meney Salaisons Traiteur,,30420,30420.0,France,Occitanie,Calvisson,meney-salaisons.fr
999,"48 grand rue, 30420, calvisson, france, langue...","48 Grand Rue, 30420 Calvisson, France",Bed and Breakfast|Travel Agencies,,,,4.4 (10) · Bed & breakfast 48 Grand Rue +33 4 ...,,41856.0,238763.0,...,+33 4 66 01 23 91,Bed and Art,Bed and Art,,30420,30420.0,France,Occitanie,Calvisson,bed-and-art.com
1000,"1 rue des marchands, 30420, calvisson, france,...","1 Rue des Marchands, 30420 Calvisson, France",Bicycle Shops|Sports Medicine & Physical Therapy,,,,"4.5 (180) · Bicycle Shop Calvisson, France Clo...",,31951.0,245437.0,...,+33 4 66 81 43 78,VaunagePassion Vélos,Vaunage Passion Bicycles,,30420,30420.0,France,Occitanie,Calvisson,vaunagepassionvelos.fr
1001,"9 rue des genets, 30420, calvisson, france, la...","9 Rue des Genêts, 30420 Calvisson, France",,Web Development Agencies,,,5.0 (8) · Website designer 3+ years in busines...,,64900.0,301868.0,...,+33 6 82 22 72 51,Extern'Market,Extern'Market,,30420,30420.0,France,Occitanie,Calvisson,extern-market.com
1002,"161, rue du levant, 30420, calvisson, france, ...","161 Rue du Levant, 30420 Calvisson, France",Digital & Marketing Agencies|Commercial Printing,Public Relations Agencies,,,4.0 (1) · Public relations firm 161 Rue du Lev...,,12465.0,347922.0,...,+33 7 82 07 85 84,Studio OnOz,Studio OnOz,,30420,30420.0,France,Occitanie,Calvisson,studioonoz.com
1003,"915-700 west pender street, v6c 1g8, vancouver...","1090 Homer St #300, Vancouver, BC V6B 2W9, Canada",Investment Consultants & Financial Advisors|Bu...,"Insurance - Agents, Carriers & Brokers",,,No reviews · Life insurance agency 1090 Homer ...,,1687.0,896.0,...,+1 604-687-1507,Westward Advisors,Westward Advisors,,v6c 1g8,v6b 2w9,Canada,British Columbia,Vancouver,westwardadvisors.com
1004,"107–8410 ontario street, v5x 4s6, vancouver, b...","8410 Ontario St, Vancouver, BC V5X 4S6, Canada",,,,,5.0 (5) · Professional services 7+ years in bu...,,18276.0,1048.0,...,+1 604-436-0466,R&M Trade Laminating,R & M Trade Laminating Ltd,,v5x 4s6,v5x 4s6,Canada,British Columbia,Vancouver,rmlaminating.com
1005,"310-1275 venables street, v6a 2e4, vancouver, ...","1275 Venables St Studio 310, Vancouver, BC V6A...",,Clothing Stores,,Studio 310 Design Lab is a full service appare...,No reviews · Clothes and fabric manufacturer V...,,21201.0,1629.0,...,+1 604-200-3378,Studio 310 Design Lab,Studio 310 Design Lab Inc,,v6a 2e4,v6a 2e4,Canada,British Columbia,Vancouver,studio310designlab.com
1007,"1541 w broadway, v6j 1w7, vancouver, bc, canad...","1541 W Broadway, Vancouver, BC V6J 1W7, Canada",Art Galleries|Fine Arts Schools,Business Consulting,,仕林设计学院是成立于温哥华的艺术设计教育中心，由Parsons帕森斯 设计学院校友建立。,5.0 (2) · Consultant 1541 W Broadway Open ⋅ Cl...,admin@forecad.org,36849.0,8560.0,...,+1 778-863-2320,Foresight Academy 仕林设计学院,Foresight Academy 仕林设计学院,,v6j 1w7,v6j 1w7,Canada,British Columbia,Vancouver,forecad.org


In [13]:
def _clean_zip_code(value):
    """Normalise one non-missing zip code to a lower-case string."""
    if isinstance(value, str):
        cleaned = value.strip().lower()
        # A zip read as text may carry a float-style suffix ('30420.0').
        # Strip only that suffix: converting through int() would also drop
        # meaningful leading zeros ('01234' -> '1234').
        if cleaned.endswith('.0') and cleaned[:-2].isdigit():
            cleaned = cleaned[:-2]
        return cleaned
    try:
        # Numeric values (e.g. 30420.0) are rendered without the decimal part.
        return str(int(float(value)))
    except (TypeError, ValueError, OverflowError):
        return str(value).lower()


def match_zip_codes(row):
    """Merge the Google and Facebook zip codes of one row into a single value.

    Parameters
    ----------
    row : mapping-like
        Must contain ``'zip_code__google'`` and ``'zip_code__facebook'``.

    Returns
    -------
    str or float
        * both present and ``m.is_levenstein_matching(..., fuzzy=True)`` is
          truthy: the cleaned Google zip code;
        * both present but not matching: ``'<google>/<facebook>'`` (cleaned);
        * only one present: that zip code, cleaned the same way so the column
          has one consistent format;
        * none present: ``np.nan``.

    ``m.is_levenstein_matching`` comes from the project's ``matching`` module,
    which is not shown in this notebook.
    """
    zip_google = row['zip_code__google']
    zip_facebook = row['zip_code__facebook']
    has_google = not pd.isna(zip_google)
    has_facebook = not pd.isna(zip_facebook)

    if has_google and has_facebook:
        zip_google_clean = _clean_zip_code(zip_google)
        zip_facebook_clean = _clean_zip_code(zip_facebook)
        if m.is_levenstein_matching(zip_google_clean, zip_facebook_clean, fuzzy = True):
            return zip_google_clean
        return zip_google_clean + '/' + zip_facebook_clean

    if has_google:
        return _clean_zip_code(zip_google)
    if has_facebook:
        return _clean_zip_code(zip_facebook)
    return np.nan
    
# Merge the two zip code columns row by row into `zip_code_unified`.
zip_source_columns = ['zip_code__google', 'zip_code__facebook']
matches_simplified['zip_code_unified'] = (
    matches_simplified[zip_source_columns].apply(match_zip_codes, axis=1)
)

In [14]:
# Compare the source zip codes with the unified result for rows where both sources are present.
matches_simplified[['zip_code__google','zip_code__facebook','zip_code_unified']].query('zip_code__google.notna() & zip_code__facebook.notna()').head(20)

Unnamed: 0,zip_code__google,zip_code__facebook,zip_code_unified
997,30420.0,30420,30420
998,30420.0,30420,30420
999,30420.0,30420,30420
1000,30420.0,30420,30420
1001,30420.0,30420,30420
1002,30420.0,30420,30420
1003,v6b 2w9,v6c 1g8,v6b 2w9/v6c 1g8
1004,v5x 4s6,v5x 4s6,v5x 4s6
1005,v6a 2e4,v6a 2e4,v6a 2e4
1007,v6j 1w7,v6j 1w7,v6j 1w7


In [15]:
# The per-source zip codes are superseded by `zip_code_unified`.
matches_simplified = matches_simplified.drop(columns=['zip_code__google', 'zip_code__facebook'])

In [16]:
# Show the four phone-related columns side by side.
matches_simplified[['phone__facebook', 'phone__google', 'phone__website','raw_phone__google']].head(20)

Unnamed: 0,phone__facebook,phone__google,phone__website,raw_phone__google
0,,61401330000.0,61401330000.0,+61 401 333 314
1,,16136020000.0,12262980000.0,+1 613-601-9018
2,,19056410000.0,19056410000.0,+1 905-641-2221
3,,19059370000.0,19059380000.0,+1 905-937-4444
4,,18002640000.0,18002640000.0,(800) 263-5684
5,,19053980000.0,18555630000.0,+1 905-397-5551
6,,12893620000.0,12893620000.0,+1 289-362-4410
7,,19059380000.0,12899900000.0,+1 905-938-3503
8,,19056830000.0,19056830000.0,+1 905-682-7325
9,,12893480000.0,12893480000.0,+1 289-348-1714


In [17]:
# Rows where only the website supplies a parsed phone number (Facebook and Google parsed phones are missing).
(matches_simplified[['phone__facebook', 'phone__google', 'phone__website','raw_phone__google']]
                .query('phone__facebook.isna() & phone__google.isna() & phone__website.notna() ')
 .head(40)
)

Unnamed: 0,phone__facebook,phone__google,phone__website,raw_phone__google
529,,,19057720000.0,
531,,,50223270000.0,
535,,,18448570000.0,
536,,,14505460000.0,
537,,,8809620000000.0,
542,,,16045970000.0,
546,,,19024310000.0,
550,,,17809530000.0,
555,,,48500130000.0,732 083 523
561,,,16137430000.0,


In [18]:
# Sanity check: count rows with a parsed Google phone but no raw Google phone.
rows_missing_raw_phone = matches_simplified.query('phone__google.notna() and  raw_phone__google.isna()')
phone_report_template = ('There are {0} instances where the phone from google is available \n'
                         ', but the raw phone from google is not')
print(phone_report_template.format(rows_missing_raw_phone.shape[0]))

There are 0 instances where the phone from google is available 
, but the raw phone from google is not


This means that `raw_phone__google`, which is closer to a human-readable format, can be reliably used at this point.

In [19]:
def unify_phone_numbers(row):
    """Combine the website, Facebook and raw Google phone of one row.

    Parameters
    ----------
    row : mapping-like
        Must contain ``'phone__website'``, ``'phone__facebook'`` and
        ``'raw_phone__google'``.

    Returns
    -------
    str or float
        * the raw Google phone alone when it is present and
          ``m.is_levenstein_matching(..., fuzzy=True)`` is truthy against the
          website or the Facebook number;
        * otherwise the distinct available numbers joined with ``'/'`` in the
          order website, Facebook, raw Google, where website/Facebook numbers
          are prefixed with ``'+'`` and Facebook is omitted when it matches
          the website number (``fuzzy=False``);
        * ``np.nan`` when nothing is available.

    ``m.is_levenstein_matching`` comes from the project's ``matching`` module,
    which is not shown in this notebook.
    """
    def as_digit_string(value):
        # Numeric phones become an integer string; anything float() rejects
        # (including NaN, which int() rejects) is kept as plain text.
        try:
            return str(int(float(value)))
        except ValueError:
            return str(value)

    raw_google = row['raw_phone__google']
    website_digits = as_digit_string(row['phone__website'])
    facebook_digits = as_digit_string(row['phone__facebook'])
    google_known = not pd.isna(raw_google)

    if google_known:
        confirms_website = m.is_levenstein_matching(website_digits, raw_google, fuzzy= True)
        confirms_facebook = m.is_levenstein_matching(facebook_digits, raw_google, fuzzy=True)
        if confirms_website or confirms_facebook:
            return raw_google

    same_number = m.is_levenstein_matching(website_digits, facebook_digits, fuzzy=False)
    website_known = not pd.isna(row['phone__website'])
    facebook_known = not pd.isna(row['phone__facebook'])

    numbers = []
    if website_known:
        numbers.append('+' + website_digits)
    # Facebook is listed unless it merely repeats the website number.
    if facebook_known and not (website_known and same_number):
        numbers.append('+' + facebook_digits)

    if not numbers:
        return raw_google if google_known else np.nan

    joined = '/'.join(numbers)
    return (joined + '/' + raw_google) if google_known else joined

# Merge the phone columns row by row into `phone_unified`.
phone_source_columns = ['phone__website', 'phone__facebook', 'raw_phone__google']
matches_simplified['phone_unified'] = (
    matches_simplified[phone_source_columns].apply(unify_phone_numbers, axis=1)
)

In [20]:
# Compare the source phone columns with the unified result on the last rows of the frame.
matches_simplified[['phone__website','phone__facebook','raw_phone__google','phone_unified']].tail(20)

Unnamed: 0,phone__website,phone__facebook,raw_phone__google,phone_unified
29741,15192640000.0,15192640000.0,,+15192641166
29742,61419320000.0,,,+61419322499
29743,611300000000.0,,,+611300000000
29744,61893510000.0,,,+61893512480
29745,15026370000.0,15026370000.0,,+15026374712
29746,15024260000.0,15024260000.0,,+15024255661/+15024256002
29747,15024570000.0,15024570000.0,,+15024565819
29748,18883570000.0,18883570000.0,,+18883566934
29749,15028930000.0,15028930000.0,,+15028930241
29750,15025740000.0,15025740000.0,,+15025741611


In [21]:
# The per-source phone columns are superseded by `phone_unified`.
phone_columns_to_drop = ['phone__website', 'phone__facebook', 'raw_phone__google', 'phone__google']
matches_simplified = matches_simplified.drop(columns=phone_columns_to_drop)

In [22]:
# Check which columns remain after unifying zip codes and phones.
matches_simplified.columns

Index(['address__facebook', 'address__google', 'category__facebook',
       'category__google', 'category__website', 'description__facebook',
       'description__google', 'email__facebook', 'index__facebook',
       'index__google', 'index__website', 'language__website',
       'legal_name__website', 'page_type__facebook', 'raw_address__google',
       'site_name_clean__facebook', 'site_name_clean__google',
       'site_name_clean__website', 'country_clean_final', 'region_clean_final',
       'city_clean_final', 'root_domain_final', 'zip_code_unified',
       'phone_unified'],
      dtype='object')

In [23]:
def unify_site_names(row):
    """Pick one site name for a row that is matched across at least two sources.

    A source counts as part of the match when its ``index__<source>`` value is
    not missing. Among the matched sources the name is taken in the
    preference order website, Facebook, Google, skipping names that are
    missing or empty, so a missing website name no longer hides a usable
    Facebook or Google name.

    Parameters
    ----------
    row : mapping-like
        Must contain ``index__<source>`` and ``site_name_clean__<source>`` for
        the sources ``website``, ``facebook`` and ``google``.

    Returns
    -------
    str or float
        The chosen site name, or ``np.nan`` when fewer than two sources are
        matched or none of the matched sources has a usable name.
    """
    preference_order = ('website', 'facebook', 'google')
    matched_sources = [source for source in preference_order
                       if not pd.isna(row['index__' + source])]

    # A unified name is only produced for rows linking at least two sources.
    if len(matched_sources) < 2:
        return np.nan

    for source in matched_sources:
        candidate = row['site_name_clean__' + source]
        # NaN is truthy in Python, so missing names are tested explicitly.
        if not pd.isna(candidate) and candidate != '':
            return candidate
    return np.nan
    
# Derive one site name per row; the function reads index and name columns, so the whole row is passed.
matches_simplified['site_name_final'] = matches_simplified.apply(unify_site_names, axis=1)

In [24]:
# The per-source site names are superseded by `site_name_final`.
site_name_columns_to_drop = [
    'site_name_clean__facebook',
    'site_name_clean__website',
    'site_name_clean__google',
]
matches_simplified = matches_simplified.drop(columns=site_name_columns_to_drop)

In [25]:
# Show the three address columns side by side.
matches_simplified[['address__facebook','address__google','raw_address__google']].head(20)

Unnamed: 0,address__facebook,address__google,raw_address__google
0,,"3/304 Manns Rd, West Gosford NSW 2250, Australia","West Gosford NSW, Australia"
1,,"Suite 228, 12, 111 Fourth Ave, St. Catharines,...","Suite 228, 12, 111 Fourth Ave · In Ridley Square"
2,,"1926 Seventh Street Louth, St. Catharines, ON ...",1926 Seventh Street Louth
3,,"585 Carlton St, St. Catharines, ON L2M 4Y1, Ca...",585 Carlton St
4,,"10 Northrup Crescent, St. Catharines, ON L2M 7...","St. Catharines, ON, Canada"
5,,"19 Main St, St. Catharines, ON L2N 4T5, Canada",19 Main St
6,,"13 Duke St, St. Catharines, ON L2R 5W1, Canada",7+ years in business · 13 Duke St
7,,"314 Lake St, St. Catharines, ON L2N 4H4, Canada","7+ years in business · St. Catharines, ON, Can..."
8,,"1688 Gregory Rd, St. Catharines, ON L2R 6P9, C...",1688 Gregory Rd
9,,"105 Merritt St, St. Catharines, ON L2T 1J7, Ca...","St. Catharines, ON, Canada"


In [26]:
# Inspect rows where both the Facebook and the Google address are available.
(matches_simplified.query('address__facebook.notna() & address__google.notna()')
[['address__facebook','address__google','raw_address__google']].head(20))

Unnamed: 0,address__facebook,address__google,raw_address__google
996,"134 rue entrepreneurs, za du vigné, 30420, cal...","3+ years in business · Calvisson, France","3+ years in business · Calvisson, France"
997,"161 rue du levant, 30420, calvisson, france, l...","CABINET DU LEVANT, 161 Rue du levant, étage1 Z...","CABINET DU LEVANT, 161 Rue du levant, étage1 Z..."
998,"4 rue marchands, 30420, calvisson, france, lan...","4 Rue des Marchands, 30420 Calvisson, France","Calvisson, France"
999,"48 grand rue, 30420, calvisson, france, langue...","48 Grand Rue, 30420 Calvisson, France","Calvisson, France"
1000,"1 rue des marchands, 30420, calvisson, france,...","1 Rue des Marchands, 30420 Calvisson, France","Calvisson, France"
1001,"9 rue des genets, 30420, calvisson, france, la...","9 Rue des Genêts, 30420 Calvisson, France","3+ years in business · Calvisson, France"
1002,"161, rue du levant, 30420, calvisson, france, ...","161 Rue du Levant, 30420 Calvisson, France","5+ years in business · Calvisson, France"
1003,"915-700 west pender street, v6c 1g8, vancouver...","1090 Homer St #300, Vancouver, BC V6B 2W9, Canada",1090 Homer St #300 · In Urban Systems
1004,"107–8410 ontario street, v5x 4s6, vancouver, b...","8410 Ontario St, Vancouver, BC V5X 4S6, Canada","7+ years in business · Vancouver, BC, Canada"
1005,"310-1275 venables street, v6a 2e4, vancouver, ...","1275 Venables St Studio 310, Vancouver, BC V6A...","Vancouver, BC, Canada"


In [231]:
index = 1012
# Print both address variants of one row, then three fuzzywuzzy similarity scores for the pair.
sample_row = matches_simplified.loc[index]
google_address = sample_row['address__google']
facebook_address = sample_row['address__facebook']
print(google_address, '\n', facebook_address)
for scorer in (fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(google_address, facebook_address))

601 W Broadway, Vancouver, BC V5Z 4C2, Canada 
 100 west 49th ave., v5y 2z6, vancouver, bc, canada, british columbia
49
58
62


In [230]:
index = 1010
# Print both address variants of one row, then three fuzzywuzzy similarity scores for the pair.
sample_row = matches_simplified.loc[index]
google_address = sample_row['address__google']
facebook_address = sample_row['address__facebook']
print(google_address, '\n', facebook_address)
for scorer in (fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(google_address, facebook_address))

88 W Pender St #2075, Vancouver, BC V6B 6N9, Canada 
 2075 - 88 west pender street, v6b 6n9, vancouver, bc, canada, british columbia
55
80
94


In [228]:
index = 1015
# Print both address variants of one row, then three fuzzywuzzy similarity scores for the pair.
sample_row = matches_simplified.loc[index]
google_address = sample_row['address__google']
facebook_address = sample_row['address__facebook']
print(google_address, '\n', facebook_address)
for scorer in (fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(google_address, facebook_address))

2696 Nootka St, Vancouver, BC V5M 3M5, Canada 
 2696 nootka strreet, v5m 3m5, vancouver, bc, canada, british columbia
58
79
96


In [229]:
index = 1013
# Print both address variants of one row, then three fuzzywuzzy similarity scores for the pair.
sample_row = matches_simplified.loc[index]
google_address = sample_row['address__google']
facebook_address = sample_row['address__facebook']
print(google_address, '\n', facebook_address)
for scorer in (fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(google_address, facebook_address))

209 Abbott St, Vancouver, BC V6B 2K7, Canada 
 209 abbott street, v6b 2k7, vancouver, bc, canada, british columbia
59
80
96


In [224]:
index = 996
# Print both address variants of one row, then three fuzzywuzzy similarity scores for the pair.
sample_row = matches_simplified.loc[index]
google_address = sample_row['address__google']
facebook_address = sample_row['address__facebook']
print(google_address, '\n', facebook_address)
for scorer in (fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(google_address, facebook_address))


3+ years in business · Calvisson, France 
 134 rue entrepreneurs, za du vigné, 30420, calvisson, france, languedoc-roussillon
45
45
62


In [222]:
index = 1000
# Print both address variants of one row, then three fuzzywuzzy similarity scores for the pair.
sample_row = matches_simplified.loc[index]
google_address = sample_row['address__google']
facebook_address = sample_row['address__facebook']
print(google_address, '\n', facebook_address)
for scorer in (fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(google_address, facebook_address))

1 Rue des Marchands, 30420 Calvisson, France 
 1 rue des marchands, 30420, calvisson, france, languedoc-roussillon
89
80
100


In [223]:
index = 1003
# Print both address variants of one row, then three fuzzywuzzy similarity scores for the pair.
sample_row = matches_simplified.loc[index]
google_address = sample_row['address__google']
facebook_address = sample_row['address__facebook']
print(google_address, '\n', facebook_address)
for scorer in (fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(google_address, facebook_address))


1090 Homer St #300, Vancouver, BC V6B 2W9, Canada 
 915-700 west pender street, v6c 1g8, vancouver, bc, canada, british columbia
49
60
62


In [27]:
def unify_address(row):
    """Merge the Facebook and Google address of one row into a single value.

    Parameters
    ----------
    row : mapping-like
        Must contain ``'address__facebook'`` and ``'address__google'``.

    Returns
    -------
    str or float
        * both missing: ``np.nan``;
        * one missing: the other address unchanged;
        * both present and ``m.is_fuzzy_address_matching`` is truthy: the
          longer of the two (Facebook on equal length);
        * both present but not matching: ``'<facebook>/<google>'``.

    ``m.is_fuzzy_address_matching`` comes from the project's ``matching``
    module, which is not shown in this notebook.
    """
    facebook_address = row['address__facebook']
    google_address = row['address__google']
    facebook_missing = pd.isna(facebook_address)
    google_missing = pd.isna(google_address)

    if facebook_missing and google_missing:
        return np.nan
    if google_missing:
        return facebook_address
    if facebook_missing:
        return google_address

    if m.is_fuzzy_address_matching(facebook_address, google_address):
        # Keep the more detailed (longer) variant; Facebook wins a tie.
        if len(google_address) > len(facebook_address):
            return google_address
        return facebook_address
    return facebook_address + '/' + google_address

# Merge the two address columns row by row into `address_unified`.
address_source_columns = ['address__facebook', 'address__google']
matches_simplified['address_unified'] = (
    matches_simplified[address_source_columns].apply(unify_address, axis=1)
)

In [28]:
# Compare the source addresses with the unified result for rows where both sources are present.
(matches_simplified.query('address__facebook.notna() & address__google.notna()')
[['address__facebook','address__google','raw_address__google','address_unified']].head(20))

Unnamed: 0,address__facebook,address__google,raw_address__google,address_unified
996,"134 rue entrepreneurs, za du vigné, 30420, cal...","3+ years in business · Calvisson, France","3+ years in business · Calvisson, France","134 rue entrepreneurs, za du vigné, 30420, cal..."
997,"161 rue du levant, 30420, calvisson, france, l...","CABINET DU LEVANT, 161 Rue du levant, étage1 Z...","CABINET DU LEVANT, 161 Rue du levant, étage1 Z...","161 rue du levant, 30420, calvisson, france, l..."
998,"4 rue marchands, 30420, calvisson, france, lan...","4 Rue des Marchands, 30420 Calvisson, France","Calvisson, France","4 rue marchands, 30420, calvisson, france, lan..."
999,"48 grand rue, 30420, calvisson, france, langue...","48 Grand Rue, 30420 Calvisson, France","Calvisson, France","48 grand rue, 30420, calvisson, france, langue..."
1000,"1 rue des marchands, 30420, calvisson, france,...","1 Rue des Marchands, 30420 Calvisson, France","Calvisson, France","1 rue des marchands, 30420, calvisson, france,..."
1001,"9 rue des genets, 30420, calvisson, france, la...","9 Rue des Genêts, 30420 Calvisson, France","3+ years in business · Calvisson, France","9 rue des genets, 30420, calvisson, france, la..."
1002,"161, rue du levant, 30420, calvisson, france, ...","161 Rue du Levant, 30420 Calvisson, France","5+ years in business · Calvisson, France","161, rue du levant, 30420, calvisson, france, ..."
1003,"915-700 west pender street, v6c 1g8, vancouver...","1090 Homer St #300, Vancouver, BC V6B 2W9, Canada",1090 Homer St #300 · In Urban Systems,"915-700 west pender street, v6c 1g8, vancouver..."
1004,"107–8410 ontario street, v5x 4s6, vancouver, b...","8410 Ontario St, Vancouver, BC V5X 4S6, Canada","7+ years in business · Vancouver, BC, Canada","107–8410 ontario street, v5x 4s6, vancouver, b..."
1005,"310-1275 venables street, v6a 2e4, vancouver, ...","1275 Venables St Studio 310, Vancouver, BC V6A...","Vancouver, BC, Canada","310-1275 venables street, v6a 2e4, vancouver, ..."


In [29]:
# The per-source address columns are superseded by `address_unified`.
address_columns_to_drop = ['address__facebook', 'address__google', 'raw_address__google']
matches_simplified = matches_simplified.drop(columns=address_columns_to_drop)

In [30]:
# Final column list of the simplified frame before it is written out.
matches_simplified.columns

Index(['category__facebook', 'category__google', 'category__website',
       'description__facebook', 'description__google', 'email__facebook',
       'index__facebook', 'index__google', 'index__website',
       'language__website', 'legal_name__website', 'page_type__facebook',
       'country_clean_final', 'region_clean_final', 'city_clean_final',
       'root_domain_final', 'zip_code_unified', 'phone_unified',
       'site_name_final', 'address_unified'],
      dtype='object')

In [31]:
# Write the simplified frame back to the bucket, using the same quoting and escaping as the input file.
MATCHES_SIMPLIFIED_PATH = 'soleadify_sample_data/unified_matched_data/matches_simplified.csv'
with gcs.open(MATCHES_SIMPLIFIED_PATH, 'w') as output_file:
    matches_simplified.to_csv(output_file, index=False, quotechar='"', escapechar='\\')