In [1]:
! pip install recordlinkage
! pip install fsspec
! pip install gcsfs



In [1]:
import pandas as pd
import gcsfs
import recordlinkage as rl

In [2]:
from gcsfs.core import GCSFileSystem
# Google Cloud Storage file-system client; later cells open the CSV inputs through it.
# NOTE(review): the credentials path below is specific to one machine/user --
# consider making it configurable before sharing the notebook.
gcs_project = 'courseradataeng'
gcs_token_file = '/home/jupyter/.config/gcloud/application_default_credentials.json'
gcs = GCSFileSystem(project=gcs_project, token=gcs_token_file)

In [3]:
# Read the cleaned website dataset from GCS into a DataFrame.
website_csv_path = 'soleadify_sample_data/website_dataset_clean.csv'
with gcs.open(website_csv_path) as website_csv:
    website_dataset_clean = pd.read_csv(website_csv)


In [4]:
# Read the cleaned Facebook dataset from GCS and rename its columns to the
# names that later cells use on both the Facebook and the website frame.
facebook_column_renames = {
    'country_final': 'country_clean',
    'region_consistent': 'region_clean',
    'city_consistent': 'city_clean',
    'domain': 'root_domain',
    'name': 'site_name_clean',
}

with gcs.open('soleadify_sample_data/facebook_dataset_clean.csv') as facebook_csv:
    facebook_raw = pd.read_csv(facebook_csv, quotechar='"', escapechar='\\')

# 'Unnamed: 0' is removed before the rename is applied.
facebook_dataset_clean = (
    facebook_raw
    .drop(columns='Unnamed: 0')
    .rename(columns=facebook_column_renames)
)

In [5]:
# Column groups used when pairing records, keyed by strategy name:
#   'block'            -> location columns used to select one block of records
#   'sorted_neighbour' -> columns fed to the sorted-neighbourhood indexer and
#                         to the string comparison
indexes = dict(
    block=['country_clean', 'region_clean', 'city_clean'],
    sorted_neighbour=['root_domain', 'site_name_clean'],
)

In [31]:
# Display the list of columns stored under the 'block' key.
indexes['block']

['country_clean', 'region_clean', 'city_clean']

In [36]:
# Number of Facebook records per (country, region, city) combination, most frequent first.
facebook_dataset_clean.value_counts(subset=indexes['block'])

country_clean   region_clean  city_clean 
Canada          Ontario       Toronto        2713
                Alberta       Calgary        2529
                              Edmonton       1788
                Ontario       Mississauga    1580
                              Ottawa          814
                                             ... 
United Kingdom  England       Shirley           1
Canada          Manitoba      Carberry          1
United Kingdom  England       Sheerness         1
Canada          Manitoba      Carman            1
Turkey          Samsun        Unkapani          1
Length: 7268, dtype: int64

In [37]:
# Number of website records per (country, region, city) combination, most frequent first.
website_dataset_clean.value_counts(subset=indexes['block'])

country_clean  region_clean  city_clean      
Canada         Ontario       Toronto             3292
               Alberta       Calgary             2764
                             Edmonton            1901
               Ontario       Mississauga         1535
                             Ottawa               976
                                                 ... 
Turkey         Manisa        Akhisar                1
Canada         Ontario       Lambton (Forest)       1
                             Lappe                  1
Turkey         Konya         Cumra                  1
United States  Colorado      Fort Morgan            1
Length: 10833, dtype: int64

In [18]:
# Summary of the country column: non-null count, distinct values, most common value and its frequency.
website_dataset_clean['country_clean'].describe()

count      64834
unique       181
top       Canada
freq       39779
Name: country_clean, dtype: object

The country is missing for some records in both datasets.
There are also many countries with only a few entries.
I will move these records into two separate datasets: "Country not available" and "Low frequency".

In [38]:
def _rows_from_frequent_countries(frame, min_rows=10):
    """Return the rows of ``frame`` whose country occurs in at least ``min_rows`` rows.

    Rows are grouped on ``country_clean``; groups smaller than ``min_rows``
    are discarded. Rows with a missing country fall into no group (pandas
    groupby default) and are therefore not returned either.
    """
    return frame.groupby('country_clean').filter(lambda grp: grp.shape[0] >= min_rows)


facebook_country_high_frequency = _rows_from_frequent_countries(facebook_dataset_clean)

website_country_high_frequency = _rows_from_frequent_countries(website_dataset_clean)

# Distinct country names that are frequent in at least one of the two datasets.
frequent_country_values = pd.concat(
    [
        facebook_country_high_frequency['country_clean'],
        website_country_high_frequency['country_clean'],
    ]
)
countries = frequent_country_values.unique()

In [9]:
# Every distinct, fully populated (country, region, city) combination that
# appears in either dataset.
block_columns = indexes['block']
stacked_locations = pd.concat(
    [facebook_dataset_clean[block_columns], website_dataset_clean[block_columns]]
)
combinations = stacked_locations.drop_duplicates().dropna()

combinations.head()

Unnamed: 0,country_clean,region_clean,city_clean
0,France,Occitanie,Calvisson
3,Canada,British Columbia,Vancouver
4,Canada,Manitoba,Winnipeg
6,Canada,Ontario,Port Dover
10,Australia,Victoria,Eltham


In [60]:
# Location block whose records are linked in the cells below.
country = 'Canada'
region = 'Ontario'
city = 'Toronto'


def _select_location_block(frame, country, region, city):
    """Return the rows of ``frame`` for one location that carry both linkage keys.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``country_clean``, ``region_clean``,
        ``city_clean``, ``root_domain`` and ``site_name_clean``.
    country, region, city
        Values the three location columns must equal.

    Returns
    -------
    pandas.DataFrame
        Rows matching the location whose ``root_domain`` and
        ``site_name_clean`` are both non-null, in their original order.
    """
    in_block = (
        frame['country_clean'].eq(country)
        & frame['region_clean'].eq(region)
        & frame['city_clean'].eq(city)
    )
    has_keys = frame['root_domain'].notna() & frame['site_name_clean'].notna()
    return frame.loc[in_block & has_keys]


facebook_subset = _select_location_block(facebook_dataset_clean, country, region, city)
website_subset = _select_location_block(website_dataset_clean, country, region, city)

counts = {'facebook': facebook_subset.shape[0],
          'website': website_subset.shape[0]}

# The larger subset goes on the left; on a tie the website data is the left side.
subsets_by_origin = {'facebook': facebook_subset, 'website': website_subset}
if counts['facebook'] > counts['website']:
    left_origin, right_origin = 'facebook', 'website'
else:
    left_origin, right_origin = 'website', 'facebook'

# Deep copies keep the linkage inputs independent of the *_subset frames.
datasets = {
    'left': {'origin': left_origin,
             'data': subsets_by_origin[left_origin].copy(deep=True)},
    'right': {'origin': right_origin,
              'data': subsets_by_origin[right_origin].copy(deep=True)},
}

In [61]:
# Build candidate record pairs: one sorted-neighbourhood pass per key column,
# all registered on the same indexer.
indexer = rl.Index()
for column in indexes['sorted_neighbour']:
    neighbourhood = rl.index.SortedNeighbourhood(
        left_on=column,
        right_on=column,
        window=3,
    )
    indexer.add(neighbourhood)

left_records = datasets['left']['data']
right_records = datasets['right']['data']
combined_index = indexer.index(left_records, right_records)

In [62]:
# Keep only the first occurrence of each candidate pair.
# NOTE: the name is misspelled ("unqiue") but is kept because a later cell uses it.
repeated_pair_mask = combined_index.duplicated(keep='first')
combined_index_unqiue = combined_index[~repeated_pair_mask]

In [63]:
# Compare every candidate pair on each key column with Jaro-Winkler similarity.
# Because a threshold is passed, recordlinkage reports 1.0 when the similarity
# reaches the threshold and 0.0 otherwise, so the "*_similarity_score" columns
# are binary flags rather than raw scores.
compare = rl.Compare(n_jobs=-1)
for column in indexes['sorted_neighbour']:
    compare.string(
        column,
        column,
        method='jarowinkler',
        threshold=0.85,
        label=f'{column}_similarity_score',
    )

left_records = datasets['left']['data']
right_records = datasets['right']['data']
comparison_vectors = compare.compute(combined_index_unqiue, left_records, right_records)

In [71]:
# Pairs for which both comparison columns equal 1.0 are treated as matches.
link_filter = 'root_domain_similarity_score == 1.0 & site_name_clean_similarity_score == 1.0'
true_links = comparison_vectors.query(link_filter).index

# Level 0 of the pair index holds left-frame labels, level 1 right-frame labels.
left_labels = true_links.get_level_values(0).tolist()
right_labels = true_links.get_level_values(1).tolist()

# reset_index() moves the original labels into an 'index' column and gives both
# frames a fresh positional index, one row per matched pair.
match_left = datasets['left']['data'].loc[left_labels].reset_index()
match_right = datasets['right']['data'].loc[right_labels].reset_index()

In [74]:
# Put each matched pair side by side. The join aligns on the index of the two
# match frames; columns present in both get a "__<origin>" suffix.
left_suffix = '__' + datasets['left']['origin']
right_suffix = '__' + datasets['right']['origin']
subset_of_matches = match_left.join(
    match_right,
    lsuffix=left_suffix,
    rsuffix=right_suffix,
)

In [75]:
# Preview the first rows of the joined matches.
subset_of_matches.head()

Unnamed: 0,index__website,root_domain__website,domain_suffix,language,phone__website,site_name,tld,site_name_clean__website,category_clean,legal_name_clean,...,email,link,site_name_clean__facebook,page_type,phone__facebook,region_code,zip_code,country_clean__facebook,city_clean__facebook,region_clean__facebook
0,9,micacchi.ca,ca,en,16477257799,Micacchi Architecture,ca,Micacchi Architecture,Architects & Architectural Services,,...,,https://micacchi.ca,Micacchi Architecture,LocalBusiness,,on,m6p 1t4,Canada,Toronto,Ontario
1,44,argogold.ca,ca,,14167867860,Argo Gold,ca,Argo Gold,Mining & Gas Exploration,ARGO GOLD INC,...,,https://argogold.ca,"Argo Gold, Inc.",Organization,14167870000.0,on,m5h2s6,Canada,Toronto,Ontario
2,48,aporia-records.com,com,en,14169448034,Aporia,com,Aporia,Art & Music Publishers,,...,info@aporia-records.com,http://aporia-records.com,Aporia,Organization,,on,m4m 1y3,Canada,Toronto,Ontario
3,74,robertjonesmenswear.com,com,en,14163626837,Robert Jones Menswear,com,Robert Jones Menswear,Clothing Stores,INIS MEAIN KNITTING COMPANY,...,robertjonesmenswear@bellnet.ca,http://robertjonesmenswear.com,Robert Jones Menswear,LocalBusiness,14163630000.0,on,m5j 2j1,Canada,Toronto,Ontario
4,126,classinsession.ca,ca,en,14164772262,Class in Session,ca,Class in Session,"Learning, Tutoring & Courses",,...,,https://classinsession.ca,Class In Session,LocalBusiness,14164770000.0,on,,Canada,Toronto,Ontario
