In [None]:
! pip install -r '/home/jupyter/sample_project/requirements.txt'

In [2]:
import pandas as pd
import gcsfs
import recordlinkage as rl
import matching as m
import helpers as h

In [3]:
from gcsfs.core import GCSFileSystem
# Google Cloud Storage handle, authenticated with the local gcloud
# application-default credentials file.
gcs = GCSFileSystem(
    'courseradataeng',
    token='/home/jupyter/.config/gcloud/application_default_credentials.json',
)

In [4]:
# Read the cleaned website dataset from the bucket into a DataFrame.
with gcs.open('soleadify_sample_data/website_dataset_clean.csv') as website_file:
    website_dataset_clean = pd.read_csv(website_file)


In [5]:
# Read the cleaned Facebook dataset, drop the stray index column written by a
# previous export, and rename its columns to the names the website dataset is
# indexed and queried by in this notebook.
with gcs.open('soleadify_sample_data/facebook_dataset_clean.csv') as facebook_file:
    facebook_dataset_clean = (
        pd.read_csv(facebook_file, quotechar='"', escapechar='\\')
        .drop(columns=['Unnamed: 0'])
        .rename(
            columns={
                'country_final': 'country_clean',
                'region_consistent': 'region_clean',
                'city_consistent': 'city_clean',
                'domain': 'root_domain',
                'name': 'site_name_clean',
            }
        )
    )

In [6]:
# Column groups used throughout the notebook:
#   'block'            -> the location columns (country / region / city)
#   'sorted_neighbour' -> the domain and site-name columns
indexes = {
    'block': ['country_clean', 'region_clean', 'city_clean'],
    'sorted_neighbour': ['root_domain', 'site_name_clean'],
}

In [36]:
# Number of Facebook rows per (country, region, city) combination, most frequent first.
facebook_dataset_clean.value_counts(subset=indexes['block'])

country_clean   region_clean  city_clean 
Canada          Ontario       Toronto        2713
                Alberta       Calgary        2529
                              Edmonton       1788
                Ontario       Mississauga    1580
                              Ottawa          814
                                             ... 
United Kingdom  England       Shirley           1
Canada          Manitoba      Carberry          1
United Kingdom  England       Sheerness         1
Canada          Manitoba      Carman            1
Turkey          Samsun        Unkapani          1
Length: 7268, dtype: int64

In [37]:
# Number of website rows per (country, region, city) combination, most frequent first.
website_dataset_clean.value_counts(subset=indexes['block'])

country_clean  region_clean  city_clean      
Canada         Ontario       Toronto             3292
               Alberta       Calgary             2764
                             Edmonton            1901
               Ontario       Mississauga         1535
                             Ottawa               976
                                                 ... 
Turkey         Manisa        Akhisar                1
Canada         Ontario       Lambton (Forest)       1
                             Lappe                  1
Turkey         Konya         Cumra                  1
United States  Colorado      Fort Morgan            1
Length: 10833, dtype: int64

In [18]:
# Summary of the website country column: non-null count, distinct values, mode and its frequency.
website_dataset_clean.loc[:, 'country_clean'].describe()

count      64834
unique       181
top       Canada
freq       39779
Name: country_clean, dtype: object

The country is missing for some rows in both datasets.
In addition, many countries have only a few entries.
I will move these rows into two separate datasets: "Country not available" and "Low Frequency".

In [7]:
# Every distinct, fully populated (country, region, city) combination found in
# either dataset, as a list of {column: value} records.
combinations = (
    pd.concat(
        [
            facebook_dataset_clean[indexes['block']],
            website_dataset_clean[indexes['block']],
        ]
    )
    .drop_duplicates()
    .dropna()
    .to_dict(orient='records')
)

In [7]:
# Manual walk-through for one location block.
country, region, city = 'Canada', 'Ontario', 'Toronto'

# One filter shared by both sources: rows in the chosen location that have both
# a domain and a site name. `@name` refers to the variables defined above.
subset_query = '''country_clean == @country \
                & region_clean == @region \
                & city_clean == @city \
                & root_domain.notna() \
                & site_name_clean.notna() \
                '''

facebook_subset = facebook_dataset_clean.query(subset_query)
website_subset = website_dataset_clean.query(subset_query)

counts = {
    'facebook': len(facebook_subset),
    'website': len(website_subset),
}

# Independent copies, so later work on either side cannot touch the source frames.
facebook_side = {'origin': 'facebook', 'data': facebook_subset.copy(deep=True)}
website_side = {'origin': 'website', 'data': website_subset.copy(deep=True)}

# The larger subset goes on the left; on a tie the website subset is the left side.
if counts['facebook'] > counts['website']:
    datasets = {'left': facebook_side, 'right': website_side}
else:
    datasets = {'left': website_side, 'right': facebook_side}

In [9]:
# Display the query string currently held in `filtering_query`.
# NOTE(review): in the notebook as laid out, `filtering_query` is first assigned
# in the cell after this one (In [8]), so this cell raises NameError on a fresh
# top-to-bottom run unless that cell has been executed first.
print(filtering_query)

country_clean== "France" & region_clean== "Occitanie" & city_clean== "Calvisson" & root_domain.notna() & site_name_clean.notna()


In [8]:
# Rule per column handed to the helpers. Judging by the printed query, 'value'
# columns are compared with the combination's value and 'notna' columns must be
# populated (presumably; confirm against helpers.get_filtering_dict).
logic_dict = {
    'country_clean': 'value',
    'region_clean': 'value',
    'city_clean': 'value',
    'root_domain': 'notna',
    'site_name_clean': 'notna',
}

# Only the first location combination is processed in this cell.
combination = combinations[0]

filtering_dict = h.get_filtering_dict(logic_dict, combination)
filtering_query = h.get_filtering_query(filtering_dict)

# Source frames and the origin label attached to each of them.
dataset_1, dataset_2 = facebook_dataset_clean, website_dataset_clean
origin_mapping = {'dataset_1': 'facebook', 'dataset_2': 'website'}

datasets = h.filter_data(dataset_1, dataset_2, filtering_query, origin_mapping)

In [10]:
# Match the left and right subsets using the domain / site-name columns,
# then preview the first rows of the result.
subset_of_matches = m.get_matches(
    datasets['left'],
    datasets['right'],
    indexes['sorted_neighbour'],
)
subset_of_matches.head()

Unnamed: 0,address,categories,category_clean,city_clean__facebook,city_clean__website,country_clean__facebook,country_clean__website,description,domain_suffix,email,...,region_clean__facebook,region_clean__website,region_code,root_domain__facebook,root_domain__website,site_name,site_name_clean__facebook,site_name_clean__website,tld,zip_code
0,"161, rue du levant, 30420, calvisson, france, ...",Digital & Marketing Agencies|Commercial Printing,Graphic Design,Calvisson,Calvisson,France,France,,com,,...,Occitanie,Occitanie,occ,studioonoz.com,studioonoz.com,Studio OnOz,Studio OnOz,Studio OnOz,com,30420
1,"1 rue des marchands, 30420, calvisson, france,...",Bicycle Shops|Sports Medicine & Physical Therapy,Atvs Dealers & Services,Calvisson,Calvisson,France,France,,fr,,...,Occitanie,Occitanie,occ,vaunagepassionvelos.fr,vaunagepassionvelos.fr,Vaunage Passion Velos,VaunagePassion Vélos,Vaunage Passion Velos,fr,30420
2,"4 rue marchands, 30420, calvisson, france, lan...",,"Meat, Fish & Seafood Stores",Calvisson,Calvisson,France,France,,fr,,...,Occitanie,Occitanie,occ,meney-salaisons.fr,meney-salaisons.fr,Meney Salaisons Traiteur,Meney Traiteur,Meney Salaisons Traiteur,fr,30420


In [14]:
# Matching levels, ordered from the most to the least location information.
# Each level maps a column to the rule applied to it: 'value', 'notna' or 'isna'.
levels_of_logic = {
    # country, region and city all carry a value
    'level_1': {
        'country_clean': 'value',
        'region_clean': 'value',
        'city_clean': 'value',
        'root_domain': 'notna',
        'site_name_clean': 'notna',
    },
    # city is 'isna'
    'level_2': {
        'country_clean': 'value',
        'region_clean': 'value',
        'city_clean': 'isna',
        'root_domain': 'notna',
        'site_name_clean': 'notna',
    },
    # region and city are 'isna'
    'level_3': {
        'country_clean': 'value',
        'region_clean': 'isna',
        'city_clean': 'isna',
        'root_domain': 'notna',
        'site_name_clean': 'notna',
    },
    # all three location columns are 'isna'
    'level_4': {
        'country_clean': 'isna',
        'region_clean': 'isna',
        'city_clean': 'isna',
        'root_domain': 'notna',
        'site_name_clean': 'notna',
    },
    # location columns and site name are 'isna'; only the domain is 'notna'
    'level_5': {
        'country_clean': 'isna',
        'region_clean': 'isna',
        'city_clean': 'isna',
        'root_domain': 'notna',
        'site_name_clean': 'isna',
    },
}
# The two source frames and the origin label attached to each of them.
dataset_1, dataset_2 = facebook_dataset_clean, website_dataset_clean
origin_mapping = {'dataset_1': 'facebook', 'dataset_2': 'website'}

In [None]:
# For every matching level, split its columns by rule:
#   'value' -> blocking_columns, 'notna' -> similar_columns.
# Both lists are rebuilt for each level so a level only ever sees its own
# columns; building them once outside the loop makes them accumulate (and
# duplicate) the columns of all earlier levels.
blocking_columns = []
similar_columns = []
for logic_dict in levels_of_logic.values():
    blocking_columns = [column for column, rule in logic_dict.items() if rule == 'value']
    similar_columns = [column for column, rule in logic_dict.items() if rule == 'notna']

    # TODO: per-level matching pipeline, still disabled.
    # NOTE(review): file_name below depends only on `index`, which restarts at 0
    # for every level, so enabling this as written would presumably overwrite
    # the files of earlier levels; consider adding the level name to it.
    #
    # combinations = h.get_combinations(dataset_1, dataset_2, blocking_columns)
    #
    # for index, combination in enumerate(combinations):
    #
    #     filtering_dict = h.get_filtering_dict(logic_dict, combination)
    #     filtering_query = h.get_filtering_query(filtering_dict)
    #
    #     datasets = h.filter_data(dataset_1, dataset_2, filtering_query, origin_mapping)
    #
    #     subset_of_matches = m.get_matches(datasets['left'], datasets['right'], indexes['sorted_neighbour'])
    #
    #     file_name = 'subset_of_matches_'+str(index)+'.csv'
    #     with gcs.open('soleadify_sample_data/facebook_website_matches/'+file_name,'w') as f:
    #         subset_of_matches.to_csv(f, index = False)