# Import packages and data

In [21]:

import pandas as pd
import numpy as np
from utils.create_features import jaro, WRatio, ratio, davies, token_set_ratio, add_lat_lon_distance_features, strike_a_match, leven
from utils.features_utils import name_distance, extract_directions, is_direction_match, name_number_match,\
    is_related_cat, category_match, clean_email, sub_category_match, brand_match, house_match, email_url_match, phone_lcs,phone_category
from Config import config
import glob
import torch

In [22]:
# Load the cleaned place table for the configured country.
# postalCode and houseNumber are read as strings rather than letting pandas infer a dtype.
places_csv_path = config.input_dir + f"Fuse_exploded_{config.country}_cleaned.csv"
df = pd.read_csv(
    places_csv_path,
    engine="c",
    dtype={"postalCode": "str", "houseNumber": "str"},
)

In [23]:
# Collect the candidate-pair parquet batches written for the configured country.
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of everything built from these files reproducible across runs.
file_list = sorted(
    glob.glob(
        config.output_stage1 + f"batch_candidates/{config.country}_parquet/*.parquet"
    )
)

In [24]:
# Read every candidate-pair batch, then stack them with a single concat.
# Calling pd.concat inside the loop re-copies all previously loaded rows on
# each iteration; collecting the frames first avoids that quadratic cost.
candidate_batches = [
    pd.read_parquet(batch_path, engine='fastparquet') for batch_path in file_list
]
# pd.concat raises on an empty list, so fall back to an empty frame when no
# batch files were found (the same result the incremental loop produced).
df_pairs = pd.concat(candidate_batches) if candidate_batches else pd.DataFrame()

In [25]:
# Preview the first few candidate pairs loaded from the parquet batches.
df_pairs.head()

Unnamed: 0,ltable_id,rtable_id,placeId1,placeId2
0,16284,17245,efc64577-435d-4569-bdd8-094fb2b4d8a7,463c7723-b5a2-44c0-a62a-b5826cd470f2
1,5290,17690,f33d9b88-d0a3-452e-bca7-061c53e9a69b,944d0537-8c1a-4bf3-87a5-282b9a0e5ba3
2,6549,15895,b54a972a-e9d3-4192-9448-e92c30f4ca13,3bd8ba6e-fa2e-49d2-a7e3-722460c6bc98
3,6484,15211,500069d8-b079-40e4-b8c3-4603d7852dbe,82f388c4-02be-46bf-9269-2e9fe4de6a2f
4,13359,15509,61d1317e-9bf0-42ab-8079-11664c1081fe,8c98a735-9177-4014-8b8f-508b486cc7fa


In [26]:
# Attach the place attributes to both sides of every candidate pair.
# The first join (on ltable_id) brings in the left place's columns unsuffixed;
# the second join (on rtable_id) then suffixes the overlapping columns with
# "1" (left place) and "2" (right place).
# The pair table already carries placeId1/placeId2, so the `placeId` column
# coming from `df` is dropped after each join.
df_pairs = (
    df_pairs
    .merge(df, how="left", left_on=["ltable_id"], right_on=["Id"])
    .drop(columns="placeId")
    .merge(
        df,
        how="left",
        left_on=["rtable_id"],
        right_on=["Id"],
        suffixes=["1", "2"],
    )
    .drop(columns="placeId")
)

In [27]:
# Raw attribute columns kept for each pair (left place = "1", right place = "2").
# NOTE: `cols` is reassigned further down with the full feature list before it
# is used for the export.
cols = [
    'country',
    'placeId1', 'placeId2',
    'sourceNames1', 'sourceNames2',
    'category1', 'category2',
    'brands1', 'brands2',
    'email1', 'email2',
    'latitude1', 'longitude1',
    'latitude2', 'longitude2',
    'houseNumber1', 'houseNumber2',
    'streets1', 'streets2',
    'cities1', 'cities2',
    'subCategory1', 'subCategory2',
    'phoneNumbers1', 'phoneNumbers2',
    'internet1', 'internet2',
    'postalCode1', 'postalCode2',
]

In [28]:
# The phone-number columns come back from the CSV as strings; evaluate each
# cell so it becomes a Python object again.
# NOTE(review): eval executes arbitrary code found in the cell text. This is
# only safe while the input CSV is trusted; consider ast.literal_eval after
# confirming every stored value is a plain literal.
for phone_col in ("phoneNumbers1", "phoneNumbers2"):
    df_pairs[phone_col] = df_pairs[phone_col].apply(eval)

In [29]:
# Sanity check: (rows, columns) of the pair table after the joins.
df_pairs.shape

(10933, 38)

# Create Features

In [30]:
# Score every pair of place names with name_distance and keep the result both
# as the `similarity` variable and as a column on the pair table.
# (A torch.cuda.amp.autocast wrapper was tried around this call and is left disabled.)
left_names = list(df_pairs["sourceNames1"])
right_names = list(df_pairs["sourceNames2"])
similarity = name_distance(left_names, right_names)
df_pairs["similarity"] = similarity

  0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/171 [00:00<?, ?it/s]

In [31]:
# Add the distance feature(s) produced by add_lat_lon_distance_features.
# NOTE(review): presumably computed from the latitude/longitude columns and the
# source of the 'haversine' column exported later — confirm in utils.create_features.
df_pairs = add_lat_lon_distance_features(df_pairs)

In [32]:
# Run extract_directions on each side's place name, then compare the two
# extracted values per pair with is_direction_match.
for direction_col, name_col in (
    ("direction1", "sourceNames1"),
    ("direction2", "sourceNames2"),
):
    df_pairs[direction_col] = df_pairs[name_col].apply(extract_directions)


def _direction_agreement(pair_row):
    """Compare the extracted directions of the left and right place of one pair."""
    return is_direction_match(pair_row["direction1"], pair_row["direction2"])


df_pairs['Is_direction_match'] = df_pairs.apply(_direction_agreement, axis=1)


In [33]:
# Capture the first run of digits in each place name (missing when the name has
# no digits), then compare the two captures per pair with name_number_match.
# The pattern is a raw string: in a normal string literal `\d` is an invalid
# escape sequence, which Python reports with a warning and plans to reject.
df_pairs["name1_number"] = df_pairs["sourceNames1"].str.extract(r'(\d+)')
df_pairs["name2_number"] = df_pairs["sourceNames2"].str.extract(r'(\d+)')
df_pairs['Is_name_number_match'] = df_pairs.apply(lambda x: name_number_match(x.name1_number, x.name2_number), axis=1)

In [34]:
# Category-level agreement flags: (output column, comparison helper, left column, right column).
category_checks = (
    ("Is_related_cat", is_related_cat, "category1", "category2"),
    ("Is_category_match", category_match, "category1", "category2"),
    ("Is_subcategory_match", sub_category_match, "subCategory1", "subCategory2"),
)
for feature_name, match_fn, left_col, right_col in category_checks:
    # One helper call per pair, applied row by row.
    df_pairs[feature_name] = df_pairs.apply(
        lambda pair_row: match_fn(pair_row[left_col], pair_row[right_col]),
        axis=1,
    )

In [35]:
# Brand, house-number and phone agreement codes:
# (output column, comparison helper, left column, right column).
attribute_checks = (
    ("Is_brand_match", brand_match, "brands1", "brands2"),
    ("Is_house_match", house_match, "houseNumber1", "houseNumber2"),
    ("is_phone_match", phone_category, "phoneNumbers1", "phoneNumbers2"),
)
for feature_name, match_fn, left_col, right_col in attribute_checks:
    df_pairs[feature_name] = df_pairs.apply(
        lambda pair_row: match_fn(pair_row[left_col], pair_row[right_col]),
        axis=1,
    )

In [36]:
# E-mail and website agreement codes; both use the same email_url_match helper,
# only the compared columns differ.
contact_checks = {
    "Is_email_match": ("email1", "email2"),
    "Is_url_match": ("internet1", "internet2"),
}
for feature_name, (left_col, right_col) in contact_checks.items():
    df_pairs[feature_name] = df_pairs.apply(
        lambda pair_row: email_url_match(pair_row[left_col], pair_row[right_col]),
        axis=1,
    )


In [37]:
# String-similarity scores between the two place names, one column per metric.
name_metrics = (
    ("name_davies", davies),
    ("name_leven", leven),
    ("name_dice", strike_a_match),
    ("name_jaro", jaro),
    ("name_set_ratio", token_set_ratio),
)
for feature_name, metric_fn in name_metrics:
    df_pairs[feature_name] = df_pairs.apply(
        lambda pair_row: metric_fn(pair_row["sourceNames1"], pair_row["sourceNames2"]),
        axis=1,
    )

In [38]:
# String-similarity scores between the two street values, one column per metric.
street_metrics = (
    ("street_davies", davies),
    ("street_leven", leven),
    ("street_jaro", jaro),
)
for feature_name, metric_fn in street_metrics:
    df_pairs[feature_name] = df_pairs.apply(
        lambda pair_row: metric_fn(pair_row["streets1"], pair_row["streets2"]),
        axis=1,
    )

In [39]:
# String-similarity scores between the two e-mail values, one column per metric.
email_metrics = (
    ("email_davies", davies),
    ("email_leven", leven),
    ("email_jaro", jaro),
)
for feature_name, metric_fn in email_metrics:
    df_pairs[feature_name] = df_pairs.apply(
        lambda pair_row: metric_fn(pair_row["email1"], pair_row["email2"]),
        axis=1,
    )

In [40]:
# String-similarity scores between the two `internet` (website) values, one column per metric.
url_metrics = (
    ("url_davies", davies),
    ("url_leven", leven),
    ("url_jaro", jaro),
)
for feature_name, metric_fn in url_metrics:
    df_pairs[feature_name] = df_pairs.apply(
        lambda pair_row: metric_fn(pair_row["internet1"], pair_row["internet2"]),
        axis=1,
    )

In [41]:
# String-similarity scores between the two brand values, one column per metric.
# The column names are deliberately left as-is ('brands_davies' vs 'brand_leven' /
# 'brand_jaro') because the export column list refers to them by these names.
brand_metrics = (
    ("brands_davies", davies),
    ("brand_leven", leven),
    ("brand_jaro", jaro),
)
for feature_name, metric_fn in brand_metrics:
    df_pairs[feature_name] = df_pairs.apply(
        lambda pair_row: metric_fn(pair_row["brands1"], pair_row["brands2"]),
        axis=1,
    )

In [42]:
# Phone-number and sub-category comparison scores:
# (output column, scoring helper, left column, right column).
phone_and_subcat_scores = (
    ("phone_lcs", phone_lcs, "phoneNumbers1", "phoneNumbers2"),
    ("subcat_WRatio", WRatio, "subCategory1", "subCategory2"),
    ("subcat_ratio", ratio, "subCategory1", "subCategory2"),
    ("subcat_token_set_ratio", token_set_ratio, "subCategory1", "subCategory2"),
)
for feature_name, score_fn, left_col, right_col in phone_and_subcat_scores:
    df_pairs[feature_name] = df_pairs.apply(
        lambda pair_row: score_fn(pair_row[left_col], pair_row[right_col]),
        axis=1,
    )

In [43]:
# Categorical match-code columns that are passed to pd.get_dummies.
cat_columns = [
    'Is_direction_match',
    'Is_house_match',
    'Is_category_match',
    'Is_subcategory_match',
    'Is_brand_match',
    'Is_related_cat',
    'Is_name_number_match',
    'is_phone_match',
    'Is_email_match',
    'Is_url_match',
]


In [44]:
# One-hot encode the categorical match codes; each listed column is replaced by
# `<column>_<value>` indicator columns.
# Only values that actually occur in the data get a column, so the resulting
# column set can differ between datasets.
df_pairs = pd.get_dummies(df_pairs, columns = cat_columns)

In [45]:
# Columns written to the feature file, in export order.
cols = (
    # Raw attributes of both places (left = "1", right = "2").
    [
        'country', 'placeId1', 'placeId2',
        'sourceNames1', 'sourceNames2',
        'category1', 'category2',
        'brands1', 'brands2',
        'email1', 'email2',
        'latitude1', 'longitude1', 'latitude2', 'longitude2',
        'houseNumber1', 'houseNumber2',
        'streets1', 'streets2',
        'cities1', 'cities2',
        'subCategory1', 'subCategory2',
        'phoneNumbers1', 'phoneNumbers2',
        'internet1', 'internet2',
        'postalCode1', 'postalCode2',
    ]
    # Numeric similarity / distance scores.
    + [
        'similarity', 'haversine',
        'name_davies', 'name_leven', 'name_dice', 'name_jaro', 'name_set_ratio',
        'street_davies', 'street_leven', 'street_jaro',
        'email_davies', 'email_leven', 'email_jaro',
        'url_davies', 'url_leven', 'url_jaro',
        'brands_davies', 'brand_leven', 'brand_jaro',
        'phone_lcs',
        'subcat_WRatio', 'subcat_ratio', 'subcat_token_set_ratio',
    ]
    # One-hot indicator columns derived from the categorical match codes.
    + [
        'Is_direction_match_0', 'Is_direction_match_1', 'Is_direction_match_2',
        'Is_house_match_0', 'Is_house_match_1', 'Is_house_match_2',
        'Is_category_match_0', 'Is_category_match_1',
        'Is_subcategory_match_0', 'Is_subcategory_match_1', 'Is_subcategory_match_2',
        'Is_brand_match_0', 'Is_brand_match_1', 'Is_brand_match_2', 'Is_brand_match_3',
        'Is_related_cat_0', 'Is_related_cat_1',
        'Is_name_number_match_0', 'Is_name_number_match_1',
        'Is_name_number_match_2', 'Is_name_number_match_3',
        'is_phone_match_1', 'is_phone_match_2', 'is_phone_match_3', 'is_phone_match_4',
        'Is_email_match_0', 'Is_email_match_1', 'Is_email_match_2',
        'Is_url_match_0', 'Is_url_match_1', 'Is_url_match_2',
    ]
)


In [46]:
# Stamp every pair with the configured country; 'country' is one of the exported columns.
df_pairs["country"] = config.country

In [47]:
# The export column list expects these indicator columns, but get_dummies only
# creates a column for values that occur in the data. Add them as all-zero
# columns when they are absent.
# Guarded on purpose: assigning 0 unconditionally would wipe the real indicator
# values whenever the column does exist.
for expected_dummy in ("Is_direction_match_0", "is_phone_match_4"):
    if expected_dummy not in df_pairs.columns:
        df_pairs[expected_dummy] = 0

In [49]:
# Write the selected columns to a zstd-compressed parquet file whose name
# carries the country and the COSINE_NEIGHBORS setting.
features_path = (
    config.root_dir
    + f"outputs/df_pairs_features_{config.country}_{config.COSINE_NEIGHBORS}.parquet"
)
export_frame = df_pairs[cols]
export_frame.to_parquet(features_path, compression="zstd", index=None)