In [1]:
import pandas as pd
from Config import config
from utils.create_features import add_lat_lon_distance_features
import glob
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm
import torch

In [2]:
# Load the cleaned place table for the configured country.
# postalCode / houseNumber are forced to str so pandas does not parse them as numbers.
df = pd.read_csv(
    config.input_dir + f"Fuse_{config.country}_cleaned.csv",
    engine='c',
    dtype={"postalCode": "str", "houseNumber": "str"},
)

In [3]:
# Collect every candidate-pair parquet shard written for this country.
# glob returns the matches in arbitrary (filesystem) order.
file_list = glob.glob(config.output_stage1 + f"batch_candidates/{config.country}_parquet/*.parquet")

In [4]:
# Read every candidate-pair shard and stack them with a single concat.
# Concatenating inside the loop re-copies the accumulated frame for each shard
# (quadratic in the number of shards); one concat over a list is linear.
# Shards are read in sorted order so the resulting row order is reproducible
# across runs (glob gives no ordering guarantee).
# With no shards on disk the result stays an empty DataFrame, as before.
df_pairs = (
    pd.concat([pd.read_parquet(path, engine='pyarrow') for path in sorted(file_list)])
    if file_list
    else pd.DataFrame()
)

In [5]:
# First join: look up placeId1 in the place table. The joined-in columns keep
# their plain names here; the redundant right-hand key is dropped afterwards.
df_pairs = (
    df_pairs
    .merge(df, how='left', left_on=['placeId1'], right_on=['placeId'])
    .drop(columns='placeId')
)

# Second join: look up placeId2. Column names that now exist on both sides are
# disambiguated with the suffixes "1" (left / first place) and "2" (right /
# second place), e.g. officialName1 / officialName2.
df_pairs = (
    df_pairs
    .merge(df, how='left', left_on=['placeId2'], right_on=['placeId'], suffixes=["1", "2"])
    .drop(columns='placeId')
)

In [6]:
# Columns kept for each candidate pair, listed as (place 1, place 2) pairs.
cols = [
    'placeId1', 'placeId2',
    'officialName1', 'officialName2',
    'latitude1', 'latitude2',
    'longitude1', 'longitude2',
    'subCategory1', 'subCategory2',
    'category1', 'category2',
    'cities1', 'cities2',
    'streets1', 'streets2',
    'brands1', 'brands2',
    'email1', 'email2',
    'houseNumber1', 'houseNumber2',
    'internet1', 'internet2',
    'phoneNumbers1', 'phoneNumbers2',
    'postalCode1', 'postalCode2',
]


In [7]:
# Keep only the pair columns listed in `cols`, in that order.
df_pairs = df_pairs.loc[:, cols]

In [8]:
# Add distance feature(s) via the project helper from utils.create_features.
# NOTE(review): judging by the displayed frame further down this appears to add
# a single `haversine` column; its units are not visible here - confirm in the helper.
df_pairs = add_lat_lon_distance_features(df_pairs)

In [9]:
# Sanity check: (number of candidate pairs, number of columns) after feature creation.
df_pairs.shape

(2990578, 29)

In [10]:
def name_distance(name1, name2, batch=300000, model=None,
                  model_path='/workspace/clustering/models/minilLM_model/57600',
                  max_seq_length=64):
    """Return the row-wise embedding similarity between two aligned lists of names.

    Each name is encoded with a sentence-transformers model using
    ``normalize_embeddings=True``; the similarity of pair ``k`` is the dot product
    of the two embeddings (i.e. their cosine similarity), rounded to 3 decimals.

    Parameters
    ----------
    name1, name2 : sequence of str
        Names to compare; ``name1[k]`` is compared with ``name2[k]``. Both must
        support ``len`` and slicing and have the same length.
    batch : int, default 300000
        Number of pairs encoded per outer step. This bounds how many embeddings
        are held in memory at once; it is not the model's forward batch size
        (which is fixed at 512).
    model : object, optional
        Pre-loaded encoder exposing ``encode(...)`` like ``SentenceTransformer``.
        Pass one to avoid reloading the weights on every call. When ``None``
        the model is loaded from ``model_path``.
    model_path : str
        Location of the model to load when ``model`` is not supplied.
    max_seq_length : int, default 64
        Value assigned to ``model.max_seq_length`` before encoding. It is applied
        to a caller-supplied ``model`` as well, so results do not depend on how
        that model was configured beforehand.

    Returns
    -------
    numpy.ndarray
        1-D array with one similarity per pair (float32 when the encoder returns
        float32 embeddings). Empty input yields an empty float32 array.

    Raises
    ------
    ValueError
        If the inputs differ in length or ``batch`` is smaller than 1.
    """
    if len(name1) != len(name2):
        raise ValueError(
            f"name1 and name2 must have the same length, got {len(name1)} and {len(name2)}"
        )
    if batch < 1:
        raise ValueError(f"batch must be a positive integer, got {batch}")
    if len(name1) == 0:
        # Nothing to encode: skip the (expensive) model load entirely.
        return np.empty((0), np.float32)

    if model is None:
        model = SentenceTransformer(model_path)
    model.max_seq_length = max_seq_length

    # Seed with an empty float32 array so the result dtype is promoted exactly as
    # it would be when appending to a float32 accumulator; the chunks are joined
    # once at the end instead of re-copying the accumulator on every iteration.
    chunks = [np.empty((0), np.float32)]
    for i in tqdm(range(0, len(name1), batch)):
        embeddings1 = model.encode(name1[i : i + batch], batch_size=512, show_progress_bar=True, normalize_embeddings=True)
        embeddings2 = model.encode(name2[i : i + batch], batch_size=512, normalize_embeddings=True)
        # Embeddings are unit-normalised, so the row-wise dot product is the cosine similarity.
        cosine = np.sum(embeddings1 * embeddings2, axis=1)
        chunks.append(np.round(cosine, 3))
    return np.concatenate(chunks)



In [11]:
# Encode both name columns under CUDA automatic mixed precision and compute the
# per-pair name similarity.
# NOTE(review): newer torch releases deprecate torch.cuda.amp.autocast in favour
# of torch.autocast / torch.amp.autocast - check against the installed version.
with torch.cuda.amp.autocast(enabled=True):
    similarity = name_distance(
        name1=df_pairs["officialName1"].tolist(),
        name2=df_pairs["officialName2"].tolist(),
    )

  0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/568 [00:00<?, ?it/s]

In [12]:
# Store the per-pair name similarity as a new column; `similarity` holds one
# value per row of df_pairs, in the same row order the names were passed in.
df_pairs["similarity"] = similarity

In [14]:
# Display the final pair table (pandas shows a truncated preview).
# NOTE(review): this cell's execution count (14) is higher than the save cell
# below (13), i.e. it was run out of order - re-run the notebook top to bottom.
df_pairs

Unnamed: 0,placeId1,placeId2,officialName1,officialName2,latitude1,latitude2,longitude1,longitude2,subCategory1,subCategory2,...,houseNumber1,houseNumber2,internet1,internet2,phoneNumbers1,phoneNumbers2,postalCode1,postalCode2,haversine,similarity
0,1db029e2-7782-4b48-a11d-8cba87620e0a,3644200b-7e37-4a38-9a02-fcb37640660c,duns,christchurch parkside,-43.540314,-43.539330,172.624405,172.619705,tax services,child care facility,...,100,438,www.duns.co.nz,,6433650768,6433657606,80110,80110,0.394326,0.680
1,50d9a0f0-ed89-41b5-900c-b7a24100eefb,1db029e2-7782-4b48-a11d-8cba87620e0a,value cars warehouse,duns,-43.540516,-43.540314,172.618942,172.624405,car,tax services,...,32,100,www.thevaluecarswharehouse.co.nz,www.duns.co.nz,6433667768,6433650768,80110,80110,0.440670,0.733
2,499d4c0a-b74e-4f5e-9827-6b714c7dd34e,8cd6170b-95c3-4e94-a23d-25e730f76d6d,methodist,forsyth barr,-43.548260,-43.541759,172.621521,172.616898,church,diversified financials,...,,12,www.methodist.org.nz,www.forsythbarr.co.nz,6433650844,6433654244,80240,80240,0.813047,0.264
3,60db419d-df9f-40ea-9b04-4a9943c053bb,499d4c0a-b74e-4f5e-9827-6b714c7dd34e,dairy farm supplies,methodist,-43.540844,-43.548260,172.618195,172.621521,farm,church,...,58,,,www.methodist.org.nz,6433665049,6433650844,80110,80240,0.866951,0.849
4,c7b915fc-8a45-4457-aacb-63d69b99243a,499d4c0a-b74e-4f5e-9827-6b714c7dd34e,dairy farm supplies,methodist,-43.540844,-43.548260,172.618210,172.621521,agricultural supplies,church,...,,,,www.methodist.org.nz,6433665049,6433650844,80110,80240,0.866702,0.849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2990573,75438fc0-acf5-4184-8b8d-7472a6f6dbf7,da28b87e-742b-4b0a-ba9a-8a4d355b06f2,ngaruariki stream,oruariki stream,-34.424934,-34.425720,172.915512,172.918167,unspecified,unspecified,...,,,,,,,,,0.259159,0.966
2990574,127888a6-d9b0-4d18-9a7d-2f47be6ff3f4,d4e9a46f-bf83-4b5d-910f-b1fb47861d5a,ngatoru stream,te huka stream,-34.424644,-34.440441,172.926285,172.927353,unspecified,unspecified,...,,,,,,,,,1.759174,0.666
2990575,7ccba943-6015-4912-84a6-7dfa59cd9fa2,6db7dd21-bb72-4fb1-b5b6-98ecc8348339,waiparariki stream,akura stream,-34.446896,-34.433472,172.946640,172.948074,unspecified,unspecified,...,,,,,,,,,1.498652,0.676
2990576,68f5fbe1-3f81-44d7-9535-7a8f7f9b35af,babf5cd5-bbf3-4d2c-8b18-297ebe1fdf0b,waitangi stream,matakana stream,-34.427818,-34.438946,172.965240,172.967224,unspecified,unspecified,...,,,,,,,,,1.250411,0.619


In [13]:
# Persist the pair table with its similarity feature as a zstd-compressed parquet file.
# NOTE(review): the output directory is an absolute path rather than a config
# value like the input locations - consider moving it into Config.
df_pairs.to_parquet(
    f"/workspace/clustering/outputs/df_pairs_sim_{config.country}.parquet",
    index=None,
    compression="zstd",
)