In [174]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

from geopy.geocoders import Nominatim

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer

import Levenshtein as lev
import math
from collections import Counter

from pickle import dump, load
import time
from sklearn.neighbors import BallTree


import itertools
from tqdm.auto import tqdm
tqdm.pandas()
import gc

from fuzzywuzzy import fuzz

In [175]:
start_time = time.time()

In [176]:
#pairs = pd.read_pickle('../input/fourpoints-location-matching/train_pairs_raw.pkl')
pairs = pd.read_csv('../input/foursquare-location-matching/pairs.csv')

In [177]:
#pairs = pairs.sample(frac=0.1, random_state=1, ignore_index=True)
gc.collect()

188

In [178]:
pairs.shape

(578907, 25)

In [179]:
# missing values generate new cols
missing_list = ['url_1','url_2','phone_1','phone_2','address_1','address_2','city_1','city_2','zip_1','zip_2']
for col in tqdm(missing_list):
    pairs[f"{col}_missing"] = pairs[col].notnull().astype('int8')

  0%|          | 0/10 [00:00<?, ?it/s]

In [180]:
def count_occurance(df, cols_1,cols_2):
    for i in tqdm(range(len(cols_1))):
        df[f"{cols_1[i]}_count"] = df[cols_1[i]].map(df[cols_1[i]].dropna().value_counts().to_dict())
        df[f"{cols_2[i]}_count"] = df[cols_2[i]].map(df[cols_2[i]].dropna().value_counts().to_dict())
        df[f"{cols_1[i]}_count_diff"] = (df[f"{cols_2[i]}_count"] - df[f"{cols_1[i]}_count"])
    return df

In [181]:
count_cols_1 = ['country_1','city_1','state_1','categories_1']
count_cols_2 = ['country_2','city_2','state_2','categories_2']
pairs = count_occurance(pairs, count_cols_1, count_cols_2)

  0%|          | 0/4 [00:00<?, ?it/s]

In [182]:
def numeric_group_counts(df,cols_1,cols_2):
    # cols should be [lat,lon]
    for i in tqdm(range(len(cols_1))):
        group_1 = pd.cut(df[cols_1[i]], 180)
        df[f"{cols_1[i]}_count"] = group_1.map(group_1.value_counts().to_dict())
        group_2 = pd.cut(df[cols_1[i]], 180)
        df[f"{cols_2[i]}_count"] = group_2.map(group_2.value_counts().to_dict())
        df[f"{cols_2[i]}_count_diff"] = (df[f"{cols_1[i]}_count"] - df[f"{cols_2[i]}_count"])
    return df

In [183]:
num_group_count_1 = ['latitude_1','longitude_1']
num_group_count_2 = ['latitude_2','longitude_2']
pairs = numeric_group_counts(pairs, num_group_count_1, num_group_count_2)

  0%|          | 0/2 [00:00<?, ?it/s]

In [184]:
# impute missing values
cat_col = pairs.select_dtypes(include = ['object']).columns
pairs[cat_col] = pairs[cat_col].fillna('')

Data preprocessing & Feature transformation:
1. location (latitude, longtitude): finding the distance between two variables
2. word preprocessing: remove url symbol, stop words removal

In [185]:
# 1. location
def distance(lat1, lon1, lat2, lon2):
    R = 6373.0
    d_lon = lon2 - lon1
    d_lat = lat2 - lat1
    a = (np.sin(d_lat/2)) ** 2 + np.cos(lat1) * np.cos(lat2) * (np.sin(d_lon/2)) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distance = R * c
    return distance

# lowercase
def lower(df, cols):
    for col in tqdm(cols):
        df[col] = df[col].progress_apply(lambda x: x.lower())
    return df

# number removing
def num_remove(df, cols):
    for col in tqdm(cols):
        df[col] = df[col].progress_apply(lambda x: re.sub(r'\d+', '', x))
    return df

# punctuation removal
def punc_remove(df,cols):
    for col in tqdm(cols):
        df[col] = df[col].progress_apply(lambda x: re.sub('['+string.punctuation+']', ' ', x))
    return df

# white spaces removal
def space_remove(df,cols):
    for col in tqdm(cols):
        df[col] = df[col].progress_apply(lambda x: x.strip()) # remove front and end space
        df[col] = df[col].str.replace('\s+', ' ', regex=True) # remove double space
    return df

def preprocess(df,cols):
    df[cols] = num_remove(df,cols)[cols]
    df[cols] = punc_remove(df,cols)[cols]
    df[cols] = space_remove(df,cols)[cols]
    return df

# remove url
def remove_URL(df,cols):
    # cols = ["url_1","url_2"]
    df[cols] = df[cols].fillna('')
    for i in tqdm(cols):
        df[i] = df[i].str.replace('http', '')
        df[i] = df[i].str.replace('https', '')
        df[i] = df[i].str.replace('www', '')
        #df[i] = df[i].progress_apply(lambda x: re.sub('\W', "", x))
    return df

# stop words removal

def list_to_string(lis):
    string = ''
    for i in tqdm(lis):
        string += i
        string += ' '
    return string[:-1]

def stop(string):
    stops = set(stopwords.words('english'))
    tokens = word_tokenize(string)
    result = [i for i in tqdm(tokens) if not i in stops]
    return result
    
def stop_remove(df,cols):
    stops = set(stopwords.words('english'))
    for col in tqdm(cols):
        df[col] = df[col].progress_apply(lambda x: ' '.join([word for word in x.split() if word not in stops]))
    return df

In [186]:
lowercase_cols = ['name_1','address_1','city_1','state_1','url_1','categories_1','name_2','address_2','city_2','state_2','url_2','categories_2']
preprocess_cols = ['name_1','address_1','name_2','address_2','url_1','url_2','categories_1','categories_2']
url_columns = ['url_1','url_2']
pairs['distance'] = distance(pairs.latitude_1,pairs.longitude_1,pairs.latitude_2,pairs.longitude_2)

In [187]:
pairs[lowercase_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 578907 entries, 0 to 578906
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name_1        578907 non-null  object
 1   address_1     578907 non-null  object
 2   city_1        578907 non-null  object
 3   state_1       578907 non-null  object
 4   url_1         578907 non-null  object
 5   categories_1  578907 non-null  object
 6   name_2        578907 non-null  object
 7   address_2     578907 non-null  object
 8   city_2        578907 non-null  object
 9   state_2       578907 non-null  object
 10  url_2         578907 non-null  object
 11  categories_2  578907 non-null  object
dtypes: object(12)
memory usage: 53.0+ MB


In [188]:
pairs[lowercase_cols] = lower(pairs, lowercase_cols)[lowercase_cols]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

In [189]:
pairs[preprocess_cols] = preprocess(pairs, preprocess_cols)[preprocess_cols]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

In [190]:
pairs[url_columns] = remove_URL(pairs, url_columns)[url_columns]

  0%|          | 0/2 [00:00<?, ?it/s]

In [191]:
pairs[preprocess_cols] = stop_remove(pairs, preprocess_cols)[preprocess_cols]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

  0%|          | 0/578907 [00:00<?, ?it/s]

Count frequency

# text similarity: fuzzy, cosine similarity

def fuzzy_similarity(df, cols_1, cols_2):
    # length for cols_1 and cols_2 must be the same.
    for i in tqdm(range(len(cols_1))):
        df[f"{cols_1[i]}_fuzzy"] = df.progress_apply(lambda x: lev.ratio(x[cols_1[i]],x[cols_2[i]]), axis = 1)
        df[f"{cols_1[i]}_fuzzy_partial"] = df.progress_apply(lambda x: fuzz.partial_ratio(x[cols_1[i]],x[cols_2[i]]), axis = 1)
        df[f"{cols_1[i]}_fuzzy_set"] = df.progress_apply(lambda x: fuzz.token_set_ratio(x[cols_1[i]],x[cols_2[i]]), axis = 1)
    return df    
col_1 = ['name_1','address_1','categories_1','id_1']
col_2 = ['name_2','address_2','categories_2','id_2']
pairs = fuzzy_similarity(pairs, col_1, col_2)

In [193]:
gc.collect()

140

!pip install -U sentence_transformers
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('stsb-roberta-large')

col_1 = ['name_1','address_1','categories_1']
col_2 = ['name_2','address_2','categories_2']
def sen_tran_pca(text_list):
    model = SentenceTransformer('stsb-roberta-large')
    #model = SentenceTransformer('all-MiniLM-L6-v2')
    start_time = time.time()
    #pool = model.start_multi_process_pool()
    #embedding1 = model.encode_multi_process(text_list_1, pool, batch_size =1024)
    embedding = model.encode(text_list, convert_to_tensor=True)
    total_similarity = util.cos_sim(embeddings1, embeddings2)
    #model.stop_multi_process_pool(pool)
    return np.array(total_similarity).diagonal()

def cos_similarity(df, cols_1, cols_2):
    # length for cols_1 and cols_2 must be the same.
    for i in tqdm(range(len(cols_1))):
        df[f"{cols_1[i]}_cos_sim"] = cos_sim(df[cols_1[i]].tolist(),df[cols_2[i]].tolist())
    return df

!pip install -U sentence_transformers
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('stsb-roberta-large')
from sklearn.metrics.pairwise import cosine_similarity

col_1 = ['name_1','address_1','categories_1']
col_2 = ['name_2','address_2','categories_2']
def cos_sim(text_list_1,text_list_2):
    model = SentenceTransformer('stsb-roberta-large')
    #model = SentenceTransformer('all-MiniLM-L6-v2')
    start_time = time.time()
    pool = model.start_multi_process_pool()
    embedding1 = model.encode_multi_process(text_list_1, pool, batch_size =1024)
    embedding2 = model.encode(text_list_2, convert_to_tensor=True)
    total_similarity = util.cos_sim(embeddings1, embeddings2)
    model.stop_multi_process_pool(pool)
    return np.array(total_similarity).diagonal()

def cos_similarity(df, cols_1, cols_2):
    # length for cols_1 and cols_2 must be the same.
    for i in tqdm(range(len(cols_1))):
        df[f"{cols_1[i]}_cos_sim"] = cos_sim(df[cols_1[i]].tolist(),df[cols_2[i]].tolist())
    return df

from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

# Two lists of sentences
sentences1 = ['The cat sits outside','The cat sits outside',
             'A man is playing guitar',
             'The new movie is awesome','The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=False)
'''
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
#Compute cosine-similarities
cosine_scores = util.cos_sim(embeddings1, embeddings2)

#Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))
'''
embeddings1.shape

pairs = cos_similarity(pairs, col_1, col_2)

In [195]:
word_cols = ['name_1','name_2','address_1','address_2','url_1','url_2','categories_1','categories_2']
id_cols = ['id_1','id_2']
pairs[pairs.columns.difference(word_cols)].to_pickle('./train_pairs.pkl')

In [196]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 229.2176079750061 seconds ---


pairs_sample = pairs.iloc[:2,:]
dtype_dict = pairs_sample.dtypes.apply(lambda x: x.name).to_dict()
del pairs_sample
dtype_dict

In [197]:
word_cols = ['name_1','name_2','address_1','address_2','url_1','url_2','categories_1','categories_2']
word_pairs = pairs[word_cols]
word_pairs.to_pickle('./train_pairs_word.pkl')

In [198]:
pairs.head()

Unnamed: 0,id_1,name_1,latitude_1,longitude_1,address_1,city_1,state_1,zip_1,country_1,url_1,...,latitude_1_count,latitude_2_count,latitude_2_count_diff,longitude_1_count,longitude_2_count,longitude_2_count_diff,distance,name_1_fuzzy,name_1_fuzzy_partial,name_1_fuzzy_set
0,E_000001272c6c5d,café stad oudenaarde,50.859975,3.634196,abdijstraat,nederename,oost-vlaanderen,9700.0,BE,,...,19851,19851,0,12096,12096,0,60.031344,0.857143,80,100
1,E_000008a8ba4f48,turkcell,37.84451,27.844202,adnan menderes bulvarı,,,,TR,,...,25084,25084,0,24622,24622,0,42.240619,1.0,100,100
2,E_000023d8f4be44,island spa,14.51897,121.018702,th flr newport mall resorts world manila,pasay city,metro manila,,PH,,...,11435,11435,0,12247,12247,0,4.874195,0.714286,100,100
3,E_00007dcd2bb53f,togo sandwiches,38.257797,-122.064599,holiday ln ste b,fairfield,ca,94534.0,US,locations togos com us ca fairfield holiday ln...,...,19488,19488,0,3242,3242,0,0.901924,0.421053,100,100
4,E_0000c362229d93,coffee cat,7.082218,125.610244,f torres st,davao city,davao region,8000.0,PH,,...,1261,1261,0,8614,8614,0,39.890431,0.947368,89,63


In [199]:
word_pairs.head()

Unnamed: 0,name_1,name_2,address_1,address_2,url_1,url_2,categories_1,categories_2
0,café stad oudenaarde,café oudenaarde,abdijstraat,,,,bars,bars
1,turkcell,turkcell,adnan menderes bulvarı,batı aydın,,,mobile phone shops,electronics stores
2,island spa,island spa theater,th flr newport mall resorts world manila,,,,spas,spas
3,togo sandwiches,togo,holiday ln ste b,,locations togos com us ca fairfield holiday ln...,,sandwich places,sandwich places
4,coffee cat,coffeecat,f torres st,e jacinto extension,,,cafés coffee shops,coffee shops cafés dessert shops


# download data
<a href="train_pairs.csv"> train_pairs </a>

<a href="./train_pairs.pkl"> train_pairs pickle </a>

# Appendix

read large data: https://www.kaggle.com/code/rohanrao/tutorial-on-reading-large-datasets/notebook

read data faster: https://towardsdatascience.com/%EF%B8%8F-load-the-same-csv-file-10x-times-faster-and-with-10x-less-memory-%EF%B8%8F-e93b485086c7