In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

from geopy.geocoders import Nominatim

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer

import Levenshtein as lev
import math
from collections import Counter

from pickle import dump, load
import time
from sklearn.neighbors import BallTree


import itertools
from tqdm.auto import tqdm
tqdm.pandas()
import gc


In [2]:
# Wall-clock reference point; the elapsed time since this line is printed near the end of the notebook.
start_time = time.time()

In [None]:
# Load the pre-built pairs frame from the attached pickle file.
# NOTE(review): unpickling can execute arbitrary code — only read pickles from a trusted source.
pairs = pd.read_pickle('../input/fourpoints-location-matching/train_pairs_raw.pkl')

In [None]:
# Keep a 10% random subsample (fixed seed, so the draw is repeatable) and renumber
# the rows 0..n-1; then ask the garbage collector to reclaim the discarded rows.
pairs = pairs.sample(frac=0.1, random_state=1, ignore_index=True)
gc.collect()

In [None]:
# Row/column count after subsampling.
pairs.shape

In [None]:
# Read the competition's pairs.csv and keep only its first two rows; the frame is
# used below purely as a template for column dtypes. The whole file is parsed, so the
# dtypes reflect what pandas inferred from every row, not just from these two.
pairs_sample = pd.read_csv('../input/foursquare-location-matching/pairs.csv').iloc[0:2,:]

In [None]:
# change original data type
# Drop the 'index' column, then cast `pairs` to the dtypes captured in `pairs_sample`.
# NOTE(review): this cell is not re-runnable — both `del` statements fail on a second execution.
del pairs['index']
# Map column name -> dtype name (a plain string) so the dict can be passed to astype().
dtype_dict = pairs_sample.dtypes.apply(lambda x: x.name).to_dict()
del pairs_sample
gc.collect()
# NOTE(review): presumably `pairs` has every column named in dtype_dict — confirm, astype() rejects unknown keys.
pairs = pairs.astype(dtype_dict)

Reduce memory

In [None]:
def reduce_mem_usage(df, allow_float16=False):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * ``object`` columns become ``category``.
    * Integer columns (signed or unsigned) move to the narrowest signed integer
      type whose range, bounds included, holds the column's min and max.
    * Float columns move to ``float32`` when their range fits. ``float16`` is
      only considered when ``allow_float16`` is true: a range check says nothing
      about precision, and half precision keeps roughly three significant
      digits, which silently corrupts values such as coordinates.
    * Every other dtype (bool, datetime, category, pandas extension dtypes) is
      left untouched, and a column is never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    allow_float16 : bool, default False
        Opt in to half precision for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float32, np.float64)
    if allow_float16:
        float_candidates = (np.float16,) + float_candidates

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, strings, ...) are not plain
        # numpy dtypes; min()/astype on them can fail or change meaning, so skip.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            candidates, limits_of = int_candidates, np.iinfo
        elif np.issubdtype(col_type, np.floating):
            candidates, limits_of = float_candidates, np.finfo
        else:
            # bool, datetime64, timedelta64, ...: nothing sensible to downcast to.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered by size; once one is not smaller than the
            # current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons, so
            # such a column keeps its dtype.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Shrink column dtypes; reduce_mem_usage edits the frame in place and returns that same frame.
pairs = reduce_mem_usage(pairs)

In [None]:
#train = pd.read_csv('../input/foursquare-location-matching/train.csv')
#train = reduce_mem_usage(train)

In [None]:
pairs.info()

# float16 has an 11-bit significand: for magnitudes of 64 and above, neighbouring
# values are at least 0.0625 degrees apart (several kilometres on the ground), which
# would wipe out the distance signal between two nearby places. float32 is the
# smallest float dtype that keeps coordinates usable.
for col in tqdm(['latitude_1', 'longitude_1', 'latitude_2', 'longitude_2']):
    pairs[col] = pairs[col].astype('float32')

In [None]:
# Text columns of both sides of each pair. Cast them to plain object dtype: the
# imputation cell further down selects the columns to fill by object dtype.
word_columns = ['name_1','address_1','city_1','state_1','url_1','categories_1','name_2','address_2','city_2','state_2','url_2','categories_2']
for col in tqdm(word_columns):
    pairs[col] = pairs[col].astype('object')

In [None]:
# Inspect dtypes and memory again after the casts.
pairs.info()

In [None]:
# missing values generate new cols
# "<col>_missing" is 1 when the source value is absent and 0 when it is present, so
# the column name matches its content. (notnull() produced the opposite encoding;
# any consumer that relied on that inverted meaning has to be flipped as well.)
missing_list = ['url_1','url_2','phone_1','phone_2','address_1','address_2','city_1','city_2','zip_1','zip_2']
for col in tqdm(missing_list):
    pairs[f"{col}_missing"] = pairs[col].isna().astype('int8')

In [None]:
# impute missing values
# Fill NaN with '' in every object-dtype column so the string helpers defined below
# (which call str methods on each value) never receive a float NaN.
cat_col = pairs.select_dtypes(include = ['object']).columns
pairs[cat_col] = pairs[cat_col].fillna('')

Data preprocessing & feature transformation:
1. Location (latitude, longitude): compute the distance between the two points of each pair.
2. Text preprocessing: lowercase, remove digits, punctuation and URL prefixes/symbols, then remove stop words.

In [None]:
# 1. location
def distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres between two points.

    Inputs are latitude/longitude in **degrees**; they may be scalars, numpy
    arrays or pandas Series (element-wise, same-shaped result). The trigonometric
    functions need radians, so the coordinates are converted first — feeding
    degrees straight into sin/cos yields meaningless distances.

    Array-like inputs are promoted to float64 before any arithmetic so that
    low-precision storage dtypes (e.g. float16) do not add further rounding.
    """
    def _radians(deg):
        # Series / ndarrays / numpy scalars expose astype; plain Python numbers do not.
        if hasattr(deg, 'astype'):
            deg = deg.astype(np.float64)
        return np.radians(deg)

    R = 6373.0  # Earth radius in km used by the original computation
    phi1, lam1, phi2, lam2 = (_radians(v) for v in (lat1, lon1, lat2, lon2))

    d_lon = lam2 - lam1
    d_lat = phi2 - phi1
    a = np.sin(d_lat / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lon / 2) ** 2
    # Rounding can push `a` a hair outside [0, 1]; clamp so the square roots stay real.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# lowercase
def lower(df, cols):
    """Lower-case the text in each column of ``cols`` in place and return ``df``.

    Uses the vectorised ``.str`` accessor rather than a per-row Python lambda, so
    no progress bar is needed and a missing value stays missing instead of raising
    ``AttributeError``. Non-string entries in an object column become NaN.
    """
    for col in cols:
        df[col] = df[col].str.lower()
    return df

# number removing
def num_remove(df, cols):
    """Delete every run of digits from the text in ``cols`` (in place); return ``df``.

    The pattern is applied with an explicit ``regex=True`` through the vectorised
    ``.str`` accessor; missing values are passed through untouched instead of
    raising inside ``re.sub``.
    """
    for col in cols:
        df[col] = df[col].str.replace(r'\d+', '', regex=True)
    return df

# punctuation removal
def punc_remove(df,cols):
    """Strip all ``string.punctuation`` characters from the text in ``cols``.

    The translation table is built once (it was previously rebuilt for every row)
    and applied with the vectorised ``Series.str.translate``. ``df`` is modified
    in place and returned; missing values stay missing.
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)
    for col in cols:
        df[col] = df[col].str.translate(drop_punctuation)
    return df

# white spaces removal
def space_remove(df,cols):
    """Trim leading and trailing whitespace from the text in ``cols`` in place.

    Only the ends of each string are trimmed; interior whitespace is kept.
    Vectorised through ``.str.strip()``, so missing values stay missing instead
    of raising. Returns ``df``.
    """
    for col in cols:
        df[col] = df[col].str.strip()
    return df

def preprocess(df,cols):
    """Apply the digit, punctuation and whitespace clean-ups to ``cols``, in that order.

    Each step receives the frame, and its result for ``cols`` is written back
    before the next step runs. Returns ``df``.
    """
    for clean_step in (num_remove, punc_remove, space_remove):
        cleaned = clean_step(df, cols)
        df[cols] = cleaned[cols]
    return df

# remove url
def remove_URL(df,cols):
    """Strip URL boilerplate from the columns in ``cols`` (in place); return ``df``.

    Missing values are first replaced by ``''``. Then, for every column:

    1. every occurrence of ``http``/``https`` (with an optional ``:`` or ``://``)
       and of ``www`` (with an optional literal dot) is removed, wherever it
       appears in the string;
    2. every remaining non-word character (``\\W``) is removed.

    Step 1 is one explicit regular expression instead of eight chained
    ``str.replace`` calls. That chain stripped ``http`` before ``https`` (leaving
    a stray ``s``) and depended on the pandas-version-specific default of the
    ``regex`` argument, under which ``'www.'`` could swallow an arbitrary
    character after ``www``.
    """
    # cols = ["url_1","url_2"]
    scheme_or_www = r'https?(?:://|:)?|www\.?'
    df[cols] = df[cols].fillna('')
    for col in cols:
        without_prefix = df[col].str.replace(scheme_or_www, '', regex=True)
        df[col] = without_prefix.str.replace(r'\W', '', regex=True)
    return df

# stop words removal

def list_to_string(lis):
    """Join the strings in ``lis`` with single spaces; an empty input gives ``''``.

    ``str.join`` replaces the manual ``+=`` loop (quadratic on long inputs) and
    the progress bar that was created on every call.
    """
    return ' '.join(lis)

def stop(string):
    """Tokenise ``string`` with NLTK and return the tokens that are not English stop words.

    The result is a list of tokens in their original order. The tokens are
    iterated directly: wrapping them in ``tqdm`` created a new progress bar on
    every call, which floods the output when the function is applied row by row.
    """
    stops = set(stopwords.words('english'))
    return [token for token in word_tokenize(string) if token not in stops]
    
def stop_remove(df,cols):
    """Drop English stop words from the whitespace-separated text in ``cols``.

    Each value is split on whitespace, words found in NLTK's English stop-word
    list are discarded, and the survivors are re-joined with single spaces.
    ``df`` is updated in place and returned.
    """
    english_stops = set(stopwords.words('english'))

    def _drop_stops(text):
        kept = []
        for token in text.split():
            if token not in english_stops:
                kept.append(token)
        return ' '.join(kept)

    for name in tqdm(cols):
        df[name] = df[name].progress_apply(_drop_stops)
    return df

In [None]:
# Column groups for the text pipeline: which columns get lower-cased, which get the
# digit/punctuation/whitespace clean-up, and which hold URLs.
lowercase_cols = ['name_1','address_1','city_1','state_1','url_1','categories_1','name_2','address_2','city_2','state_2','url_2','categories_2']
preprocess_cols = ['name_1','address_1','name_2','address_2','url_1','url_2']
url_columns = ['url_1','url_2']
# New feature: distance between the two coordinates of each pair, as computed by distance().
pairs['distance'] = distance(pairs.latitude_1,pairs.longitude_1,pairs.latitude_2,pairs.longitude_2)

In [None]:
# Check dtypes and non-null counts of the text columns before transforming them.
pairs[lowercase_cols].info()

In [None]:
# lower() updates `pairs` in place and returns it; the assignment writes the same columns back.
pairs[lowercase_cols] = lower(pairs, lowercase_cols)[lowercase_cols]

In [None]:
# Remove digits, punctuation and surrounding whitespace from names, addresses and URLs.
# The URL columns are included, so their punctuation ('://', dots) is already gone
# before remove_URL runs in the next cell.
pairs[preprocess_cols] = preprocess(pairs, preprocess_cols)[preprocess_cols]

In [None]:
# Strip 'http'/'https'/'www' fragments and any remaining non-word characters from the URL columns.
pairs[url_columns] = remove_URL(pairs, url_columns)[url_columns]

In [None]:
# Drop English stop words from the cleaned text columns.
pairs[preprocess_cols] = stop_remove(pairs, preprocess_cols)[preprocess_cols]

Text similarity features: fuzzy (Levenshtein ratio) and embedding cosine similarity

In [None]:
def fuzzy_similarity(df, cols_1, cols_2):
    """Add a Levenshtein-ratio column for each (cols_1[i], cols_2[i]) column pair.

    For every pair the ratio of the two strings in each row is stored in a new
    column named ``"<cols_1[i]>_fuzzy"``. ``df`` is modified in place and returned.

    The two columns are walked in lockstep with ``zip``; ``df.apply(axis=1)``
    materialises a Series for every row, which is far slower on large frames.

    Raises
    ------
    ValueError
        If ``cols_1`` and ``cols_2`` differ in length.
    """
    if len(cols_1) != len(cols_2):
        raise ValueError("cols_1 and cols_2 must have the same length")

    for left, right in tqdm(list(zip(cols_1, cols_2))):
        row_pairs = zip(df[left], df[right])
        df[f"{left}_fuzzy"] = [
            lev.ratio(text_a, text_b)
            for text_a, text_b in tqdm(row_pairs, total=len(df), leave=False)
        ]
    return df

In [None]:
# Reclaim memory before computing the similarity features.
gc.collect()

In [None]:
# Column pairs to compare: col_1[i] (first place) against col_2[i] (second place).
col_1 = ['name_1','address_1','categories_1']
col_2 = ['name_2','address_2','categories_2']
# Adds 'name_1_fuzzy', 'address_1_fuzzy' and 'categories_1_fuzzy' to `pairs`.
pairs = fuzzy_similarity(pairs, col_1, col_2)

In [None]:
!pip install -U sentence_transformers
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('stsb-roberta-large')
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
col_1 = ['name_1','address_1','categories_1']
col_2 = ['name_2','address_2','categories_2']
def cos_sim(text_list_1, text_list_2, model=None, batch_size=1024):
    """Row-wise cosine similarity between sentence embeddings of two text lists.

    ``text_list_1[i]`` is compared with ``text_list_2[i]``; the result is a 1-D
    numpy array with one score per position.

    Parameters
    ----------
    text_list_1, text_list_2 : list of str
        Equal-length lists of texts.
    model : SentenceTransformer, optional
        Encoder to use. When omitted, ``'stsb-roberta-large'`` is loaded; pass an
        already-loaded model to avoid reloading it on every call.
    batch_size : int, default 1024
        Batch size handed to the encoder.

    Raises
    ------
    ValueError
        If the two lists differ in length.

    Notes
    -----
    Both lists are encoded through the same multi-process pool, which is stopped
    even if encoding fails. The similarity is computed per row directly: building
    the full N x N similarity matrix just to read its diagonal needs memory
    quadratic in the number of pairs.
    """
    if len(text_list_1) != len(text_list_2):
        raise ValueError("text_list_1 and text_list_2 must have the same length")
    if model is None:
        model = SentenceTransformer('stsb-roberta-large')
    #model = SentenceTransformer('all-MiniLM-L6-v2')

    pool = model.start_multi_process_pool()
    try:
        embeddings_1 = model.encode_multi_process(text_list_1, pool, batch_size=batch_size)
        embeddings_2 = model.encode_multi_process(text_list_2, pool, batch_size=batch_size)
    finally:
        model.stop_multi_process_pool(pool)

    embeddings_1 = np.asarray(embeddings_1)
    embeddings_2 = np.asarray(embeddings_2)
    dot_products = np.einsum('ij,ij->i', embeddings_1, embeddings_2)
    norm_products = np.linalg.norm(embeddings_1, axis=1) * np.linalg.norm(embeddings_2, axis=1)
    # Guard against a zero-length embedding; its score becomes 0 rather than NaN.
    return dot_products / np.maximum(norm_products, 1e-12)

def cos_similarity(df, cols_1, cols_2):
    """Add an embedding cosine-similarity column for each (cols_1[i], cols_2[i]) pair.

    ``cols_1`` and ``cols_2`` must have the same length. The scores returned by
    ``cos_sim`` for pair ``i`` are stored in ``"<cols_1[i]>_cos_sim"``. ``df`` is
    modified in place and returned.
    """
    for position, left_col in enumerate(tqdm(cols_1)):
        right_col = cols_2[position]
        left_texts = df[left_col].tolist()
        right_texts = df[right_col].tolist()
        df[f"{left_col}_cos_sim"] = cos_sim(left_texts, right_texts)
    return df

In [None]:
# Scratch demo of the sentence-transformers API on a few toy sentences.
# NOTE(review): this cell rebinds the global `model` to 'all-MiniLM-L6-v2' and creates a
# global `embeddings1`; both stay in the kernel namespace for every later cell.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

# Two lists of sentences
# (sentences1 has seven entries, sentences2 has three)
sentences1 = ['The cat sits outside','The cat sits outside',
             'A man is playing guitar',
             'The new movie is awesome','The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=False)
# The triple-quoted block below is a bare string literal, i.e. disabled code: nothing in it runs.
'''
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
#Compute cosine-similarities
cosine_scores = util.cos_sim(embeddings1, embeddings2)

#Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))
'''
# Displayed as the cell output: (number of sentences, embedding width).
embeddings1.shape

In [None]:
# Adds 'name_1_cos_sim', 'address_1_cos_sim' and 'categories_1_cos_sim' to `pairs`.
pairs = cos_similarity(pairs, col_1, col_2)

In [None]:
# Persist the feature frame twice in the working directory: as CSV (no index column)
# and as a pickle.
pairs.to_csv("train_pairs.csv", index=False)
pairs.to_pickle('./train_pairs.pkl')

In [None]:
# Total wall-clock time elapsed since `start_time` was set.
print("--- %s seconds ---" % (time.time() - start_time))

# Collect the final column dtypes as a {column name: dtype name} dict, read from a
# two-row slice of `pairs`; the dict is shown as the cell output.
pairs_sample = pairs.iloc[:2,:]
dtype_dict = pairs_sample.dtypes.apply(lambda x: x.name).to_dict()
del pairs_sample
dtype_dict

In [None]:
# Preview the first rows of the finished feature frame.
pairs.head()

# download data
<a href="train_pairs.csv"> train_pairs </a>

<a href="./train_pairs.pkl"> train_pairs pickle </a>

# Appendix

read large data: https://www.kaggle.com/code/rohanrao/tutorial-on-reading-large-datasets/notebook

read data faster: https://towardsdatascience.com/%EF%B8%8F-load-the-same-csv-file-10x-times-faster-and-with-10x-less-memory-%EF%B8%8F-e93b485086c7