In [1]:
import os
import gc
import re
import sys
import math
import json
import time
import random
import joblib
import pickle
import warnings
import difflib
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
from glob import glob
from pathlib import Path
from unidecode import unidecode
from collections import Counter
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.metrics.pairwise import haversine_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, StratifiedGroupKFold
# from transformers import AutoTokenizer, TFAutoModel, AutoConfig

def reduce_mem_usage(df, silent=False, allow_categorical=True, float_dtype="float32"):
    """
    Downcast the columns of a DataFrame (or a single Series) to reduce memory usage.

    Integer-valued numeric data is cast to the smallest integer dtype that holds it,
    remaining floats are cast to `float_dtype`. Sparse and datetime data are left alone.

    Parameters
    ----------
    df : pd.DataFrame or pd.Series
        Data to shrink. A DataFrame is modified in place (and also returned).
    silent : bool
        When falsy, print the memory usage before/after.
    allow_categorical : bool
        When True, non-numeric columns are kept as they are. When False they are
        factorized into integer codes (the original labels are discarded).
    float_dtype : str or dtype
        Target dtype for floating point columns.

    Returns
    -------
    pd.DataFrame or pd.Series
        The downcast data.
    """
    def _downcast_numeric(series, allow_categorical=allow_categorical):
        """
        Downcast a numeric series into either the smallest possible int dtype or a specified float dtype.
        """
        dtype = series.dtype
        # `pd.api.types.is_sparse` is deprecated; check the dtype class directly.
        if isinstance(dtype, pd.SparseDtype):
            return series
        if not pd.api.types.is_numeric_dtype(dtype):
            if pd.api.types.is_datetime64_any_dtype(dtype) or allow_categorical:
                return series
            codes, _ = series.factorize()
            # The codes are plain integers, so the recursive call takes the numeric path.
            return _downcast_numeric(pd.Series(data=codes, index=series.index, name=series.name))
        series = pd.to_numeric(series, downcast="integer")
        if pd.api.types.is_float_dtype(series.dtype):
            series = series.astype(float_dtype)
        return series

    if not silent:
        start_mem = np.sum(df.memory_usage()) / 1024 ** 2
        print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    if df.ndim == 1:
        df = _downcast_numeric(df)
    else:
        for col in df.columns:
            # `df[col] = ...` replaces the column (and its dtype). The previous
            # `df.loc[:, col] = ...` writes into the existing column on recent pandas,
            # which keeps the old dtype and silently undoes the downcast.
            df[col] = _downcast_numeric(df[col])
    if not silent:
        end_mem = np.sum(df.memory_usage()) / 1024 ** 2
        print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
        if start_mem > 0:
            print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

def shrink_mem_new_cols(matrix, oldcols=None, allow_categorical=True, silent=False):
    """
    Call reduce_mem_usage on the columns of `matrix` that have not been optimized yet.

    Parameters
    ----------
    matrix : pd.DataFrame
        Frame to shrink; its columns are replaced in place.
    oldcols : pd.Index or list, optional
        Columns that were already downcast and should be skipped. None means "all columns are new".
    allow_categorical, silent
        Forwarded to reduce_mem_usage.

    Returns
    -------
    (pd.DataFrame, pd.Index)
        The frame and its full column index, to be passed as `oldcols` next time.
    """
    if oldcols is not None:
        newcols = matrix.columns.difference(oldcols)
    else:
        newcols = matrix.columns
    reduced = reduce_mem_usage(matrix.loc[:, newcols].copy(), allow_categorical=allow_categorical,
                               silent=silent)
    # Assign column by column: `matrix.loc[:, newcols] = reduced` writes into the existing
    # columns on recent pandas and therefore keeps their original (wide) dtypes.
    for col in newcols:
        matrix[col] = reduced[col]
    oldcols = matrix.columns  # This is used to track which columns have already been downcast
    return matrix, oldcols

# Notebook-wide plotting / display defaults.
plt.rcParams.update({"font.size": 13})
warnings.filterwarnings('ignore')  # silences every warning for the rest of the session

sns.set_style("darkgrid")

for _option, _value in (('display.max_rows', 200), ('display.max_columns', 600)):
    pd.set_option(_option, _value)

## Config

In [2]:
# Central configuration for the whole notebook.
CFG = Namespace(
    # Seed passed to seed_everything() and to the debug sub-sample.
    seed = 42,
    # When True the data is sub-sampled to 10000 rows and each split's head is displayed.
    debug = False,
    validate = False,
    target = "point_of_interest",
#     model = '../input/xlmroberta/xlm-roberta-base/'
#     tokenizer = AutoTokenizer.from_pretrained(model)
    # Number of neighbours requested from every KNN search in recall_knn().
    n_neighbors = 20,
    # Number of chunks create_dataset() writes per dataset.
    n_splits = 10,
    # Output directory of the generated parquet files.
    train_path = 'train_dataset',
    # Columns for which add_distance_features() builds pairwise features.
    feat_columns =  ['name', 'address', 'city', 'state', 'zip', 'closest_city', 'main_categories',
                     'name_encoded', 'address_encoded', 'city_encoded', 'url',  'phone', 'categories', 
                     'country', 'latitude'],
    # Columns that get a TF-IDF matrix and a `<col>_sim` feature.
    vec_columns =   ['name', 'name_encoded', 'categories', 'address', 'city'],
    knn_columns =   ['name', 'address'],
    # Columns whose exact (lower-cased) values are matched in recall_simple().
    rec_columns =   ['name_clean', 'address_clean', 'phone_clean', 'categories', 'url_encoded'],
    # Columns that trigger the coordinate distance features instead of string features.
    dist_columns =  ['latitude'],
    # Features that are skipped while building and/or dropped before saving.
    bad_features =  ['name_len_diff', 'state_jaro', 'address_encoded_gesh', 'state_gesh', 
                     'city_encoded_nlcsk', 'categories_nlcsk', 'city_encoded_sim', 'state_leven',
                     'address_encoded_nlcsk', 'name_nleven', 'city_encoded_gesh',
                     'city_encoded_jaro', 'country_lcs', 'city_encoded_len_diff', 'state_nlcs', 'url_jaro',
                     'state_nleven','categories_len_diff', 'city_encoded_nleven', 'name_gesh',
                     'address_jaro', 'address_encoded_leven', 'closest_city_jaro', 'phone_lcs',
                     'name_nlcs', 'address_nleven', 'name_encoded_jaro', 'main_categories_gesh',
                     'address_encoded_nleven', 'city_nlcs', 'country_jaro', 'main_categories_nlcs', 
                     'country_gesh', 'name_encoded_leven', 'phone_leven', 'address_lcs', 
                     'country_leven', 'country_sim', 'city_nlcsk'],
    # Normalised features whose raw leven / lcs input must be computed even if that
    # raw feature itself is listed in bad_features.
    save_features = ['address_nlcs', 'address_nlcsk', 'name_encoded_nleven', 'name_lcsk', 
                     'name_nlcsk', 'city_nleven', 'closest_city_nleven', 'main_categories_nleven',
                     'address_encoded_nlcs', 'city_encoded_nlcs', 'url_nlcs']
)

def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and export PYTHONHASHSEED."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, before any sampling happens.
seed_everything(CFG.seed)

## Load the data

In [3]:
data_root = 'foursquare_location_matching'
train_csv_path = os.path.join(data_root, 'train.csv')
data = pd.read_csv(train_csv_path)

# Debug runs work on a fixed-seed random subset with a fresh 0..n-1 index.
if CFG.debug:
    data = data.sample(n=10000, random_state=CFG.seed).reset_index(drop=True)

## Transliterate text columns to ASCII

In [4]:
def decode(col):
    """Return `unidecode(col)`; NaN (the only value that differs from itself) stays NaN."""
    is_missing = col != col
    return np.nan if is_missing else unidecode(col)

# Add a `<col>_encoded` copy of every free-text column, passed through decode().
for _text_col in ('name', 'address', 'city', 'state', 'url'):
    data[f'{_text_col}_encoded'] = data[_text_col].apply(decode)

## Clean and sort some features

In [5]:
# Characters that clean() deletes from a string. ',', '.' and "'" are listed twice,
# which is harmless because each entry is simply replaced by the empty string.
symbols_to_replace = [',', '.', "'", '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', 
                      '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', 
                      ']', '^', '_', '`', '{', '|', '}', '~']

def sort_col(col):
    """Sort the ', '-separated items of `col` alphabetically; NaN stays NaN."""
    if col == col:
        items = col.split(', ')
        items.sort()
        return ', '.join(items)
    return np.nan

def clean(col):
    """Lower-case `col` and delete every entry of `symbols_to_replace`; NaN stays NaN."""
    if col == col:
        stripped = col.lower()
        for symbol in symbols_to_replace:
            if symbol in stripped:
                stripped = stripped.replace(symbol, '')
        return stripped
    return np.nan

# Normalised (lower-case, punctuation-free) copies of the fields used for candidate recall.
data['name_clean'] = data['name_encoded'].apply(clean)
data['address_clean'] = data['address_encoded'].apply(clean)
data['phone_clean'] = data['phone'].apply(clean)

# NOTE(review): clean() has already removed every ',' (it is in symbols_to_replace), and
# sort_col() splits on ', ', so the two calls below return name_clean / address_clean
# unchanged. Only `categories` is actually re-ordered. Confirm whether sorting the
# individual words was intended.
data['name_clean'] = data['name_clean'].apply(sort_col)
data['address_clean'] = data['address_clean'].apply(sort_col)
data['categories'] = data['categories'].apply(sort_col)

In [6]:
# import os
# import numpy as np
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import HashingVectorizer

# data_root = 'foursquare_location_matching'
# data = pd.read_csv(os.path.join(data_root, 'train.csv'))

# # vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(1,1))
# vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1,1))
# data['url_clean'] = data['url'].fillna('')
# X = vectorizer.fit_transform(data['url_clean'].values)
# feature_names = vectorizer.get_feature_names()

# sorted(zip(feature_names, np.asarray(X.sum(axis=0))[0]), key=lambda x: x[1], reverse=True)

## Add main category to the train dataset

In [7]:
# cat_freq = dict()
# Category tokens that get_main_category() ignores when picking a row's main category.
# Matching is exact and case-sensitive.
stop_words = ['/', '&', 'or', 'High', 'Miscellaneous', 'Fast', 'Other', 'Asian', 'Chinese', 'Event', 
              'Great', 'Noodle', 'Burger', 'Seafood', 'Breakfast', 'Ice', 'Diners', 'Cream', 'Indonesian', 
              'Thai', "Women's", 'Fried', 'Snack', 'Tea', 'Mexican', 'Nail', 'Sushi', 'Middle', 'Korean', 
              'Gift', 'Drink', 'Pet', 'Turkish', "Men's", 'Indian', 'Malay', 'Cocktail', 'Donut', 'Box', 
              'Condos)', 'Residential', 'Convenience', 'Gas', 'General', 'Bus', 'Pizza', 'Spaces', 'Mobile',
              'Phone', 'Academic', 'Japanese', 'Business', 'Shoe', 'Italian', 'American', 'Home', 'Auto', 
              'Furniture', 'Cosmetics', 'Sandwich', 'Dessert', 'Car', 'Arts', 'Financial', 'Legal', 'BBQ',
              'Hardware', 'Video', 'Music', 'Art', 'Student', 'Jewelry', 'Historic', 'Travel', 'Washes',
              'Beer', 'Arcades', 'Bike', 'Lookouts', 'Scenic', 'Rental', 'Accessories', 'Repairs', 'Discount', 
              'Optical', 'Bodegas', 'Big', 'Assisted', 'Living', 'Athletics', 'Agencies', 'Locations', 'Trails', 
              'Bed', 'Breakfasts', 'Wine', 'Real', 'Elementary', 'Theme', 'Golf', 'Rest',  'Photography', 
              'Nightlife', 'Courses', 'Convention', 'Eastern', 'Concert', 'Conference', 'Startups', 'Tech', 
              'Meeting', 'French', 'Supplies', 'Events', 'Sake', 'Dog', 'Ramen', 'City', 'Juice', 'Science',
              'Liquor', 'Lawyers', 'Insurance', 'Flower', 'Toy', 'Rentals', 'Paper', 'Flea', 'Bases', 'Baseball', 
              'Karaoke', 'Kids', 'Design', 'Farmers', 'Repair', 'Technology', 'Wards', 'Water', 'Supply', 
              'Filipino', 'Piers', 'Salad', 'Mattress', 'Print', 'Wings', 'Engineering', 'Non-Profits', 
              'Gastropubs', 'Bistros', 'Hot', 'Vietnamese', 'Hookah', 'Candy', 'Coffee', 'Electronics',
              'Department', 'Clothing', 'Trucks', 'Chicken', 'Movie', 'Health', 'Soccer', 'Crafts', 
              'Game', 'Community', 'Food', 'College', 'Sporting', 'Beauty', 'Ferries', 'Soup', 'Veterinarians', 
              'Basketball', 'Light', 'Rail', 'Taco', 'Classrooms', 'Shopping', 'Developments', 'Train', 'Performing',
              'Administrative', 'Lingerie', 'Dive', 'Storage', 'Office', 'Landscaping', 'Residence', 'Sports',
              'Goods', 'Dealerships', 'Grocery', 'Workshops', 'History'
             ]


def get_categories(category):
    """Map a category token to its canonical spelling; unknown tokens are returned unchanged."""
    canonical = {
        'Auto': 'Automotive',
        'Hotel': 'Hotels',
        'Motels': 'Hotels',
        'Hostels': 'Hotels',
        'Courthouses': 'Court',
        'College': 'Colleges',
        'Cafés': 'Cafes',
        "Doctor's": 'Medical',
        "Dentist's": 'Medical',
        "Doctors": 'Medical',
        '(Apartments': 'Apartments',
    }
    return canonical.get(category, category)

# Category -> frequency lookup used to pick the most common token of a category list.
cat_freq = pd.read_csv('foursquare_main_categories/cat_freq.csv', index_col='Unnamed: 0')
cat_freq_dict = {
    category: frequency
    for category, frequency in zip(cat_freq['category'], cat_freq['frequence'])
}

def get_main_category(category):
    """
    Pick the most frequent known token of a category string.

    The string is split on ', ' and ' '. Tokens in `stop_words` or ending in 'an' are
    skipped, the rest are canonicalised with get_categories() and looked up in
    `cat_freq_dict`. Returns NaN for NaN input or when no token has a positive frequency.
    """
    if category != category:
        return np.nan

    best_cat, best_freq = np.nan, 0
    for token in re.split(', | ', category):
        if token in stop_words or token.endswith('an'):
            continue
        token = get_categories(token)
        token_freq = cat_freq_dict.get(token, 0)
        if token_freq > best_freq:
            best_cat, best_freq = token, token_freq

    return best_cat

# One main category per row; NaN when the row has no categories or none of its tokens is in cat_freq_dict.
data['main_categories'] = data['categories'].apply(get_main_category)

## Add closest city

In [8]:
# states = pd.read_csv('states.csv', index_col='Unnamed: 0')
cities = (
    pd.read_csv('additional_data/cities.csv', encoding="ISO-8859-1")
      [['asciiname', 'latitude', 'longitude', 'country code']]
      .rename(columns={'asciiname': 'city', 'country code': 'country'})
)
# read_csv parses the literal string "NA" (the ISO code of Namibia) as a missing value.
# `data['country']` gets the same treatment (fillna('NA')) before the per-country lookup,
# so restore it here as well; otherwise that country can never match any city.
cities['country'] = cities['country'].fillna('NA')

# Maps the suffix of the generated `closest_<key>` column to its reference table.
geoname_dict = {'city': cities}

def fill_the_missing_data(args):#, df_dists):
    """
    Add a `closest_<key>` column for every reference table in `geoname_dict`.

    Parameters
    ----------
    args : tuple
        `(country, country_df)` as yielded by `DataFrame.groupby('country')`.

    Returns
    -------
    pd.DataFrame
        A copy of `country_df`. For every reference table that has at least one row for
        `country`, the copy gets a `closest_<key>` column holding the `<key>` value of the
        nearest reference row. Tables without rows for the country add no column.
    """
    country, country_df = args
    # Never write into the group handed in by the caller.
    country_df = country_df.copy()

    for c in tqdm(list(geoname_dict.keys())):
        geoname_df = geoname_dict[c]
        geoname_df = geoname_df[geoname_df['country'] == country]

        if len(country_df) == 0 or len(geoname_df) == 0:
            continue

        # Only the single nearest reference row is needed. Asking for one neighbour also
        # covers countries with exactly one reference row, which used to be skipped.
        # NOTE(review): sklearn's haversine metric expects [lat, lon] in radians. These
        # columns look like degrees (vectorized_haversine converts the same columns with
        # np.radians) -- confirm before changing, since it alters the chosen cities.
        knn = NearestNeighbors(n_neighbors=1, metric='haversine', n_jobs=-1)
        knn.fit(geoname_df[['latitude', 'longitude']])
        nearest = knn.kneighbors(country_df[['latitude', 'longitude']], return_distance=False)[:, 0]

        # One vectorised positional assignment instead of a Python loop of scalar .loc writes.
        country_df[f"closest_{c}"] = geoname_df[c].to_numpy()[nearest]

    return country_df
    
    
# Missing country codes (including "NA", which read_csv parses as NaN) become the string 'NA'.
data['country'] = data['country'].fillna('NA')
num_countries = data['country'].nunique()

processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    # Ordered `imap` keeps the groups in first-appearance order, so the row order of the
    # concatenated frame is the same on every run (imap_unordered made it depend on
    # worker scheduling).
    dfs = pool.imap(fill_the_missing_data, data.groupby('country', sort=False))
    dfs = list(tqdm(dfs, total=num_countries))

data = pd.concat(dfs).reset_index(drop=True)

# Only removes the name; the table stays alive through geoname_dict.
del cities

gc.collect()

  0%|          | 0/222 [00:00<?, ?it/s]

27

## Convert all object columns to lowercase

In [9]:
def to_lower(df):
    """
    Turn every column except id / longitude / latitude into lower-case strings.

    Missing values become the literal string 'unknown'. `df` is modified in place and returned.
    """
    untouched = ("id", "longitude", "latitude")
    for c in df.columns:
        if c in untouched:
            continue
        df[c] = df[c].fillna('unknown')
        df[c] = df[c].astype(str).str.lower()
    return df
    
# From here on a missing value is the string 'unknown'; recall_simple() and the string
# distance helpers test for exactly that placeholder.
data = to_lower(data)

## Add text column

In [10]:
# data['text'] = data['name'] + ['SEP'] + data['address'] + ['SEP']\
#                + data['city'] + ['SEP'] + data['state'] + ['SEP']\
#                + data['zip'] + ['SEP'] + data['main_categories'] + ['SEP']\
#                + data['url'] + ['SEP'] + data['phone'] + ['SEP']\
#                + data['country']

## Search Candidates Functions

In [11]:
def recall_simple(df, rec_columns, threshold):
    """
    Generate candidate pairs from exact (case-insensitive) value matches.

    For every row, count for every other row in how many of `rec_columns` both share the
    same value ('unknown' values never match). A pair is kept when that count is strictly
    greater than `threshold`. A row can be paired with itself.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'id' and all `rec_columns` (string columns).
    rec_columns : list of str
        Columns to match on.
    threshold : int
        Minimum number of shared columns, exclusive.

    Returns
    -------
    pd.DataFrame
        De-duplicated pairs with columns 'id' and 'match_id'.
    """
    # Per column: lower-cased value -> set of ids carrying that value.
    # Grouping by a derived key avoids writing into a slice of `df`.
    val2id_d = {
        col: df.groupby(df[col].str.lower())['id'].apply(set).to_dict()
        for col in rec_columns
    }

    cus_ids = []
    match_ids = []
    for vals in tqdm(df[rec_columns + ['id']].values):
        cus_id = vals[-1]

        # Number of columns each other id shares with the current row.
        shared = Counter()
        for col, val in zip(rec_columns, vals[:-1]):
            if val != 'unknown':
                shared.update(val2id_d[col][val.lower()])

        match_id = [other for other, n_shared in shared.items() if n_shared > threshold]

        cus_ids.extend([cus_id] * len(match_id))
        match_ids.extend(match_id)

    train_df = pd.DataFrame({'id': cus_ids, 'match_id': match_ids}).drop_duplicates()
    del cus_ids, match_ids

    num_data = len(train_df)
    print('Num of data: %s' % num_data)
    num_ids = train_df['id'].nunique()
    # No pair at all would otherwise raise ZeroDivisionError.
    num_data_per_id = num_data / num_ids if num_ids else 0.0
    print('Num of data per id: %s' % num_data_per_id)

    return train_df

def recall_knn(df, n_neighbors, label):
    """
    Generate candidate pairs with nearest-neighbour searches.

    Three searches are combined with outer merges on ('id', 'match_id'):
      * per country, neighbours by coordinates  -> kdist_country / kneighbors_country
      * per country, neighbours by cosine distance of character 3-gram vectors of
        `name_clean`                            -> kdist_name_country / kneighbors_name_country
      * whole frame, neighbours by coordinates  -> kdist / kneighbors

    Parameters
    ----------
    df : pd.DataFrame
        Needs 'id', 'country', 'latitude', 'longitude' and 'name_clean'.
    n_neighbors : int
        Neighbours per search (capped at the country size for the per-country searches).
    label : str
        Returned unchanged.

    Returns
    -------
    (pd.DataFrame, str)
        The candidate pairs and `label`.
    """
    print(80*'=')
    print('Start KNN grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        # distance KNN
        # NOTE(review): sklearn's haversine metric expects [lat, lon] in radians; these
        # columns look like degrees (vectorized_haversine applies np.radians to them) -- confirm.
        neighbors = min(len(country_df), n_neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                  metric = 'haversine',
                                  n_jobs = -1)
        # The regressor is only used for its kneighbors() search; the target is a dummy.
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']], 
                                     return_distance = True)

        # name KNN
        tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False)
        x_name = country_df['name_clean'].values
        # Blank out the placeholder so that unknown names do not look similar to each other.
        x_name[x_name == 'unknown'] = ''
        x_name = tfidf.fit_transform(x_name)
        
        knn_name = NearestNeighbors(n_neighbors = neighbors,
                                    metric = 'cosine',
                                    n_jobs = -1)
        knn_name.fit(x_name)
        dists_name, nears_name = knn_name.kneighbors(x_name)
        
        del tfidf, knn, knn_name, x_name
        gc.collect()
        
        # join distance and name KNNs
        # NOTE(review): the merge is done per rank k, so a pair found at rank i by distance
        # and at rank j != i by name presumably ends up as two rows, each with one half of
        # the features missing -- confirm this is intended.
        for k in range(neighbors):            
            cur_df = country_df[['id']]
            cur_df['match_id'] = country_df['id'].values[nears[:, k]]
            cur_df['kdist_country'] = dists[:, k]
            cur_df['kneighbors_country'] = k
            
            cur_df_name = country_df[['id']]
            cur_df_name['match_id'] = country_df['id'].values[nears_name[:, k]]
            cur_df_name['kdist_name_country'] = dists_name[:, k]
            cur_df_name['kneighbors_name_country'] = k
            cur_df = cur_df.merge(cur_df_name, on = ['id', 'match_id'], how = 'outer')
            
            train_df_country.append(cur_df)
    
    train_df_country = pd.concat(train_df_country)
    
    print('Start KNN for the whole dataset')
    train_df = []
    # With `metric` commented out this search uses NearestNeighbors' default metric, so
    # `kdist` is not in the same unit as `kdist_country`.
    knn = NearestNeighbors(n_neighbors = n_neighbors,
#                            metric = 'haversine',
                           n_jobs = -1)
    knn.fit(df[['latitude','longitude']], df.index)
    dists, nears = knn.kneighbors(df[['latitude','longitude']])
    
    for k in range(n_neighbors):            
        cur_df = df[['id']]
        # `nears` holds row positions of `df`, hence the positional lookup through .values.
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    train_df = train_df.merge(train_df_country, on = ['id', 'match_id'], how = 'outer')
    
    del train_df_country
    gc.collect()
    
    return train_df, label

## Reset the kernel (to avoid OOM)

In [12]:
# %reset --aggressive -f

## Split the dataset into two parts to imitate the test data distribution

In [13]:
def analysis(df):
    """Print the number of rows, unique ids, unique POIs and the mean number of rows per POI."""
    n_rows = len(df)
    n_ids = df['id'].nunique()
    n_pois = df['point_of_interest'].nunique()
    print('Num of data: %s' % n_rows)
    print('Num of unique id: %s' % n_ids)
    print('Num of unique poi: %s' % n_pois)

    rows_per_poi = df.groupby('point_of_interest')['id'].count()
    print('Mean num of unique poi: %s' % rows_per_poi.mean())

## Data split
# Two folds grouped by POI: all rows of one POI land in the same half.
kf = GroupKFold(n_splits=2)
poi_groups = data['point_of_interest']
for i, (trn_idx, val_idx) in enumerate(kf.split(data, poi_groups, poi_groups)):
    # kf.split yields row positions; they equal the labels because `data` has a 0..n-1 index.
    data.loc[val_idx, 'set'] = i

print('Num of data: %s' % len(data))
print(data['set'].value_counts())

train_data = data[data['set'] == 1]
valid_data = data[data['set'] == 0]

for _title, _part in (('Train data: ', train_data), ('Valid data: ', valid_data)):
    print(_title)
    analysis(_part)

train_ids = train_data['id'].unique().tolist()
valid_ids = valid_data['id'].unique().tolist()

tv_ids_d = {'train': train_ids, 'valid': valid_ids}


Num of data: 1138812
0.0    569406
1.0    569406
Name: set, dtype: int64
Train data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369987
Mean num of unique poi: 1.5389892077289202
Valid data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369985
Mean num of unique poi: 1.5389975269267673


## TF-IDF

In [14]:
def tf_idf_vectorize(df):
    """
    Fit one term-frequency matrix per column of `CFG.vec_columns` on `df`.

    'categories' is vectorised on word level, every other column on character 3-grams
    inside word boundaries. IDF weighting is disabled.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an 'id' column and all `CFG.vec_columns`. It has to contain every id
        that is looked up later, i.e. the full dataset rather than a single split.

    Returns
    -------
    id2index_d : dict
        id -> row position inside the matrices.
    tfidf_d : dict
        column name -> sparse matrix with one row per row of `df`.
    tv_fit : sparse matrix or None
        The matrix of the last processed column (kept for backward compatibility).
    """
    # Row positions, not index labels: the matrices are indexed positionally.
    id2index_d = dict(zip(df['id'].values, range(len(df))))

    # make TF-IDF features
    tfidf_d = {}
    tv_fit = None
    for col in tqdm(CFG.vec_columns):
        if col == "categories":
            tfidf = TfidfVectorizer(use_idf=False)
        else:
            tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False)
        # Fill before casting: after astype(str) a missing value is the string 'nan'
        # and fillna() has nothing left to replace.
        tv_fit = tfidf.fit_transform(df[col].fillna(f"no{col}").astype(str).values)
        tfidf_d[col] = tv_fit

    return id2index_d, tfidf_d, tv_fit

# Fitted on the full dataset: the pair features look up ids of both the train and the
# valid half. (The function previously ignored its argument and read the global `data`.)
id2index_d, tfidf_d, tv_fit = tf_idf_vectorize(data)

  0%|          | 0/5 [00:00<?, ?it/s]

## Find closest neighbours

In [15]:
%%time

# find nearest neighbours
train_data_simple = recall_simple(train_data, CFG.rec_columns, threshold=2)
valid_data_simple = recall_simple(valid_data, CFG.rec_columns, threshold=2)

train_data = recall_knn(train_data, CFG.n_neighbors, 'train')[0]
valid_data = recall_knn(valid_data, CFG.n_neighbors, 'valid')[0]

train_data = train_data.merge(train_data_simple,
                              on = ['id', 'match_id'],
                              how = 'outer')

valid_data = valid_data.merge(valid_data_simple,
                              on = ['id', 'match_id'],
                              how = 'outer')

# create train target feature
# A local id -> POI lookup: the global `data` frame is no longer re-indexed and restored,
# so an interrupted run cannot leave it in a half-converted state, and only one column is
# gathered per lookup instead of the whole frame.
poi_by_id = data.set_index('id')['point_of_interest']


def _add_pair_label(pairs, poi_by_id):
    """Add `label` (int8): 1 when 'id' and 'match_id' belong to the same POI, else 0."""
    poi = poi_by_id.loc[pairs['id'].tolist()].values
    match_poi = poi_by_id.loc[pairs['match_id'].tolist()].values
    pairs['label'] = np.array(poi == match_poi, dtype = np.int8)
    return pairs


train_data = _add_pair_label(train_data, poi_by_id)
valid_data = _add_pair_label(valid_data, poi_by_id)

del dfs, poi_by_id
gc.collect()

print('Num of unique train id: %s' % train_data['id'].nunique())
print('Num of train data: %s' % len(train_data))
print('Pos rate: %s' % train_data['label'].mean())

print('Num of unique valid id: %s' % valid_data['id'].nunique())
print('Num of valid data: %s' % len(valid_data))
print('Pos rate: %s' % valid_data['label'].mean())

  0%|          | 0/569406 [00:00<?, ?it/s]

Num of data: 2739669
Num of data per id: 7.592665264348129


  0%|          | 0/569406 [00:00<?, ?it/s]

Num of data: 2560097
Num of data per id: 7.102482723033089
Start KNN grouped by country


  0%|          | 0/211 [00:00<?, ?it/s]

Start KNN for the whole dataset
Start KNN grouped by country


  0%|          | 0/210 [00:00<?, ?it/s]

Start KNN for the whole dataset
Num of unique train id: 569406
Num of train data: 26851435
Pos rate: 0.05613778183549594
Num of unique valid id: 569406
Num of valid data: 26675286
Pos rate: 0.05626020279595128
CPU times: user 1h 10min 36s, sys: 16min 34s, total: 1h 27min 10s
Wall time: 30min 33s


## Feature Engineering

In [16]:
%load_ext Cython

## Longest common subsequence

In [17]:
%%cython
def LCS(str S, str T):
    """
    Return the length of the longest common subsequence of S and T.

    Classic dynamic programme, but only the previous and the current row of the table
    are kept, so the memory use is O(len(T)) instead of O(len(S) * len(T)).
    """
    cdef int i, j
    cdef int n = len(S)
    cdef int m = len(T)
    cdef list prev = [0] * (m + 1)
    cdef list cur
    for i in range(n):
        cur = [0] * (m + 1)
        for j in range(m):
            # prev[j]: both characters consumed; cur[j] / prev[j + 1]: skip one character.
            cur[j + 1] = max(prev[j] + (S[i] == T[j]), cur[j], prev[j + 1])
        prev = cur
    return prev[m]

## Add distance features

In [18]:
def seq_match_distance(str1, str2):
    """`Levenshtein.ratio` of the two strings, or NaN when either one is the 'unknown' placeholder."""
    if 'unknown' in (str1, str2):
        return np.nan
    return Levenshtein.ratio(str1, str2)

def lev_distance(str1, str2):
    """`Levenshtein.distance` of the two strings, or NaN when either one is the 'unknown' placeholder."""
    if 'unknown' in (str1, str2):
        return np.nan
    return Levenshtein.distance(str1, str2)

def jw_distance(str1, str2):
    """`Levenshtein.jaro_winkler` of the two strings, or NaN when either one is the 'unknown' placeholder."""
    if 'unknown' in (str1, str2):
        return np.nan
    return Levenshtein.jaro_winkler(str1, str2)

def lcs_distance(str1, str2):
    """`LCS` of the two strings (cast to `str`), or NaN when either one is the 'unknown' placeholder."""
    if 'unknown' in (str1, str2):
        return np.nan
    first, second = str(str1), str(str2)
    return LCS(first, second)

def vectorized_haversine(lats1, lats2, longs1, longs2):
    """
    Haversine distance between two sets of points given in degrees.

    Note the argument order: both latitudes first, then both longitudes. The result is
    scaled by a sphere radius of 6371 and is computed element-wise for array inputs.
    """
    radius = 6371
    phi1 = np.radians(lats1)
    phi2 = np.radians(lats2)
    half_dlat = np.radians(lats2 - lats1) / 2
    half_dlon = np.radians(longs2 - longs1) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlon) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * central_angle

def categorical_similarity(str1, str2):
    """
    Overlap of two ', '-separated category lists.

    Returns the number of shared categories divided by the size of the smaller list,
    or NaN when either argument is the 'unknown' placeholder.
    """
    if 'unknown' in (str1, str2):
        return np.nan

    cats1 = set(str1.split(", "))
    cats2 = set(str2.split(", "))
    n_shared = len(cats1 & cats2)

    return max(n_shared / len(cats1), n_shared / len(cats2))

def add_distance_features(df):
    """
    Add pairwise features for every ('id', 'match_id') row of `df`.

    Relies on module-level state: `data` indexed by 'id', the `tfidf_d` / `id2index_d`
    pair returned by tf_idf_vectorize() and the column lists in `CFG`.

    For a column in `CFG.dist_columns` the coordinate features 'manhattan', 'euclidean'
    and 'haversine' are added. For every other column in `CFG.feat_columns`:
      * `<col>_sim`   - dot product of the two rows of the column's TF matrix
                        (only for `CFG.vec_columns`)
      * `<col>_gesh`, `<col>_leven`, `<col>_jaro`, `<col>_lcs` - string metrics
      * `category_venn` - list overlap (only for 'categories')
      * `<col>_len_diff`, `<col>_nleven`, `<col>_nlcsk`, `<col>_nlcs` - length based
        features (not for 'phone', 'country', 'zip')
    Features listed in `CFG.bad_features` are skipped, except raw leven / lcs values
    that are needed as input of a kept normalised feature.

    Parameters
    ----------
    df : pd.DataFrame
        Candidate pairs with 'id' and 'match_id' columns. It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of `df` with the feature columns appended.
    """
    # One explicit copy instead of writing into the caller's frame.
    df = df.copy()
    ids = df['id'].to_numpy()
    match_ids = df['match_id'].to_numpy()
    bad, keep = CFG.bad_features, CFG.save_features
    # Row positions inside the TF matrices; built once, on first use.
    indexs = match_indexs = None

    for col in tqdm(CFG.feat_columns):
        # add distance metrics
        if col in CFG.dist_columns:
            # `.loc[rows, column]` gathers a single column; `.loc[rows][column]` would
            # first materialise every column for all pairs.
            lat1 = data.loc[ids, 'latitude'].values
            lat2 = data.loc[match_ids, 'latitude'].values
            diff_lat = np.abs(lat2-lat1)

            lon1 = data.loc[ids, 'longitude'].values
            lon2 = data.loc[match_ids, 'longitude'].values
            diff_lon = np.abs(lon2-lon1)

            df["manhattan"] = diff_lat + diff_lon
            df['euclidean'] = np.sqrt(np.square(diff_lat) + np.square(diff_lon))
            df['haversine'] = vectorized_haversine(lat1, lat2, lon1, lon2)
            continue

        if col in CFG.vec_columns:
            # for each id and match_id take the corresponding TF-IDF vector,
            # then multiply them elementwise and sum up to get the similarity
            if indexs is None:
                indexs = [id2index_d[i] for i in ids]
                match_indexs = [id2index_d[i] for i in match_ids]
            tv_fit = tfidf_d[col]
            df[f'{col}_sim'] = np.asarray(
                tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)
            ).ravel()

        # set current POI values and values of matched POI
        col_values = data.loc[ids, col].values.astype(str)
        match_col_values = data.loc[match_ids, col].values.astype(str)

        has_len_features = col not in ['phone', 'country', 'zip']
        # A raw metric is needed when it is kept itself or when a normalised feature
        # derived from it is going to be built below.
        need_leven = (f"{col}_leven" not in bad or f"{col}_nleven" in keep
                      or (has_len_features and f"{col}_nleven" not in bad))
        need_lcs = (f"{col}_lcs" not in bad or f"{col}_nlcsk" in keep or f"{col}_nlcs" in keep
                    or (has_len_features and (f"{col}_nlcsk" not in bad
                                              or f"{col}_nlcs" not in bad)))

        if f"{col}_gesh" not in bad:
            df[f"{col}_gesh"]=[*map(seq_match_distance, col_values, match_col_values)]
        if need_leven:
            df[f"{col}_leven"]=[*map(lev_distance, col_values, match_col_values)]
        if f"{col}_jaro" not in bad:
            df[f"{col}_jaro"]=[*map(jw_distance, col_values, match_col_values)]
        if need_lcs:
            df[f"{col}_lcs"]=[*map(lcs_distance, col_values, match_col_values)]

        # and IOU of categories
        if col in ['categories']:
            df["category_venn"] = [*map(categorical_similarity, col_values, match_col_values)]

        # add some length metrics
        if has_len_features:
            # Plain arrays instead of temporary columns: no frame copy for dropping them.
            col_len = np.array(list(map(len, col_values)), dtype=np.int64)
            match_col_len = np.array(list(map(len, match_col_values)), dtype=np.int64)

            if f'{col}_len_diff' not in bad:
                df[f'{col}_len_diff'] = np.abs(col_len - match_col_len)

            if f'{col}_nleven' not in bad:
                df[f'{col}_nleven'] = df[f'{col}_leven'] / np.maximum(col_len, match_col_len)

            if f'{col}_nlcsk' not in bad:
                df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / match_col_len

            if f'{col}_nlcs' not in bad:
                df[f'{col}_nlcs'] = df[f'{col}_lcs'] / col_len

            gc.collect()

    return df

## Check maximum IOU score

In [19]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Return a dict mapping every 'id' to its 'point_of_interest'."""
    id_poi_pairs = zip(input_df['id'], input_df['point_of_interest'])
    return {row_id: poi for row_id, poi in id_poi_pairs}

def get_poi2id(input_df: pd.DataFrame) -> dict:
    """Return a dict mapping every 'point_of_interest' to the set of its ids."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df, id2poi, poi2id):
    ''' Mean IOU between the true and the predicted match set of every row.

        `input_df` needs an 'id' column and a 'matches' column holding the predicted
        ids as one whitespace-separated string. '''
    def _iou(row_id, matches):
        targets = poi2id[id2poi[row_id]]
        preds = set(matches.split())
        return len(targets & preds) / len(targets | preds)

    id_array = input_df['id'].to_numpy()
    match_array = input_df['matches'].to_numpy()
    scores = np.array([_iou(row_id, matches) for row_id, matches in zip(id_array, match_array)])
    return scores.mean()

def get_max_score(data, train_data, tv_ids_d, label='train'):
    ''' Best IOU the candidate set allows for the ids of one split.

        Every id of `tv_ids_d[label]` is paired with itself, the positive candidate
        pairs (label == 1) of `train_data` are appended, and the resulting match set of
        each id is compared with all ids of its POI inside the same split. '''
    subset = data.set_index('id').loc[tv_ids_d[label]].reset_index()

    id2poi = get_id2poi(subset)
    poi2id = get_poi2id(subset)

    # every id matches itself
    self_pairs = pd.DataFrame()
    self_pairs['id'] = subset['id'].unique().tolist()
    self_pairs['match_id'] = self_pairs['id']
    print('Unique id: %s' % len(self_pairs))

    positive_pairs = train_data[train_data['label'] == 1][['id', 'match_id']]
    all_pairs = pd.concat([self_pairs, positive_pairs])

    matches_per_id = all_pairs.groupby('id')['match_id'].apply(list).reset_index()
    matches_per_id['matches'] = matches_per_id['match_id'].apply(lambda x: ' '.join(set(x)))
    matches_per_id = matches_per_id[['id', 'matches']]
    print('Unique id: %s' % len(matches_per_id))

    return get_score(matches_per_id, id2poi, poi2id)

# IOU reached when every positive candidate pair of the train half is predicted.
get_max_score(data, train_data, tv_ids_d, 'train')
# get_max_score(data, valid_data, tv_ids_d, 'valid')

Unique id: 569406
Unique id: 569406


0.9755151282326718

- KNN 20 - max IOU 0.9231233344021228
- KNN 25 - max IOU 0.9290987088025121
- KNN 30 - max IOU 0.9336762986665911
- KNN 35 - max IOU 0.9372980642092062
- KNN 40 - max IOU 0.9402722991889505
- KNN 45 - max IOU 0.9427770977806317
- KNN 50 - max IOU 0.9449001281362694

- KNN 20 + KNN by name and country - 0.9732
- KNN 20 + KNN by name, address and country - 0.975 (too heavy)
- KNN 20 + KNN by name and country + simple KNN - 0.9737
- KNN 20 + KNN by name and country + simple KNN with sorted values - 0.97395

- KNN 20 + KNN by cleaned name and country + simple KNN with sorted values and TF-IDF(3,3) - 0.975515

## Create train and validation datasets

In [None]:
def create_dataset(df, label='train'):
    """
    Build the pair features for `df` chunk by chunk and write every chunk to parquet.

    The unique ids of `df` are cut into `CFG.n_splits` consecutive chunks (the last one
    takes the remainder). For every chunk the features are added, the columns listed in
    `CFG.bad_features` are dropped, the dtypes are shrunk and the result is written to
    `<CFG.train_path>/<label>_data<k>.parquet` (without `<k>` when there is one split).

    Parameters
    ----------
    df : pd.DataFrame
        Candidate pairs; needs 'id', 'match_id', 'kdist', 'kdist_country',
        'kneighbors' and 'kneighbors_country'.
    label : str
        Prefix of the output files.

    Returns
    -------
    int
        Always 0.
    """
    print(f'Dataset type is {label}')
    # to_parquet does not create missing directories.
    os.makedirs(CFG.train_path, exist_ok=True)

    ## Add features
    count = 0

    unique_id = df['id'].unique().tolist()
    num_split_id = len(unique_id) // CFG.n_splits
    for k in range(1, CFG.n_splits + 1):
        print('Current split: %s' % k)
        start_row = (k - 1) * num_split_id
        # The last chunk runs to the end so that no id is lost to integer division.
        end_row = start_row + num_split_id if k < CFG.n_splits else None
        cur_id = unique_id[start_row:end_row]
        cur_data = df[df['id'].isin(cur_id)]

        cur_data = add_distance_features(cur_data)
        cur_data['kdist_diff'] = np.abs(cur_data['kdist'] - cur_data['kdist_country']) /\
                                  cur_data['kdist_country']
        cur_data['kneighbors_mean'] = cur_data[['kneighbors', 'kneighbors_country']].mean(axis = 1)

        cur_data = cur_data.drop(columns=[f for f in CFG.bad_features if f in cur_data.columns])

        cur_data, _ = shrink_mem_new_cols(cur_data, oldcols=None, allow_categorical=True, silent=True)

        if CFG.n_splits == 1:
            cur_data.to_parquet(f'{CFG.train_path}/{label}_data.parquet', index = False)
        else:
            cur_data.to_parquet(f'{CFG.train_path}/{label}_data{k}.parquet', index = False)

        count += len(cur_data)

        if CFG.debug:
            display(cur_data.head())

        del cur_data
        gc.collect()

    print(f'Total len is {count}')
    return 0
    
# add_distance_features() looks rows up by id. The guard keeps the cell re-runnable:
# a second set_index('id') would raise KeyError because the column is gone.
if data.index.name != 'id':
    data = data.set_index('id')

with multiprocessing.Pool(processes=2) as pool:
    pool.starmap(create_dataset, [(train_data, 'train'),
                                  (valid_data, 'valid')])

Dataset type is train
Current split: 1


  0%|          | 0/15 [00:00<?, ?it/s]

Dataset type is valid
Current split: 1


  0%|          | 0/15 [00:00<?, ?it/s]

# Further ideas



- add this location matching https://www.kaggle.com/code/gabrielbchacon/full-foursquare-location-matches
- try DBSCAN https://www.kaggle.com/code/frankmollard/rapids-dbscan
- use simple cat similarity https://www.kaggle.com/code/sgreiner/foursquare-simple-semantic-cat-similarity
- use Cat2Vec to calculate categories similarity https://www.kaggle.com/code/aerdem4/foursquare-cat2vec/notebook
- use BERT to categorize text column https://www.kaggle.com/code/lunapandachan/foursquare-s-bert-labo-note#V3-Labeled-train


- check features with SHAP

- how to handle missing data https://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python


- increase number of nearest neighbours to a very high value (like 50-100-200), so you will be able to find more matches
- Optuna!
- increase number of folds (like 5) 
- use stacking