This notebook shows how to solve the problem as a multi-class classification by finding candidate points based on geographic location.<br>
Similarity as a string, such as edit distance and LCS (Longest Common Subsequence), was used for the features of the candidate points.<br>
<br>
Inference is made on test data only, but the code for training is left commented out.<br>
<br>
In addition, making the matches bidirectional as a post-processing step improved the score by about 1%.<br>
<br>

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import os
import gc
import re
import sys
import math
import json
import time
import random
import joblib
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from glob import glob
from unidecode import unidecode
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold

from requests import get

CFG = Namespace(
    seed = 46,
    train = True,
    target = "point_of_interest",
    n_neighbors = 10,
    n_splits = 3
)

random.seed(CFG.seed)
os.environ["PYTHONHASHSEED"] = str(CFG.seed)
np.random.seed(CFG.seed)

plt.rcParams["font.size"] = 13
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 600)

# Get the dataset

In [3]:
train = pd.read_parquet("foursquare-location-matching-parquet/train.parquet")
test = pd.read_csv("foursquare-location-matching/test.csv")
test[CFG.target] = "TEST"

train.head(5)

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,P_d82910d8382a83
2,E_000007f24ebc95,ร้านตัดผมการาเกด,13.780813,100.4849,,,,,TH,,,Salons / Barbershops,P_b1066599e78477
3,E_000008a8ba4f48,Turkcell,37.84451,27.844202,Adnan Menderes Bulvarı,,,,TR,,,Mobile Phone Shops,P_b2ed86905a4cd3
4,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,,Caviedes,Cantabria,,ES,,,Spanish Restaurants,P_809a884d4407fb


# Prepare data

## Sort categories

In [4]:
def sort_categories(cat):
    if cat is None:
        return None
    return ', '.join(sorted(cat.split(', ')))

train['categories'] = train['categories'].apply(sort_categories)

## Encode all names, addresses, cities and states in unicode 

In [8]:
def decode(val):
    if val is None:
        return None
    return unidecode(val)

train['name'] = train['name'].apply(decode)

## Clean the name

In [10]:
def clean_name(name_col):
    return name_col.str.lower()\
                    .str.replace(",", "")\
                    .str.replace(".", "")\
                    .str.replace("'", "")\
                    .str.replace("the ", "")\
                    .str.split(" ")

## Add main category to the train dataset

In [11]:
cat_freq = dict()
stop_words = ['/', '&', 'or', 'High', 'Miscellaneous', 'Fast', 'Other', 'Asian', 'Chinese', 'Event', 
              'Great', 'Noodle', 'Burger', 'Seafood', 'Breakfast', 'Ice', 'Diners', 'Cream', 'Indonesian', 
              'Thai', "Women's", 'Fried', 'Snack', 'Tea', 'Mexican', 'Nail', 'Sushi', 'Middle', 'Korean', 
              'Gift', 'Drink', 'Pet', 'Turkish', "Men's", 'Indian', 'Malay', 'Cocktail', 'Donut', 'Box', 
              'Condos)', 'Residential', 'Convenience', 'Gas', 'General', 'Bus', 'Pizza', 'Spaces', 'Mobile',
              'Phone', 'Academic', 'Japanese', 'Business', 'Shoe', 'Italian', 'American', 'Home', 'Auto', 
              'Furniture', 'Cosmetics', 'Sandwich', 'Dessert', 'Car', 'Arts', 'Financial', 'Legal', 'BBQ',
              'Hardware', 'Video', 'Music', 'Art', 'Student', 'Jewelry', 'Historic', 'Travel', 'Washes',
              'Beer', 'Arcades', 'Bike', 'Lookouts', 'Scenic', 'Rental', 'Accessories', 'Repairs', 'Discount', 
              'Optical', 'Bodegas', 'Big', 'Assisted', 'Living', 'Athletics', 'Agencies', 'Locations', 'Trails', 
              'Bed', 'Breakfasts', 'Wine', 'Real', 'Elementary', 'Theme', 'Golf', 'Rest',  'Photography', 
              'Nightlife', 'Courses', 'Convention', 'Eastern', 'Concert', 'Conference', 'Startups', 'Tech', 
              'Meeting', 'French', 'Supplies', 'Events', 'Sake', 'Dog', 'Ramen', 'City', 'Juice', 'Science',
              'Liquor', 'Lawyers', 'Insurance', 'Flower', 'Toy', 'Rentals', 'Paper', 'Flea', 'Bases', 'Baseball', 
              'Karaoke', 'Kids', 'Design', 'Farmers', 'Repair', 'Technology', 'Wards', 'Water', 'Supply', 
              'Filipino', 'Piers', 'Salad', 'Mattress', 'Print', 'Wings', 'Engineering', 'Non-Profits', 
              'Gastropubs', 'Bistros', 'Hot', 'Vietnamese', 'Hookah', 'Candy', 'Coffee', 'Electronics',
              'Department', 'Clothing', 'Trucks', 'Chicken', 'Movie', 'Health', 'Soccer', 'Crafts', 
              'Game', 'Community', 'Food', 'College', 'Sporting', 'Beauty', 'Ferries', 'Soup', 'Veterinarians', 
              'Basketball', 'Light', 'Rail', 'Taco', 'Classrooms', 'Shopping', 'Developments', 'Train', 'Performing',
              'Administrative', 'Lingerie', 'Dive', 'Storage', 'Office', 'Landscaping', 'Residence', 'Sports',
              'Goods', 'Dealerships', 'Grocery', 'Workshops', 'History'
             ]


def get_categories(category):
    if category == 'Auto':
        return 'Automotive'
    if category == 'Hotel' or c == 'Motels' or c == 'Hostels':
        return 'Hotels'
    if category == 'Courthouses':
        return 'Court'
    if category == 'College':
        return 'Colleges'
    if category == 'Cafés':
        return 'Cafes'
    if category == "Doctor's" or c == "Dentist's" or c == "Doctors":
        return 'Medical'
    if category == '(Apartments':
        return 'Apartments'
    return category
    

for category in tqdm(train['categories']):
    if category is not None:
        category_list = re.split(', | ', category)
        for c in category_list:
            if c in stop_words or c[-2:] == 'an':
                continue
            c = get_categories(c)
            x = cat_freq.get(c, 0) + 1
            cat_freq[c] = x

            
cat_freq = pd.DataFrame(cat_freq.items(), 
                        columns=['category', 'frequence']).sort_values('frequence', 
                                                                        ascending=False).reset_index(drop=True)
cat_freq = cat_freq.iloc[:108]

cat_freq_dict = dict(zip(cat_freq['category'], cat_freq['frequence']))

def get_main_category(category):
    if category is not None:
        category_list = re.split(', | ', category)
        most_freq_cat = np.nan
        freq = 0
        
        for c in category_list:
            if c in stop_words or c[-2:] == 'an':
                continue
            c = get_categories(c)
            f = cat_freq_dict.get(c, 0)
            if f > freq:
                freq = f
                most_freq_cat = c
        
        return most_freq_cat
            
    return np.nan

train['main_categories'] = train['categories'].apply(get_main_category)
test['main_categories'] = test['categories'].apply(get_main_category)

  0%|          | 0/1138812 [00:00<?, ?it/s]

## Fill the missing data with data from outer sources

- to fill the state we will use objects with code A (country, state, region)

- to fill the city we will use objects with code P (city, village)

- to fill the street we will use objects with codes R (road, railroad) and S (spot, building, farm)

In [12]:
# states = pd.read_csv('states.csv', index_col='Unnamed: 0')
cities = pd.read_csv('additional_data/cities.csv', encoding = "ISO-8859-1")
cities = cities[['asciiname', 'latitude', 'longitude', 'country code']]
cities.rename({'asciiname': 'city', 'country code': 'country'}, axis=1, inplace=True)

## Fill the missing data by finding closest neighbors from outer sources

In [13]:
train.isnull().sum()

id                        0
name                      1
latitude                  0
longitude                 0
address              396621
city                 299189
state                420586
zip                  595426
country                  11
url                  871088
phone                795957
categories            98307
point_of_interest         0
main_categories      189673
dtype: int64

In [14]:
from sklearn.neighbors import KNeighborsRegressor

train['country'] = train['country'].fillna('NA')
test['country'] = test['country'].fillna('NA')

geoname_dict = {'city': cities}
geoname_dists = {'city': 5000}

def fill_the_missing_data(df, df_dict, df_dists):
    dfs = []
    columns = list(df_dict.keys())
    for c in tqdm(columns):
        for country, country_df in tqdm(df.groupby("country")):

            geoname_df = df_dict[c]
            
            country_df = country_df[country_df[c].isnull()]
            geoname_df = geoname_df[geoname_df['country'] == country]
                
            if len(country_df) == 0 or len(geoname_df) == 0:
                continue
            
            knn = KNeighborsRegressor(n_neighbors=min(len(geoname_df), 2), metric='haversine', n_jobs=-1)
            knn.fit(geoname_df[['latitude','longitude']], geoname_df.index)
            dists, nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=True)
            
            if nears.shape[1] < 2:
                continue
            
            nears[:,1] = nears[:,0]
            nears[:,0] = country_df.index
            dists = dists[:,0]*6371000
            
            nears = nears[dists<=df_dists[c]]
            
#             display(nears)
#             display(dists)
            
            for n in nears:
                t_idx = n[0]
                c_idx = n[1]
                df.loc[t_idx, c] = geoname_df.iloc[c_idx]['city']
#                 display(df.loc[t_idx])
#                 display(geoname_df.iloc[c_idx])
                       
    return df
    
# train = fill_the_missing_data(train, geoname_dict, geoname_dists)
# test = fill_the_missing_data(test, geoname_dict, geoname_dists)

# del cities

gc.collect()

0

In [15]:
train.isnull().sum()

id                        0
name                      1
latitude                  0
longitude                 0
address              396621
city                 299189
state                420586
zip                  595426
country                   0
url                  871088
phone                795957
categories            98307
point_of_interest         0
main_categories      189673
dtype: int64

## Divide Train Data into about 600K×2

This is because test set size is about 600K, so we want our model to train on KNN embeddings built on datasets of similar size.

In [16]:
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(kf.split(train, train[CFG.target], train[CFG.target])):
    train.loc[val_idx, "set"] = i
train["set"].value_counts()

1.0    569406
0.0    569406
Name: set, dtype: int64

## Search Candidates

In [None]:
from sklearn.neighbors import KNeighborsRegressor

def create_target(row):
    if row[CFG.target] == row['near_target_0']:
        return 1
    return 0

def add_neighbor_features(df, train_mode=True):
    dfs = []
    columns = ['id', 'name', 'address', 'city', 'state',
           'zip', 'country', 'url', 'phone', 'categories', 'main_categories']
    for c in columns:
        if c != "id":
            df[c] = df[c].astype(str).str.lower()

    for country, country_df in tqdm(df.groupby("country")):
        country_df = country_df.reset_index(drop=True)
        
        knn = KNeighborsRegressor(n_neighbors=min(len(country_df), CFG.n_neighbors), 
                                  metric='haversine', n_jobs=-1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=True)

        targets = country_df[CFG.target].values
        
        for i in range(1, min(len(country_df), CFG.n_neighbors)): # 200
            for j in range(1, min(len(country_df), CFG.n_neighbors)): # 200
                temp_df = country_df.copy()
                # if j > 10 and j != i: continue
                if j < i:
                    country_df[f"d_near_{j}"] = dists[:, j]
                    country_df[f"near_target_{j}"] = targets[nears[:, j]]
                    for c in columns:
                        country_df[f"near_{c}_{j}"] = country_df[c].values[nears[:, j]]
                elif j > i:
                    country_df[f"d_near_{j-1}"] = dists[:, j]
                    country_df[f"near_target_{j-1}"] = targets[nears[:, j]]
                    for c in columns:
                        country_df[f"near_{c}_{j-1}"] = country_df[c].values[nears[:, j]]
                else:
                    country_df[f"d_near_0"] = dists[:, j]
                    country_df[f"near_target_0"] = targets[nears[:, j]]
                    for c in columns:
                        country_df[f"near_{c}_0"] = country_df[c].values[nears[:, j]]    

            for j in range(min(len(country_df), CFG.n_neighbors), CFG.n_neighbors):
                country_df[f"d_near_{j}"] = np.nan
                country_df[f"near_target_{j}"] = np.nan
                for c in columns:
                    country_df[f"near_{c}_{j}"] = np.nan

            dfs.append(temp_df)
        
    df = pd.concat(dfs).reset_index(drop=True)
    
    if train_mode:
        df['target'] = df.apply(create_target, axis=1)
    
    return df

train = pd.concat([
    add_neighbor_features(train[train["set"]==0]), 
    add_neighbor_features(train[train["set"]==1])
])
test = add_neighbor_features(test, train_mode=False)

gc.collect()

  0%|          | 0/210 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

## Balance train dataset

In [None]:
train = train.reset_index(drop=True)

# select indexes of all positive targets
# and select indexes of all ids that don't have postive targets at all
pos_ids = train.loc[train['target'] == 1, 'id'].unique()
pos_idxs = train[train['target'] == 1].index
neg_idxs = train.loc[~train['id'].isin(pos_ids), 'id'].drop_duplicates(keep='first').index

# additionally select indexes of ids that have negative target
# but may have positive target 
neg_idxs1 = train[train['target'] == 0].index
neg_idxs1 = neg_idxs1.difference(neg_idxs)
neg_idxs1 = np.random.choice(neg_idxs1, size=len(pos_idxs)-len(neg_idxs))

# and add them to negative indexes, so the total number of positive and negative indexes are equal
neg_idxs = neg_idxs.union(neg_idxs1)

# select these positive and negative indexes from the dataset
train = train.loc[pos_idxs.union(neg_idxs)]

gc.collect()

train['target'].value_counts()

## Check Maximum Score

In [None]:
# # https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
# def get_id2poi(input_df: pd.DataFrame) -> dict:
#     return dict(zip(input_df['id'], input_df['point_of_interest']))

# def get_poi2ids(input_df: pd.DataFrame) -> dict:
#     return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

# def get_score(input_df: pd.DataFrame):
#     scores = []
#     for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
#         targets = poi2ids[id2poi[id_str]]
#         preds = set(matches.split())
#         score = len((targets & preds)) / len((targets | preds))
#         scores.append(score)
#     scores = np.array(scores)
#     return scores.mean()

# id2poi = get_id2poi(train)
# poi2ids = get_poi2ids(train)

In [None]:
# scores = []

# train["matches"] = ""
# for i in tqdm(range(CFG.n_neighbors)):
#     idx = train[CFG.target]==train[f"near_target_{i}"]
#     train.loc[idx, "matches"] += " " + train.loc[idx, f"near_id_{i}"]
#     scores.append(get_score(train))
# train["mathces"] = None

In [None]:
# plt.subplots(figsize=(8, 3), facecolor="white")
# plt.plot(range(CFG.n_neighbors), scores, marker="o")
# plt.grid()
# plt.xlabel("# of candidates")
# plt.ylabel("Maximum Score")
# plt.ylim([0.6, 1.0])
# plt.show()

In [None]:
# del train
# gc.collect()

## Feature Engineering

In [None]:
%load_ext Cython

In [None]:
%%cython
def LCS(str S, str T):
    cdef int i, j
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

## Split the dataset again (to avoid OOM) 

In [None]:
train = train.reset_index(drop=True)

kf = KFold(n_splits=4)
for i, (trn_idx, val_idx) in enumerate(kf.split(train, train[CFG.target])):
    train.loc[val_idx, "set"] = i
train["set"].value_counts()

## Create distance features

In [None]:
import Levenshtein
import difflib

def _add_distance_features(args):
    _, df = args

    columns = ['name', 'address', 'city', 'state',
           'zip', 'country', 'url', 'phone', 'categories', 'main_categories']

    for i in tqdm(range(CFG.n_neighbors)):
        for c in columns:
            geshs = []
            levens = []
            jaros = []
            lcss = []
            for str1, str2 in df[[c, f"near_{c}_{i}"]].values.astype(str):
                if str1 is not None and str2 is not None:
                    geshs.append(difflib.SequenceMatcher(None, str1, str2).ratio())
                    levens.append(Levenshtein.distance(str1, str2))
                    jaros.append(Levenshtein.jaro_winkler(str1, str2))
                    lcss.append(LCS(str(str1), str(str2)))
                else:
                    geshs.append(-1)
                    levens.append(-1)
                    jaros.append(-1)
            df[f"near_{c}_{i}_gesh"] = geshs
            df[f"near_{c}_{i}_leven"] = levens
            df[f"near_{c}_{i}_jaro"] = jaros
            df[f"near_{c}_{i}_lcs"] = lcss
            
            if not c in ['country', "phone", "zip"]:
                df[f"near_{c}_{i}_len"] = df[f"near_{c}_{i}"].astype(str).map(len)
                df[f"near_{c}_{i}_nleven"] = df[f"near_{c}_{i}_leven"] / df[[f"near_{c}_{i}_len", f"near_{c}_0_len"]].max(axis=1)
                df[f"near_{c}_{i}_nlcsi"] = df[f"near_{c}_{i}_lcs"] / df[f"near_{c}_{i}_len"]
                df[f"near_{c}_{i}_nlcs0"] = df[f"near_{c}_{i}_lcs"] / df[f"near_{c}_0_len"]
    return df


def add_distance_features(df):
    processes = multiprocessing.cpu_count()-1
    with multiprocessing.Pool(processes=processes) as pool:
        dfs = pool.imap_unordered(_add_distance_features, df.groupby('country'))
        dfs = tqdm(dfs)
        dfs = list(dfs)
    df = pd.concat(dfs)
    return df


train = pd.concat([
    add_distance_features(train[train["set"]==0]), 
    add_distance_features(train[train["set"]==1]),
    add_distance_features(train[train["set"]==2]), 
    add_distance_features(train[train["set"]==3])
])
# train = add_distance_features(train)
# test = add_distance_features(test)

## Delete unused columns (just to avoid OOM)

In [None]:
features = []

columns = ['name', 'address', 'city', 'state',
       'zip', 'country', 'url', 'phone', 'categories', 'main_categories']
for i in tqdm(range(CFG.n_neighbors)):
    features.append(f"d_near_{i}")
    for c in columns:        
        features += [f"near_{c}_{i}_gesh", f"near_{c}_{i}_jaro", f"near_{c}_{i}_lcs"]
        if c in ['country', "phone", "zip"]:
            features += [f"near_{c}_{i}_leven"]
        else:
            features += [f"near_{c}_{i}_len", f"near_{c}_{i}_nleven", f"near_{c}_{i}_nlcsi", f"near_{c}_{i}_nlcs0"]

for f in features:
    if f not in test.columns:
        test[f] = np.nan


In [None]:
train = train[features + [CFG.target, "target", "id"] + [f"near_id_{i}" for i in range(CFG.n_neighbors)]]
# test = test[features + ["id"] + [f"near_id_{i}" for i in range(CFG.n_neighbors)]]

train[features] = train[features].astype(np.float16)
# test[features] = test[features].astype(np.float16)

train["target"] = train["target"].fillna(0)
train["near_id_0"] = train["near_id_0"].fillna('')

train.reset_index(drop=True, inplace=True)
# test.reset_index(drop=True, inplace=True)

for _ in range(5):
    gc.collect()

train.info()

# Train

In [None]:
import lightgbm as lgb
from scipy.misc import derivative


def fit_lgbm(X, y, params=None, es_rounds=20, seed=42, N_SPLITS=5, 
             n_class=None, model_dir=None, folds=None):
    cat_features = X.select_dtypes(include='object').columns
    
    models = []
    oof = np.zeros(len(y), dtype=np.float64)
    
    for i in tqdm(range(CFG.n_splits)):
        print(f"== fold {i} ==")
        trn_idx = folds != i
        val_idx = folds == i
        
        train_dataset = lgb.Dataset(X.iloc[trn_idx], y.iloc[trn_idx], categorical_feature=cat_features)
        valid_dataset = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx], categorical_feature=cat_features)

        
        focal_loss = lambda x,y: focal_loss_lgb(x, y, alpha=1., gamma=1.)
        focal_loss_eval = lambda x,y: focal_loss_lgb_eval_error(x, y, alpha=1., gamma=1.)
        
        if model_dir is None:
            model = lgb.train(
                params,
                train_set = train_dataset, 
                valid_sets = [train_dataset, valid_dataset], 
                callbacks = [lgb.log_evaluation(100), 
                             lgb.early_stopping(stopping_rounds=es_rounds)],
            )
        else:
            with open(f'{model_dir}/lgbm_fold{i}.pkl', 'rb') as f:
                model = pickle.load(f)
            
        pred = model.predict(X.iloc[val_idx])
        oof[val_idx] = pred
        models.append(model)
        
        file = f'lgbm_fold{i}.pkl'
        pickle.dump(model, open(file, 'wb'))
        print()

    cv = (np.round(oof) == y).mean()
    print(f"CV-accuracy: {cv}")

    return oof, models

def inference_lgbm(models, feat_df):
    pred = np.array([model.predict(feat_df) for model in models])
    pred = np.mean(pred, axis=0)
    return pred

In [None]:
kf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
for i, (trn_idx, val_idx) in tqdm(enumerate(kf.split(train, train["target"], train["target"]))):
    train.loc[val_idx, "fold"] = i

# Validation

### Split folds

In [None]:
warnings.filterwarnings("ignore", module="lightgbm")

params = {
    'seed': CFG.seed,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'objective': 'binary',
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'max_bin': 200,
    'max_depth': 7,   
    'num_leaves': 35, 
    'min_data_in_leaf': 25,
    'n_estimators': 5000, 
    'colsample_bytree': 0.9,
    'verbose': -1,
}

if CFG.train:
    oof, models = fit_lgbm(train[features], train["target"].astype(int), 
                           params=params, n_class=int(train["target"].max() + 1), 
                           N_SPLITS=CFG.n_splits, folds=train["fold"].values)
#                            model_dir='foursquare-exp009')
else:
    models = [joblib.load(f'foursquare-exp009/lgbm_fold{i}.pkl') for i in range(CFG.n_splits)]
pred = inference_lgbm(models, test[features])

# Postprocess and validate

In [None]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    return scores.mean()

def postprocess(df):
    id2match = dict(zip(df["id"].values, df["matches"].str.split()))

    for match in tqdm(df["matches"]):
        match = match.split()
        if len(match) == 1:        
            continue

        base = match[0]
        for m in match[1:]:
            if not base in id2match[m]:
                id2match[m].append(base)
    df["matches"] = df["id"].map(id2match).map(" ".join)
    return df 

near_id = train["near_id_0"].values
matches = []

for id, ps, ids in tqdm(zip(train["id"], oof, near_id)):
    idx = np.round(ps)
    if idx == 1:
        matches.append(id + " " + ids)
    else:
        matches.append(id)
        
train['matches'] = matches
        

train = postprocess(train)
# test = postprocess(test)

id2poi = get_id2poi(train)
poi2ids = get_poi2ids(train)
print(f"CV: {get_score(train):.6f}")

In [None]:
# Baseline 
# acc: 0.9041545048699873
# CV: 0.824261

# Your base model:
# acc: 0.85371
# CV: 0.805996

# Change task to binary, save near neighbours"
# acc: 0.932
# CV: 0.8331

# Add main categories:
# acc 0.93494
# CV 0.834842

# Sort categories:
# acc 0.934852
# CV 0.834894

# Unidecode name column:
# acc 
# CV 

# Check Feature Importances

In [None]:
def plot_importances(models):
    importance_df = pd.DataFrame(models[0].feature_importance(), 
                                 index=features, 
                                 columns=['importance'])\
                        .sort_values("importance", ascending=False)

    plt.subplots(figsize=(len(features) // 4, 5))
    plt.bar(importance_df.index, importance_df.importance)
    plt.grid()
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.tight_layout()
    plt.show()

plot_importances(models)

# Simple Post-Processing

# Submit

In [None]:
ssub = pd.read_csv("../input/foursquare-location-matching/sample_submission.csv")
ssub = ssub.drop(columns="matches")
ssub = ssub.merge(test[["id", "matches"]], on="id")
ssub.to_csv("submission.csv", index=False)

ssub.head()

# Further ideas

- normalize unicode strings: https://www.kaggle.com/competitions/foursquare-location-matching/discussion/320938
- try clean the name after transforming to unicode string (you already have the function for this)
- fill the gaps in address/state data
- after that you can try to submit



- add ntlk.edit_distance to your features
- change KNN to the variant, that was proposed in this notebook: https://www.kaggle.com/code/ragnar123/flm-xlmroberta-inference-baseline
- add manhattan distance and euqlidian distance
- increase number of nearest neighbours to a very high value (like 50-100-200), so you will be able to find more matches; don't increase number of neighbours in the table to avoid OOM


- how to handle missing data https://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python
- mean/median/std encode features
- use feature generation and selection from this notebook https://www.kaggle.com/code/aerdem4/foursquare-gpu-accelerated-lofo-feature-importance
- use Cat2Vec to calculate categories similarity https://www.kaggle.com/code/aerdem4/foursquare-cat2vec/notebook



- try XLMRoberta