In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import os
import gc
import re
import sys
# import cudf
import math
import json
import time
import random
import joblib
import pickle
import warnings
import difflib
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
from glob import glob
from pathlib import Path
from unidecode import unidecode
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, StratifiedGroupKFold

from requests import get

CFG = Namespace(
    kaggle = False,
    seed = 46,
    train = False,
    validate = False,
    inference = True,
    target = "point_of_interest",
    n_neighbors = 50,
    n_neighbors_in_cols = 5,
    n_splits = 3
)

random.seed(CFG.seed)
os.environ["PYTHONHASHSEED"] = str(CFG.seed)
np.random.seed(CFG.seed)

plt.rcParams["font.size"] = 13
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 600)

# Get the dataset

In [3]:
train = pd.read_csv("foursquare_location_matching/train.csv")
train.head()

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,P_d82910d8382a83
2,E_000007f24ebc95,ร้านตัดผมการาเกด,13.780813,100.4849,,,,,TH,,,Salons / Barbershops,P_b1066599e78477
3,E_000008a8ba4f48,Turkcell,37.84451,27.844202,Adnan Menderes Bulvarı,,,,TR,,,Mobile Phone Shops,P_b2ed86905a4cd3
4,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,,Caviedes,Cantabria,,ES,,,Spanish Restaurants,P_809a884d4407fb


# Prepare data

## Sort categories

In [4]:
def sort_categories(cat):
    if cat != cat:
        return np.nan
    return ', '.join(sorted(cat.split(', ')))

train['categories'] = train['categories'].apply(sort_categories)

## Encode name  in unicode 

In [5]:
def decode(col):
    if col != col:
        return np.nan
    return unidecode(col)

train['name'] = train['name'].apply(decode)

## Clean the name

In [6]:
def clean_name(col):
    if col != col:
        return np.nan
    return col.lower()\
              .replace(",", "")\
              .replace(".", "")\
              .replace("'", "")\
              .replace("the ", "")

train['name'] = train['name'].apply(clean_name)

## Make same categories for the same names

In [7]:
# brings shops with similar names to one name
train.loc[train['name']=='mc donalds', 'name'] = 'mcdonalds'

## Add main category to the train dataset

In [8]:
# cat_freq = dict()
stop_words = ['/', '&', 'or', 'High', 'Miscellaneous', 'Fast', 'Other', 'Asian', 'Chinese', 'Event', 
              'Great', 'Noodle', 'Burger', 'Seafood', 'Breakfast', 'Ice', 'Diners', 'Cream', 'Indonesian', 
              'Thai', "Women's", 'Fried', 'Snack', 'Tea', 'Mexican', 'Nail', 'Sushi', 'Middle', 'Korean', 
              'Gift', 'Drink', 'Pet', 'Turkish', "Men's", 'Indian', 'Malay', 'Cocktail', 'Donut', 'Box', 
              'Condos)', 'Residential', 'Convenience', 'Gas', 'General', 'Bus', 'Pizza', 'Spaces', 'Mobile',
              'Phone', 'Academic', 'Japanese', 'Business', 'Shoe', 'Italian', 'American', 'Home', 'Auto', 
              'Furniture', 'Cosmetics', 'Sandwich', 'Dessert', 'Car', 'Arts', 'Financial', 'Legal', 'BBQ',
              'Hardware', 'Video', 'Music', 'Art', 'Student', 'Jewelry', 'Historic', 'Travel', 'Washes',
              'Beer', 'Arcades', 'Bike', 'Lookouts', 'Scenic', 'Rental', 'Accessories', 'Repairs', 'Discount', 
              'Optical', 'Bodegas', 'Big', 'Assisted', 'Living', 'Athletics', 'Agencies', 'Locations', 'Trails', 
              'Bed', 'Breakfasts', 'Wine', 'Real', 'Elementary', 'Theme', 'Golf', 'Rest',  'Photography', 
              'Nightlife', 'Courses', 'Convention', 'Eastern', 'Concert', 'Conference', 'Startups', 'Tech', 
              'Meeting', 'French', 'Supplies', 'Events', 'Sake', 'Dog', 'Ramen', 'City', 'Juice', 'Science',
              'Liquor', 'Lawyers', 'Insurance', 'Flower', 'Toy', 'Rentals', 'Paper', 'Flea', 'Bases', 'Baseball', 
              'Karaoke', 'Kids', 'Design', 'Farmers', 'Repair', 'Technology', 'Wards', 'Water', 'Supply', 
              'Filipino', 'Piers', 'Salad', 'Mattress', 'Print', 'Wings', 'Engineering', 'Non-Profits', 
              'Gastropubs', 'Bistros', 'Hot', 'Vietnamese', 'Hookah', 'Candy', 'Coffee', 'Electronics',
              'Department', 'Clothing', 'Trucks', 'Chicken', 'Movie', 'Health', 'Soccer', 'Crafts', 
              'Game', 'Community', 'Food', 'College', 'Sporting', 'Beauty', 'Ferries', 'Soup', 'Veterinarians', 
              'Basketball', 'Light', 'Rail', 'Taco', 'Classrooms', 'Shopping', 'Developments', 'Train', 'Performing',
              'Administrative', 'Lingerie', 'Dive', 'Storage', 'Office', 'Landscaping', 'Residence', 'Sports',
              'Goods', 'Dealerships', 'Grocery', 'Workshops', 'History'
             ]


def get_categories(category):
    if category == 'Auto':
        return 'Automotive'
    if category == 'Hotel' or category == 'Motels' or category == 'Hostels':
        return 'Hotels'
    if category == 'Courthouses':
        return 'Court'
    if category == 'College':
        return 'Colleges'
    if category == 'Cafés':
        return 'Cafes'
    if category == "Doctor's" or category == "Dentist's" or category == "Doctors":
        return 'Medical'
    if category == '(Apartments':
        return 'Apartments'
    return category

if CFG.kaggle:
    cat_freq = pd.read_csv('../input/foursquare-main-categories/cat_freq.csv', index_col='Unnamed: 0')
else:
    cat_freq = pd.read_csv('foursquare_main_categories/cat_freq.csv', index_col='Unnamed: 0')

cat_freq_dict = dict(zip(cat_freq['category'], cat_freq['frequence']))

def get_main_category(category):
    if category == category:
        category_list = re.split(', | ', category)
        most_freq_cat = np.nan
        freq = 0
        
        for c in category_list:
            if c in stop_words or c[-2:] == 'an':
                continue
            c = get_categories(c)
            f = cat_freq_dict.get(c, 0)
            if f > freq:
                freq = f
                most_freq_cat = c
        
        return most_freq_cat
            
    return np.nan

train['main_categories'] = train['categories'].apply(get_main_category)

## Fill the missing data with data from outer sources

In [9]:
# states = pd.read_csv('states.csv', index_col='Unnamed: 0')
cities = pd.read_csv('additional_data/cities.csv', encoding = "ISO-8859-1")
cities = cities[['asciiname', 'latitude', 'longitude', 'country code']]
cities.rename({'asciiname': 'city', 'country code': 'country'}, axis=1, inplace=True)

# starbucks = pd.read_csv('additional_data/starbucks.csv', index_col='Unnamed: 0')
# starbucks = starbucks[['countryCode', 'latitude', 'longitude', 'streetAddressLine2', 'city']]
# starbucks.rename({'countryCode': 'country', 'streetAddressLine2': 'address'}, axis=1, inplace=True)
# starbucks.head()

## Fill the missing data by finding closest neighbors from outer sources

In [10]:
train['country'] = train['country'].fillna('NA')
# train['closest_city'] = ''

geoname_dict = {'city': cities}

def fill_the_missing_data(args):#, df_dists):
    country, country_df = args
    dfs = []
    columns = list(geoname_dict.keys())
    for c in tqdm(columns):

            geoname_df = geoname_dict[c]
            
#             country_df = country_df[(country_df[c].isnull())]
            geoname_df = geoname_df[geoname_df['country'] == country]
                
            if len(country_df) == 0 or len(geoname_df) == 0:
                continue
            
            knn = KNeighborsRegressor(n_neighbors=min(len(geoname_df), 2), metric='haversine', n_jobs=-1)
            knn.fit(geoname_df[['latitude','longitude']], geoname_df.index)
            dists, nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=True)
            
            if nears.shape[1] < 2:
                continue
            
            nears[:,1] = nears[:,0]
            nears[:,0] = country_df.index
#             dists = dists[:,0]*6371000
            
#             nears = nears[dists<=df_dists[c]]
            
#             display(nears)
#             display(dists)
            
            for n in nears:
                t_idx = n[0]
                c_idx = n[1]
                country_df.loc[t_idx, f"closest_{c}"] = geoname_df.iloc[c_idx][c]
#                 display(df.loc[t_idx])
#                 display(geoname_df.iloc[c_idx])
                       
    return country_df
    
    
num_countries = train['country'].nunique()
    
# processes = multiprocessing.cpu_count()
# with multiprocessing.Pool(processes=processes) as pool:
#     dfs = pool.imap_unordered(fill_the_missing_data, train.groupby('country', sort=False))
#     dfs = tqdm(dfs, total=num_countries)
#     dfs = list(dfs)
    
# train = pd.concat(dfs).reset_index(drop=True)

# del cities

# gc.collect()

## Bring all object columns to low register

In [11]:
def to_lower(df):
    for c in df.columns:
#         df[c] = df[c].fillna('')
        if c != "id":
            df[c] = df[c].astype(str).str.lower()
    return df
    
train = to_lower(train)

## Search Candidates Function

In [12]:
# def create_target(row):
#     if row[CFG.target] == row['near_target_0']:
#         return 1
#     return 0

# def add_neighbor_features(df, train_mode=True):
#     dfs = None
#     columns = ['id', 'name', 'address', 'city', 'state',
#            'zip', 'country', 'url', 'phone', 'categories', 'main_categories']
#     for c in columns:
#         if c != "id":
#             df[c] = df[c].astype(str).str.lower()

#     for country, country_df in tqdm(df.groupby("country")):
#         dfs_list = list()
#         country_df = country_df.reset_index(drop=True)
        
#         knn = KNeighborsRegressor(n_neighbors=min(len(country_df), CFG.n_neighbors), 
#                                   metric='haversine', n_jobs=-1)
#         knn.fit(country_df[['latitude','longitude']], country_df.index)
#         dists, nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=True)

#         targets = country_df[CFG.target].values
        
#         for i in range(min(len(country_df)-1, 1), min(len(country_df), CFG.n_neighbors)): # 200
#             for j in range(min(len(country_df)-1, 1), min(len(country_df), CFG.n_neighbors)): # 200
#                 if j > CFG.n_neighbors_in_cols and j != i: 
#                     continue
#                 elif j < i:
#                     country_df[f"d_near_{j}"] = dists[:, j]
#                     country_df[f"near_target_{j}"] = targets[nears[:, j]]
#                     for c in columns:
#                         country_df[f"near_{c}_{j}"] = country_df[c].values[nears[:, j]]
#                 elif j > i:
#                     country_df[f"d_near_{j-1}"] = dists[:, j]
#                     country_df[f"near_target_{j-1}"] = targets[nears[:, j]]
#                     for c in columns:
#                         country_df[f"near_{c}_{j-1}"] = country_df[c].values[nears[:, j]]
#                 else:
#                     country_df[f"d_near_0"] = dists[:, j]
#                     country_df[f"near_target_0"] = targets[nears[:, j]]
#                     for c in columns:
#                         country_df[f"near_{c}_0"] = country_df[c].values[nears[:, j]]    

#             for j in range(min(len(country_df), CFG.n_neighbors), CFG.n_neighbors):
#                 country_df[f"d_near_{j}"] = np.nan
#                 country_df[f"near_target_{j}"] = np.nan
#                 for c in columns:
#                     country_df[f"near_{c}_{j}"] = np.nan

#             if train_mode:
#                 df['target'] = df.apply(country_df, axis=1)
            
#             dfs_list.append(country_df.copy())
                            
#         res = pd.concat(dfs_list)
                            
#         if dfs is not None:
#             dfs = pd.concat([dfs, res])
#         else:
#             dfs = res.copy()
                            
#         del res
#         gc.collect()

#     return dfs.reset_index(drop=True)

In [13]:
def create_target(target, near_target):
    if target == near_target:
        return 1
    return 0

def add_neighbour_features(df, train_mode=True):
    dfs = None
    columns = ['id', 'name', 'address', 'city', 'state', # 'closest_city', 
           'zip', 'country', 'url', 'phone', 'categories', 'main_categories']

    for country, country_df in tqdm(df.groupby("country")):
        dfs_list = list()
        knn = KNeighborsRegressor(n_neighbors=min(len(country_df), CFG.n_neighbors), 
                                  metric='haversine', n_jobs=-1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=True)

        targets = country_df[CFG.target].values
        
        if len(country_df) == 1:
            country_df[f"d_near"] = np.nan
            country_df[f"near_target"] = np.nan
            for c in columns:
                country_df[f"near_{c}"] = np.nan
            if train_mode:
                country_df['target'] = np.nan
                
            dfs_list.append(country_df.copy())
            
        else:
            for i in range(1, min(len(country_df), CFG.n_neighbors)): # 200
                country_df[f"d_near"] = dists[:, i]
                country_df[f"near_target"] = targets[nears[:, i]]
                for c in columns:
                    country_df[f"near_{c}"] = country_df[c].values[nears[:, i]]

                if train_mode:
                    country_df['target'] = country_df.apply(create_target, axis=1)
                
                dfs_list.append(country_df.copy())
            
        res = pd.concat(dfs_list)
        
        del dists, nears, targets, dfs_list
        gc.collect()
        
        if dfs is not None:
            dfs = pd.concat([dfs, res])
        else:
            dfs = res.copy()
            
        del res
        gc.collect()
    
    return dfs.reset_index(drop=True)

def add_neighbour_features_low_mem(country_df):
    dfs_list = list()
    columns = ['id', 'name', 'address', 'city', 'state', # 'closest_city', 
           'zip', 'country', 'url', 'phone', 'categories', 'main_categories']

    
    knn = KNeighborsRegressor(n_neighbors=min(len(country_df), CFG.n_neighbors), 
                              metric='haversine', n_jobs=-1)
    knn.fit(country_df[['latitude','longitude']], country_df.index)
    dists, nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=True)

    targets = country_df[CFG.target].values

    if len(country_df) == 1:
        country_df[f"d_near"] = np.nan
        country_df[f"near_target"] = np.nan
        for c in columns:
            country_df[f"near_{c}"] = '' 
        country_df['target'] = [*map(create_target, country_df[CFG.target], country_df["near_target"])]

        dfs_list.append(country_df.copy())

    else:
        for i in range(1, min(len(country_df), CFG.n_neighbors)): # 200
            country_df[f"d_near"] = dists[:, i].astype(float)
            country_df[f"near_target"] = targets[nears[:, i]]
            for c in columns:
                country_df[f"near_{c}"] = country_df[c].values[nears[:, i]]
            country_df['target'] = [*map(create_target, country_df[CFG.target], country_df["near_target"])]
            
            dfs_list.append(country_df.copy())
            
    del knn, dists, nears, targets
    gc.collect()
    
    return pd.concat(dfs_list)

### Reset the kernel (to avoid OOM)

In [14]:
# %reset --aggressive -f

## Feature Engineering

In [15]:
%load_ext Cython

In [16]:
%%cython
def LCS(str S, str T):
    cdef int i, j
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

## Create distance features

In [17]:
# import Levenshtein
# import difflib

# def _add_distance_features(args):
#     _, df = args

#     columns = ['name', 'address', 'city', 'closest_city', 'state',
#            'zip', 'country', 'url', 'phone', 'categories', 'main_categories']

#     for i in tqdm(range(CFG.n_neighbors)):
#         for c in columns:
#             geshs = []
#             levens = []
#             jaros = []
#             lcss = []
#             for str1, str2 in df[[c, f"near_{c}_{i}"]].values.astype(str):
#                 if str1==str1 and str2==str2:
#                     geshs.append(difflib.SequenceMatcher(None, str1, str2).ratio())
#                     levens.append(Levenshtein.distance(str1, str2))
#                     jaros.append(Levenshtein.jaro_winkler(str1, str2))
#                     lcss.append(LCS(str(str1), str(str2)))
#                 else:
#                     geshs.append(-1)
#                     levens.append(-1)
#                     jaros.append(-1)
#             df[f"near_{c}_{i}_gesh"] = geshs
#             df[f"near_{c}_{i}_leven"] = levens
#             df[f"near_{c}_{i}_jaro"] = jaros
#             df[f"near_{c}_{i}_lcs"] = lcss
            
#             if not c in ['country', "phone", "zip"]:
#                 df[f"near_{c}_{i}_len"] = df[f"near_{c}_{i}"].astype(str).map(len)
#                 df[f"near_{c}_{i}_nleven"] = df[f"near_{c}_{i}_leven"] / df[[f"near_{c}_{i}_len", f"near_{c}_0_len"]].max(axis=1)
#                 df[f"near_{c}_{i}_nlcsi"] = df[f"near_{c}_{i}_lcs"] / df[f"near_{c}_{i}_len"]
#                 df[f"near_{c}_{i}_nlcs0"] = df[f"near_{c}_{i}_lcs"] / df[f"near_{c}_0_len"]
#     return df


# def add_distance_features(df):
#     processes = multiprocessing.cpu_count()-1
#     with multiprocessing.Pool(processes=processes) as pool:
#         dfs = pool.imap_unordered(_add_distance_features, df.groupby('country'))
#         dfs = tqdm(dfs)
#         dfs = list(dfs)
#     df = pd.concat(dfs)
#     return df


# if CFG.train:
#     train = pd.concat([
#         add_distance_features(train[train["set"]==0]), 
#         add_distance_features(train[train["set"]==1]),
#         add_distance_features(train[train["set"]==2]), 
#         add_distance_features(train[train["set"]==3]),
#         add_distance_features(train[train["set"]==4]), 
#         add_distance_features(train[train["set"]==5]),
#         add_distance_features(train[train["set"]==6]), 
#         add_distance_features(train[train["set"]==7])
#     ])

# if n_test_splits == 5:
#     test = pd.concat([
#         add_distance_features(test[test["set"]==0]), 
#         add_distance_features(test[test["set"]==1]),
#         add_distance_features(test[test["set"]==2]), 
#         add_distance_features(test[test["set"]==3]),
#         add_distance_features(test[test["set"]==4]), 
#         add_distance_features(test[test["set"]==5])
#     ])
# else:
#     test = pd.concat([
#             add_distance_features(test[test["set"]==0]), 
#             add_distance_features(test[test["set"]==1]),
#             add_distance_features(test[test["set"]==2]), 
#             add_distance_features(test[test["set"]==3]),
#             add_distance_features(test[test["set"]==4]), 
#             add_distance_features(test[test["set"]==5]),
#             add_distance_features(test[test["set"]==6]), 
#             add_distance_features(test[test["set"]==7])
#         ])

In [18]:
def seq_match_distance(str1, str2):
#     if str1 == '' or str1 == '':
#         return -1
    return difflib.SequenceMatcher(None, str1, str2).ratio()

def lev_distance(str1, str2):
#     if str1 == '' or str1 == '':
#         return -1
    return Levenshtein.distance(str1, str2)

def jw_distance(str1, str2):
#     if str1 == '' or str1 == '':
#         return -1
    return Levenshtein.jaro_winkler(str1, str2)

def lcs_distance(str1, str2):
#     if str1 == '' or str1 == '':
#         return -1
    return LCS(str1, str2)

def get_distances(str1, str2):
#     if str1 == '' or str1 == '':
#         return -1, -1, -1, -1
    return difflib.SequenceMatcher(None, str1, str2).ratio(), \
           Levenshtein.distance(str1, str2), \
           Levenshtein.jaro_winkler(str1, str2), \
           LCS(str1, str2)

def add_distance_features(df):

    columns = ['name', 'address', 'city', 'state', 'zip', # 'closest_city', 
               'country', 'url', 'phone', 'categories', 'main_categories']

    for c in columns:
        df[f"near_{c}_gesh"]=[*map(seq_match_distance, df[c], df[f"near_{c}"])]
        df[f"near_{c}_leven"]=[*map(lev_distance, df[c], df[f"near_{c}"])]
        df[f"near_{c}_jaro"]=[*map(jw_distance, df[c], df[f"near_{c}"])]
        df[f"near_{c}_lcs"]=[*map(lcs_distance, df[c], df[f"near_{c}"])]
#         df[f"near_{c}_gesh"], df[f"near_{c}_leven"], \
#         df[f"near_{c}_jaro"], df[f"near_{c}_lcs"] = zip(*map(get_distances, df[c], df[f"near_{c}"]))

        if not c in ['country', "phone", "zip"]:
            df[f"near_{c}_len"] = df[f"near_{c}"].astype(str).map(len)
            df[f"near_{c}_nleven"] = df[f"near_{c}_leven"] / df[f"near_{c}_len"]
            df[f"near_{c}_nlcsi"] = df[f"near_{c}_lcs"] / df[f"near_{c}_len"]
            df[f"near_{c}_nlcs0"] = df[f"near_{c}_lcs"] / df[f"near_{c}_len"]
            
    return df

## Reduce memory function

In [19]:
def reduce_mem(df, train_mode=False):
    for f in features:
        if f not in df.columns:
            df[f] = np.nan

    if train_mode:
        df = df[features + [CFG.target, "target", "id"] + ["near_id"]]
        df["target"] = df["target"].fillna(0)
    else:    
        df = df[features + ["id"] + ["near_id"]]


    df[features] = df[features].astype(np.float16)
    df["near_id"] = df["near_id"].fillna('')

    gc.collect()
        
    return df

## Set features to predict on

In [20]:
features = list()

columns = ['name', 'address', 'city', 'state', # 'closest_city', 
           'zip', 'country', 'url', 'phone', 'categories', 'main_categories']
features.append(f"d_near")
for c in columns:        
    features += [f"near_{c}_gesh", f"near_{c}_jaro", f"near_{c}_lcs"]
    if c in ['country', "phone", "zip"]:
        features += [f"near_{c}_leven"]
    else:
        features += [f"near_{c}_len", f"near_{c}_nleven", f"near_{c}_nlcsi", f"near_{c}_nlcs0"]

## Process the dataset

In [None]:
%%time

def add_neighbours(args):
    country, df = args
    columns = ['name', 'address', 'city', 'state', 'zip',  # 'closest_city', 
               'country', 'url', 'phone', 'categories', 'main_categories']
    df = add_neighbour_features_low_mem(df)
    
    return df
    
processes = multiprocessing.cpu_count()-1
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(add_neighbours, train.groupby('country', sort=False))
    dfs = tqdm(dfs, total=num_countries)
    dfs = list(dfs)
    

del train
gc.collect()

  0%|          | 0/222 [00:00<?, ?it/s]

## Split list of dasets on smaller chunks to speed up process and prevent OOM

In [None]:
%%time

threshold = 250000

def split_df(df, idx):
    n_splits = int(df.shape[0] / threshold) + 1
    kf = KFold(n_splits=n_splits)
    for trn_idx, val_idx in kf.split(df):
        dfs.append(df.loc[val_idx])
    
idx = 0
while idx < len(dfs):
    if dfs[idx].shape[0] > threshold:
        dfs[idx] = dfs[idx].reset_index(drop=True)
        split_df(dfs[idx], idx)
        dfs.pop(idx)
    else:
        idx += 1
    

## Add distance features and reduce the dataset

In [None]:
%%time

def add_distance_and_reduce(df):
    df = add_distance_features(df)
    df = reduce_mem(df, train_mode=True)
    return df

l = len(dfs)

with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(add_distance_and_reduce, dfs)
    dfs = tqdm(dfs, total=l)
    dfs = list(dfs)
    
train = pd.concat(dfs).reset_index(drop=True)
del dfs

gc.collect()

## Balance the dataset

In [None]:
# select indexes of all positive targets
# and select indexes of all ids that don't have postive targets at all
pos_ids = train.loc[train['target'] == 1, 'id'].unique()
pos_idxs = train[train['target'] == 1].index
neg_idxs = train.loc[~train['id'].isin(pos_ids), 'id'].drop_duplicates(keep='first').index

# additionally select indexes of ids that have negative target
# but may have positive target 
neg_idxs1 = train[train['target'] == 0].index
neg_idxs1 = neg_idxs1.difference(neg_idxs)
neg_idxs1 = np.random.choice(neg_idxs1, size=len(pos_idxs)-len(neg_idxs))

# and add them to negative indexes, so the total number of positive and negative indexes are equal
neg_idxs = neg_idxs.union(neg_idxs1)

# select these positive and negative indexes from the dataset
train = train.loc[pos_idxs.union(neg_idxs)]

gc.collect()

# Train

In [None]:
import lightgbm as lgb
from scipy.misc import derivative


def fit_lgbm(X, y, params=None, es_rounds=50, seed=42, N_SPLITS=5, 
             n_class=None, model_dir=None, folds=None):
    cat_features = X.select_dtypes(include='object').columns
    
    models = []
    oof = np.zeros(len(y), dtype=np.float64)
    
    for i in tqdm(range(CFG.n_splits)):
        print(f"== fold {i} ==")
        trn_idx = folds != i
        val_idx = folds == i
        
        train_dataset = lgb.Dataset(X.iloc[trn_idx], y.iloc[trn_idx], categorical_feature=cat_features)
        valid_dataset = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx], categorical_feature=cat_features)

        
        focal_loss = lambda x,y: focal_loss_lgb(x, y, alpha=1., gamma=1.)
        focal_loss_eval = lambda x,y: focal_loss_lgb_eval_error(x, y, alpha=1., gamma=1.)
        
        if model_dir is None:
            model = lgb.train(
                params,
                train_set = train_dataset, 
                valid_sets = [train_dataset, valid_dataset], 
                callbacks = [lgb.log_evaluation(100), 
                             lgb.early_stopping(stopping_rounds=es_rounds)],
            )
        else:
            with open(f'{model_dir}/lgbm_fold{i}.pkl', 'rb') as f:
                model = pickle.load(f)
            
        pred = model.predict(X.iloc[val_idx])
        oof[val_idx] = pred
        models.append(model)
        
        file = f'lgbm_fold{i}.pkl'
        pickle.dump(model, open(file, 'wb'))
        print()

    cv = (np.round(oof) == y).mean()
    print(f"CV-accuracy: {cv}")

    return oof, models

def inference_lgbm(models, feat_df):
    pred = np.array([model.predict(feat_df) for model in models])
    pred = np.mean(pred, axis=0)
    return pred

## Split dataset for train

In [None]:
train = train.reset_index(drop=True)

kf = StratifiedGroupKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
for i, (trn_idx, val_idx) in tqdm(enumerate(kf.split(train, train["target"], train["id"]))):
    train.loc[val_idx, "fold"] = i

## Train model

In [None]:
warnings.filterwarnings("ignore", module="lightgbm")

params = {
    'seed': CFG.seed,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'objective': 'binary',
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'max_bin': 200,
    'max_depth': 7,   
    'num_leaves': 35, 
    'min_data_in_leaf': 25,
    'n_estimators': 5000, 
    'colsample_bytree': 0.9,
    'verbose': -1,
}

oof, models = fit_lgbm(train[features], train["target"].astype(int), 
                       params=params, n_class=int(train["target"].max() + 1), 
                       N_SPLITS=CFG.n_splits, folds=train["fold"].values)

## Fuctions for postprocessing and validation

In [None]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    id2poi = get_id2poi(input_df)
    poi2ids = get_poi2ids(input_df)
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    return scores.mean()

def postprocess(df):
    id2match = dict(zip(df["id"].values, df["matches"].str.split()))

    for match in df["matches"].values:
        match = match.split()
        if len(match) == 1:        
            continue

        base = match[0]
        for m in match[1:]:
            if not base in id2match[m]:
                id2match[m].append(base)
    df["matches"] = df["id"].map(id2match).map(" ".join)
    
    return df 

def get_matches(df, preds):
    near_id = df["near_id"].values
    matches = []

    for df_id, pred, near_idx in zip(df["id"], preds, near_id):
        idx = np.round(pred)
        if idx == 1:
            matches.append(df_id + " " + near_idx)
        else:
            matches.append(df_id)
    
    df['matches'] = matches
    df = postprocess(df)
    
    return df[['id', 'matches', 'point_of_interest']]

##  Predict matches and postprocess them

In [None]:
%%time

train = get_matches(train, oof)
train = train.drop_duplicates()
print(f"CV: {get_score(train):.6f}")

gc.collect()

In [None]:
# Baseline:
# acc: 0.9013
# CV: 0.861652

# Add closest_city from city dataframe
# acc: 0.90581
# CV: 0.863641

# Don't replace nans with ''
# acc: 0.909
# CV: 0.865036

# Further ideas

- do we need url/zip/phone?
- add ntlk.edit_distance to your features
- change KNN to the variant, that was proposed in this notebook: https://www.kaggle.com/code/ragnar123/flm-xlmroberta-inference-baseline
- add manhattan distance and euqlidian distance
- increase number of nearest neighbours to a very high value (like 50-100-200), so you will be able to find more matches; don't increase number of neighbours in the table to avoid OOM
- add KNN by country


- how to handle missing data https://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python
- mean/median/std encode features
- use feature generation and selection from this notebook https://www.kaggle.com/code/aerdem4/foursquare-gpu-accelerated-lofo-feature-importance
- use Cat2Vec to calculate categories similarity https://www.kaggle.com/code/aerdem4/foursquare-cat2vec/notebook


- Optuna!



- try XLMRoberta


- you can use dict to store key-poi_id pairs and store only keys to save the memory