In [1]:
import os
import gc
import re
import sys
import math
import json
import time
import random
import joblib
import pickle
import warnings
import difflib
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
from glob import glob
from pathlib import Path
from unidecode import unidecode
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.metrics.pairwise import haversine_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, StratifiedGroupKFold

def reduce_mem_usage(df, silent=False, allow_categorical=True, float_dtype="float32"):
    """
    Downcast a DataFrame (or a single Series) to smaller dtypes to save memory.

    Numeric columns are passed through ``pd.to_numeric(..., downcast="integer")``;
    whatever is still a float afterwards is cast to ``float_dtype``. Sparse and
    datetime columns are returned untouched. Other non-numeric columns are kept
    as they are when ``allow_categorical`` is True; otherwise they are replaced
    by their ``factorize()`` integer codes, which are then downcast as well.

    Parameters
    ----------
    df : pd.DataFrame or pd.Series
        Object to shrink. A DataFrame is modified in place, column by column.
    silent : bool
        When False, print the memory footprint before and after.
    allow_categorical : bool
        Keep non-numeric columns as they are (True) or factorize them (False).
    float_dtype : str or dtype
        Target dtype for float columns.

    Returns
    -------
    pd.DataFrame or pd.Series
        The downcast object (the same DataFrame instance for DataFrame input).
    """
    def _downcast_numeric(series, allow_categorical=allow_categorical):
        """
        Downcast a numeric series into either the smallest possible int dtype or a specified float dtype.
        """
        dtype = series.dtype
        # isinstance check instead of the deprecated pd.api.types.is_sparse
        if isinstance(dtype, pd.SparseDtype):
            return series
        if not pd.api.types.is_numeric_dtype(dtype):
            if pd.api.types.is_datetime64_any_dtype(dtype) or allow_categorical:
                return series
            # Replace the values by integer codes, then shrink those codes.
            codes, _ = series.factorize()
            return _downcast_numeric(pd.Series(data=codes, index=series.index))
        series = pd.to_numeric(series, downcast="integer")
        if pd.api.types.is_float_dtype(series.dtype):
            series = series.astype(float_dtype)
        return series

    def _memory_mb(obj):
        """Memory footprint of a Series/DataFrame in MiB."""
        return np.sum(obj.memory_usage()) / 1024 ** 2

    if silent is False:
        start_mem = _memory_mb(df)
        print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    if df.ndim == 1:
        df = _downcast_numeric(df)
    else:
        for col in df.columns:
            # Plain column assignment replaces the column (and its dtype).
            # `df.loc[:, col] = ...` writes into the existing column in place on
            # current pandas versions, which silently keeps the old, wide dtype.
            df[col] = _downcast_numeric(df[col])
    if silent is False:
        end_mem = _memory_mb(df)
        print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
        # Guard the percentage against a zero baseline.
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print("Decreased by {:.1f}%".format(decrease))

    return df

def shrink_mem_new_cols(matrix, oldcols=None, allow_categorical=True, silent=False):
    """
    Call reduce_mem_usage on the columns which have not yet been optimized.

    Parameters
    ----------
    matrix : pd.DataFrame
        Frame to shrink; it is modified in place.
    oldcols : index-like or None
        Columns that were already downcast by an earlier call. None means
        every column is treated as new.
    allow_categorical, silent
        Forwarded to reduce_mem_usage.

    Returns
    -------
    (pd.DataFrame, pd.Index)
        The frame and its current columns, to be passed as ``oldcols`` next time.
    """
    if oldcols is not None:
        newcols = matrix.columns.difference(oldcols)
    else:
        newcols = matrix.columns
    reduced = reduce_mem_usage(matrix.loc[:, newcols], allow_categorical=allow_categorical,
                               silent=silent)
    # Assign column by column so each column takes over the reduced dtype;
    # `matrix.loc[:, newcols] = reduced` writes the values back into the
    # existing (wide) columns on current pandas versions.
    for col in newcols:
        matrix[col] = reduced[col]
    oldcols = matrix.columns  # This is used to track which columns have already been downcast
    return matrix, oldcols

# Notebook-wide presentation defaults: plot font size and theme, silenced
# warnings, and wide pandas display limits.
plt.rcParams.update({"font.size": 13})
warnings.simplefilter('ignore')

sns.set_style("darkgrid")

pd.options.display.max_rows = 200
pd.options.display.max_columns = 600

In [2]:
# def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
#     floats = df.select_dtypes(include=['float64']).columns.tolist()
#     df[floats] = df[floats].apply(pd.to_numeric, downcast='float')
#     return df


# def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
#     ints = df.select_dtypes(include=['int64']).columns.tolist()
#     df[ints] = df[ints].apply(pd.to_numeric, downcast='integer')
#     return df

## Config

In [3]:
# Central configuration for the notebook, exposed as attributes (CFG.seed, ...).
CFG = Namespace(**{
    # Switches between the Kaggle input paths and the local ones.
    "kaggle": False,
    # Seed for the global RNGs and for the debug sample.
    "seed": 42,
    # When True, only a 10000-row sample of the data is used.
    "debug": False,
    # NOTE(review): `validate`, `inference` and `target` are not read in the
    # visible part of the notebook - confirm where they are consumed.
    "validate": False,
    "inference": True,
    "target": "point_of_interest",
    # Number of nearest neighbours retrieved per record as match candidates.
    "n_neighbors": 20,
    # Number of chunks the pair table is processed and saved in.
    "n_splits": 5,
    # Output directory for the generated parquet files.
    "train_path": 'train_dataset',
    # Columns compared pairwise by the string-distance features.
    "feat_columns": [
        'name', 'address', 'city', 'state', 'zip',  # 'closest_city', 'main_categories'
        'name_encoded', 'address_encoded', 'city_encoded', 'state_encoded',
        'url', 'phone', 'categories', 'country',
    ],
    # Columns that additionally get a TF-IDF similarity feature.
    "vec_columns": [
        'name', 'name_encoded', 'categories', 'address', 'state_encoded',
        'city', 'country',
    ],
    # Feature names that are skipped during feature generation.
    "bad_features": [
        'name_len_diff', 'state_jaro', 'state_encoded_lcs',
        'address_encoded_gesh', 'state_gesh', 'city_encoded_nlcsk',
        'categories_nlcsk', 'city_encoded_sim', 'state_leven',
        'address_encoded_nlcsk', 'name_nleven', 'city_encoded_gesh',
        'state_encoded_leven', 'city_encoded_jaro', 'country_lcs',
        'city_encoded_len_diff', 'state_nlcs', 'url_jaro',
    ],
})

def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and export PYTHONHASHSEED."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, using the seed from the config.
seed_everything(CFG.seed)

## Load the data

In [4]:
## Data load
# The competition files sit under a different root on Kaggle than locally.
data_root = '../input/foursquare-location-matching' if CFG.kaggle else 'foursquare_location_matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))

if CFG.debug:
    # Debug mode: work on a reproducible 10000-row sample with a fresh index.
    data = data.sample(n = 10000, random_state = CFG.seed).reset_index(drop = True)

## Sort categories

In [5]:
def sort_categories(cat):
    """Return the ', '-separated category string with its parts sorted; NaN stays NaN."""
    is_missing = cat != cat  # only NaN is unequal to itself
    if is_missing:
        return np.nan
    parts = cat.split(', ')
    parts.sort()
    return ', '.join(parts)

# Normalise the order of the categories so equal sets of categories compare equal as strings.
data['categories'] = data['categories'].apply(sort_categories)

## Transliterate text columns to ASCII

In [6]:
def decode(col):
    """Pass a value through `unidecode`; NaN is returned as NaN."""
    # NaN is the only value that is unequal to itself.
    return np.nan if col != col else unidecode(col)

# Add a `decode`d copy of every free-text location column.
for _encoded_col, _source_col in {
    'name_encoded': 'name',
    'address_encoded': 'address',
    'city_encoded': 'city',
    'state_encoded': 'state',
}.items():
    data[_encoded_col] = data[_source_col].apply(decode)

## Clean name

In [7]:
def clean_name(col):
    """
    Normalise a place name: lowercase it, strip commas, periods and
    apostrophes, and remove the article "the".

    Only the standalone word "the " is removed. A plain substring replace
    would also eat the end of words such as "bathe " or "soothe ".

    Parameters
    ----------
    col : str or NaN
        Raw name.

    Returns
    -------
    str or NaN
        The cleaned name; NaN input is returned as NaN.
    """
    if col != col:  # NaN check
        return np.nan
    text = col.lower()
    for char in (",", ".", "'"):
        text = text.replace(char, "")
    # \b anchors the match at a word start so only the article is dropped.
    return re.sub(r"\bthe ", "", text)

# train['name'] = train['name'].apply(clean_name)

## Make same categories for the same names

In [8]:
# brings shops with similar names to one name
# data.loc[data['name']=='mc donalds', 'name'] = 'mcdonalds'

## Add main category to the train dataset

In [9]:
# cat_freq = dict()
# Category tokens that get_main_category skips when it picks the main category.
stop_words = ['/', '&', 'or', 'High', 'Miscellaneous', 'Fast', 'Other', 'Asian', 'Chinese', 'Event', 
              'Great', 'Noodle', 'Burger', 'Seafood', 'Breakfast', 'Ice', 'Diners', 'Cream', 'Indonesian', 
              'Thai', "Women's", 'Fried', 'Snack', 'Tea', 'Mexican', 'Nail', 'Sushi', 'Middle', 'Korean', 
              'Gift', 'Drink', 'Pet', 'Turkish', "Men's", 'Indian', 'Malay', 'Cocktail', 'Donut', 'Box', 
              'Condos)', 'Residential', 'Convenience', 'Gas', 'General', 'Bus', 'Pizza', 'Spaces', 'Mobile',
              'Phone', 'Academic', 'Japanese', 'Business', 'Shoe', 'Italian', 'American', 'Home', 'Auto', 
              'Furniture', 'Cosmetics', 'Sandwich', 'Dessert', 'Car', 'Arts', 'Financial', 'Legal', 'BBQ',
              'Hardware', 'Video', 'Music', 'Art', 'Student', 'Jewelry', 'Historic', 'Travel', 'Washes',
              'Beer', 'Arcades', 'Bike', 'Lookouts', 'Scenic', 'Rental', 'Accessories', 'Repairs', 'Discount', 
              'Optical', 'Bodegas', 'Big', 'Assisted', 'Living', 'Athletics', 'Agencies', 'Locations', 'Trails', 
              'Bed', 'Breakfasts', 'Wine', 'Real', 'Elementary', 'Theme', 'Golf', 'Rest',  'Photography', 
              'Nightlife', 'Courses', 'Convention', 'Eastern', 'Concert', 'Conference', 'Startups', 'Tech', 
              'Meeting', 'French', 'Supplies', 'Events', 'Sake', 'Dog', 'Ramen', 'City', 'Juice', 'Science',
              'Liquor', 'Lawyers', 'Insurance', 'Flower', 'Toy', 'Rentals', 'Paper', 'Flea', 'Bases', 'Baseball', 
              'Karaoke', 'Kids', 'Design', 'Farmers', 'Repair', 'Technology', 'Wards', 'Water', 'Supply', 
              'Filipino', 'Piers', 'Salad', 'Mattress', 'Print', 'Wings', 'Engineering', 'Non-Profits', 
              'Gastropubs', 'Bistros', 'Hot', 'Vietnamese', 'Hookah', 'Candy', 'Coffee', 'Electronics',
              'Department', 'Clothing', 'Trucks', 'Chicken', 'Movie', 'Health', 'Soccer', 'Crafts', 
              'Game', 'Community', 'Food', 'College', 'Sporting', 'Beauty', 'Ferries', 'Soup', 'Veterinarians', 
              'Basketball', 'Light', 'Rail', 'Taco', 'Classrooms', 'Shopping', 'Developments', 'Train', 'Performing',
              'Administrative', 'Lingerie', 'Dive', 'Storage', 'Office', 'Landscaping', 'Residence', 'Sports',
              'Goods', 'Dealerships', 'Grocery', 'Workshops', 'History'
             ]


def get_categories(category):
    """Map a category token to its canonical spelling; unknown tokens are returned unchanged."""
    # (aliases, canonical name) pairs, checked in order.
    canonical_names = (
        (('Auto',), 'Automotive'),
        (('Hotel', 'Motels', 'Hostels'), 'Hotels'),
        (('Courthouses',), 'Court'),
        (('College',), 'Colleges'),
        (('Cafés',), 'Cafes'),
        (("Doctor's", "Dentist's", "Doctors"), 'Medical'),
        (('(Apartments',), 'Apartments'),
    )
    for aliases, canonical in canonical_names:
        if category in aliases:
            return canonical
    return category

# Category frequency table; the file sits under a different root on Kaggle.
cat_freq = pd.read_csv(
    '../input/foursquare-main-categories/cat_freq.csv' if CFG.kaggle
    else 'foursquare_main_categories/cat_freq.csv',
    index_col='Unnamed: 0',
)

# category -> frequency lookup used by get_main_category
cat_freq_dict = {
    category: frequency
    for category, frequency in zip(cat_freq['category'], cat_freq['frequence'])
}

def get_main_category(category):
    """
    Pick the most frequent category token of a categories string.

    The string is split on ', ' and ' '. Tokens listed in `stop_words` or
    ending in 'an' are skipped; the remaining tokens are normalised with
    `get_categories` and looked up in `cat_freq_dict`. Returns the token with
    the highest frequency, or NaN when the input is NaN or no token has a
    positive frequency.
    """
    if category != category:  # NaN input
        return np.nan

    best_category, best_freq = np.nan, 0
    for token in re.split(', | ', category):
        if token in stop_words or token.endswith('an'):
            continue
        token = get_categories(token)
        token_freq = cat_freq_dict.get(token, 0)
        if token_freq > best_freq:
            best_category, best_freq = token, token_freq

    return best_category

# data['main_categories'] = data['categories'].apply(get_main_category)

## Fill the missing data with data from outer sources

In [10]:
# states = pd.read_csv('states.csv', index_col='Unnamed: 0')
# Reference table of cities: keep the name, the coordinates and the country
# code, renamed to match the column names used in `data`.
cities = (
    pd.read_csv('additional_data/cities.csv', encoding = "ISO-8859-1")
    [['asciiname', 'latitude', 'longitude', 'country code']]
    .rename(columns={'asciiname': 'city', 'country code': 'country'})
)

# starbucks = pd.read_csv('additional_data/starbucks.csv', index_col='Unnamed: 0')
# starbucks = starbucks[['countryCode', 'latitude', 'longitude', 'streetAddressLine2', 'city']]
# starbucks.rename({'countryCode': 'country', 'streetAddressLine2': 'address'}, axis=1, inplace=True)
# starbucks.head()

# data['country'] = data['country'].fillna('NA')
# data['closest_city'] = ''

# column to fill -> reference table it is filled from
geoname_dict = dict(city=cities)

def fill_the_missing_data(args):#, df_dists):
    """
    Add a ``closest_<column>`` column for every reference table in `geoname_dict`.

    For each table, the rows of the same country are indexed by coordinates and
    every record of `country_df` receives the value of its nearest reference
    row (haversine metric).

    Changes compared with the previous implementation:
    * countries with a single reference row are now filled too (they used to be
      skipped because the result was stored in a second neighbour column);
    * coordinates are converted to radians, which is what scikit-learn's
      haversine metric expects;
    * the result is assigned in one vectorised step instead of row by row.

    Parameters
    ----------
    args : tuple
        ``(country, country_df)`` as produced by ``df.groupby('country')``.
        `country_df` needs 'latitude' and 'longitude' columns in degrees.

    Returns
    -------
    pd.DataFrame
        `country_df` with the new columns added (modified in place).
    """
    country, country_df = args
    if len(country_df) == 0:
        return country_df

    coord_cols = ['latitude', 'longitude']
    query_points = np.radians(country_df[coord_cols].to_numpy(dtype=float))

    for c in tqdm(list(geoname_dict.keys())):
        geoname_df = geoname_dict[c]
        geoname_df = geoname_df[geoname_df['country'] == country]
        if len(geoname_df) == 0:
            # No reference rows for this country: leave the column out.
            continue

        knn = NearestNeighbors(n_neighbors=1, metric='haversine', n_jobs=-1)
        knn.fit(np.radians(geoname_df[coord_cols].to_numpy(dtype=float)))
        nearest = knn.kneighbors(query_points, return_distance=False)[:, 0]

        # `nearest` holds positions within geoname_df, hence the positional lookup.
        country_df[f"closest_{c}"] = geoname_df[c].to_numpy()[nearest]

    return country_df
    
    
# Number of distinct countries; the disabled multiprocessing run below uses it as the progress-bar total.
num_countries = data['country'].nunique()
    
# processes = multiprocessing.cpu_count()
# with multiprocessing.Pool(processes=processes) as pool:
#     dfs = pool.imap_unordered(fill_the_missing_data, train.groupby('country', sort=False))
#     dfs = tqdm(dfs, total=num_countries)
#     dfs = list(dfs)
    
# train = pd.concat(dfs).reset_index(drop=True)

# del cities

# gc.collect()

## Convert all object columns to lowercase

In [11]:
def to_lower(df):
    """
    Lowercase every non-numeric column of `df` except "id".

    The affected columns are converted to strings first. Numeric columns (for
    example the coordinates) are left untouched so they keep their dtype
    instead of being turned into strings.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to normalise; it is modified in place.

    Returns
    -------
    pd.DataFrame
        The same frame, for chaining.
    """
    for c in df.columns:
        if c == "id" or pd.api.types.is_numeric_dtype(df[c]):
            continue
        df[c] = df[c].astype(str).str.lower()
    return df
    
# Lowercase the columns of `data` (to_lower skips "id").
data = to_lower(data)

## Search Candidates Function

In [12]:
def _neighbor_pairs(ids, dists, nears, dist_col, rank_col):
    """Flatten kneighbors() output into one (id, match_id, distance, rank) row per pair.

    Rows are ordered rank by rank (all records for rank 0, then rank 1, ...).
    """
    n_rows, n_ranks = nears.shape
    return pd.DataFrame({
        'id': np.tile(ids, n_ranks),
        'match_id': ids[nears.T.ravel()],
        dist_col: dists.T.ravel(),
        rank_col: np.repeat(np.arange(n_ranks, dtype=np.int64), n_rows),
    })


def recall_knn(df, n_neighbors=10):
    """
    Build the table of candidate pairs from coordinate nearest neighbours.

    Two searches are run and outer-merged on ('id', 'match_id'):
    * per country, with the haversine metric -> 'kdist_country', 'kneighbors_country';
    * over the whole frame, with NearestNeighbors' default metric -> 'kdist', 'kneighbors'.

    Parameters
    ----------
    df : pd.DataFrame
        Needs 'id', 'country', 'latitude' and 'longitude' columns.
    n_neighbors : int
        Neighbours requested per record. Both searches clamp it to the number
        of available rows (the global search used to fail on frames smaller
        than `n_neighbors`).

    Returns
    -------
    pd.DataFrame
        One row per candidate pair. A pair found by only one of the two
        searches has NaN in the columns of the other.

    NOTE(review): scikit-learn documents the haversine metric as taking
    [latitude, longitude] in radians; the coordinates are passed here exactly
    as stored in `df`. Converting them would change the generated features, so
    it is left as is - confirm the intended units.
    """
    coord_cols = ['latitude', 'longitude']

    print(80*'=')
    print('Start KNN grouped by country')
    country_frames = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        neighbors = min(len(country_df), n_neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                  metric = 'haversine',
                                  n_jobs = -1)
        knn.fit(country_df[coord_cols], country_df.index)
        dists, nears = knn.kneighbors(country_df[coord_cols], return_distance = True)

        country_frames.append(
            _neighbor_pairs(country_df['id'].values, dists, nears,
                            'kdist_country', 'kneighbors_country')
        )

    train_df_country = pd.concat(country_frames)
    del country_frames

    print('Start KNN for the whole dataset')
    # Same clamp as in the per-country search.
    global_neighbors = min(len(df), n_neighbors)
    knn = NearestNeighbors(n_neighbors = global_neighbors)
    knn.fit(df[coord_cols], df.index)
    dists, nears = knn.kneighbors(df[coord_cols])

    train_df = _neighbor_pairs(df['id'].values, dists, nears, 'kdist', 'kneighbors')
    train_df = train_df.merge(train_df_country,
                              on = ['id', 'match_id'],
                              how = 'outer')
    del train_df_country

    return train_df

## Reset the kernel (to avoid OOM)

In [13]:
# %reset --aggressive -f

## Split the dataset into two parts to imitate the test data distribution

In [14]:
def analysis(df):
    """Print row count, unique ids, unique POIs and the mean number of ids per POI."""
    ids_per_poi = df.groupby('point_of_interest')['id'].count()
    report = (
        ('Num of data: %s', len(df)),
        ('Num of unique id: %s', df['id'].nunique()),
        ('Num of unique poi: %s', df['point_of_interest'].nunique()),
        ('Mean num of unique poi: %s', ids_per_poi.mean()),
    )
    for template, value in report:
        print(template % value)

## Data split
# Two folds grouped by POI, so all records of one point_of_interest end up in
# the same half; 'set' stores the fold in which a row is a validation row.
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(
    kf.split(data, data['point_of_interest'], data['point_of_interest'])
):
    data.loc[val_idx, 'set'] = i

print('Num of data: %s' % len(data))
print(data['set'].value_counts())

train_data, valid_data = data[data['set'] == 1], data[data['set'] == 0]

print('Train data: ')
analysis(train_data)
print('Valid data: ')
analysis(valid_data)

train_ids = train_data['id'].unique().tolist()
valid_ids = valid_data['id'].unique().tolist()

# ids of both halves, kept together in one dict
tv_ids_d = {'train_ids': train_ids, 'valid_ids': valid_ids}


Num of data: 1138812
1.0    569406
0.0    569406
Name: set, dtype: int64
Train data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369987
Mean num of unique poi: 1.5389892077289202
Valid data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369985
Mean num of unique poi: 1.5389975269267673


## Find closest neighbours

In [15]:
# find nearest neighbours
train_data = recall_knn(train_data, CFG.n_neighbors)
valid_data = recall_knn(valid_data, CFG.n_neighbors)

# create train target feature
data = data.set_index('id')


def _same_poi_label(pairs):
    """Return an int8 array: 1 where `id` and `match_id` share a point_of_interest, else 0."""
    poi = data.loc[pairs['id'].tolist()]['point_of_interest'].values
    match_poi = data.loc[pairs['match_id'].tolist()]['point_of_interest'].values
    return np.array(poi == match_poi, dtype = np.int8)


train_data['label'] = _same_poi_label(train_data)
valid_data['label'] = _same_poi_label(valid_data)

gc.collect()

print('Num of unique train id: %s' % train_data['id'].nunique())
print('Num of train data: %s' % len(train_data))
print('Pos rate: %s' % train_data['label'].mean())

print('Num of unique valid id: %s' % valid_data['id'].nunique())
print('Num of valid data: %s' % len(valid_data))
print('Pos rate: %s' % valid_data['label'].mean())

# restore 'id' as a regular column
data = data.reset_index('id')

Start KNN grouped by country


  0%|          | 0/211 [00:00<?, ?it/s]

Start KNN for the whole dataset
Start KNN grouped by country


  0%|          | 0/210 [00:00<?, ?it/s]

Start KNN for the whole dataset
Num of unique train id: 569406
Num of train data: 13928057
Pos rate: 0.07242905453359359
Num of unique valid id: 569406
Num of valid data: 13917577
Pos rate: 0.0721093190287361


## TF-IDF

In [16]:
def tf_idf_vectorize(df):
    """
    Fit one TF-IDF matrix per column in CFG.vec_columns on the global `data`.

    NOTE(review): the `df` argument is never read; everything is computed from
    the module-level `data` frame - confirm whether that is intended.

    Returns
    -------
    (dict, dict, sparse matrix)
        id -> index label of `data`, column name -> TF-IDF matrix, and the
        matrix of the last vectorised column.
    """
    # lookup from record id to its index label in `data`
    id2index_d = {rec_id: row for rec_id, row in zip(data['id'].values, data.index)}

    # one vectorizer per text column
    tfidf_d = {}
    for col in CFG.vec_columns:
        tv_fit = TfidfVectorizer().fit_transform(data[col].fillna('nan'))
        tfidf_d[col] = tv_fit

    return id2index_d, tfidf_d, tv_fit

# Build the id lookup and the TF-IDF matrices for both halves.
# NOTE(review): tf_idf_vectorize does not read its argument (it works on the
# global `data`), so both calls compute the same lookup and matrices - confirm
# whether that is intended.
train_id2index_d, train_tfidf_d, train_tv_fit = tf_idf_vectorize(train_data)
valid_id2index_d, valid_tfidf_d, valid_tv_fit = tf_idf_vectorize(valid_data)

## Feature Engineering

In [17]:
%load_ext Cython

In [18]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T."""
    cdef int i, j
    # dp[i][j] is the LCS length of the prefixes S[:i] and T[:j]; row 0 and column 0 stay 0.
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            # Extend the diagonal by one on a character match, otherwise carry over the
            # best neighbouring cell. dp[i + 1][j + 1] has not been written yet at this
            # point, so it only contributes its initial 0 to the max.
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

## Add distance features

In [19]:
def seq_match_distance(str1, str2):
    """difflib SequenceMatcher ratio of the two strings; NaN if either is the 'nan' placeholder."""
    if 'nan' in (str1, str2):
        return np.nan
    matcher = difflib.SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.ratio()

def lev_distance(str1, str2):
    """Levenshtein distance of the two strings; NaN if either is the 'nan' placeholder."""
    either_missing = str1 == 'nan' or str2 == 'nan'
    return np.nan if either_missing else Levenshtein.distance(str1, str2)

def jw_distance(str1, str2):
    """Result of Levenshtein.jaro_winkler for the two strings; NaN if either is the 'nan' placeholder."""
    for value in (str1, str2):
        if value == 'nan':
            return np.nan
    return Levenshtein.jaro_winkler(str1, str2)

def lcs_distance(str1, str2):
    """Longest-common-subsequence length (via LCS); NaN if either is the 'nan' placeholder."""
    if not (str1 != 'nan' and str2 != 'nan'):
        return np.nan
    first, second = str(str1), str(str2)
    return LCS(first, second)

def add_distance_features(df, id2index_d, tfidf_d, tv_fit):
    """
    Add pairwise text-similarity features for every column in CFG.feat_columns.

    For each column the value of the `id` record is compared with the value of
    the `match_id` record:
    * ``<col>_sim``   - TF-IDF similarity (only for columns in CFG.vec_columns);
    * ``<col>_gesh``  - difflib SequenceMatcher ratio;
    * ``<col>_leven`` - Levenshtein distance;
    * ``<col>_jaro``  - Jaro-Winkler score;
    * ``<col>_lcs``   - longest-common-subsequence length;
    * for all columns except 'phone', 'country' and 'zip' also the length-based
      ``<col>_len_diff``, ``<col>_nleven``, ``<col>_nlcsk`` and ``<col>_nlcs``.
    Feature names listed in CFG.bad_features are skipped.

    Changes compared with the previous implementation (same output):
    * the id -> row lookups are built once instead of once per TF-IDF column;
    * only the needed column is read from `data` (``data.loc[ids, col]``)
      instead of materialising every column first;
    * string lengths are kept in local Series instead of temporary columns that
      were added to `df` and dropped again.

    Parameters
    ----------
    df : pd.DataFrame
        Candidate pairs with 'id' and 'match_id' columns; modified in place.
    id2index_d : dict
        Record id -> row position in the TF-IDF matrices.
    tfidf_d : dict
        Column name -> TF-IDF matrix.
    tv_fit
        Unused; kept so existing callers keep working.

    Returns
    -------
    pd.DataFrame
        `df` with the feature columns added.

    Relies on the module-level `data` frame being indexed by record id.
    """
    ids = df['id']
    match_ids = df['match_id']
    indexs = match_indexs = None  # resolved lazily, once, for all TF-IDF columns

    for col in tqdm(CFG.feat_columns):
        if col in CFG.vec_columns:
            if indexs is None:
                indexs = [id2index_d[i] for i in ids]
                match_indexs = [id2index_d[i] for i in match_ids]
            # Row-wise dot product of the two TF-IDF vectors of each pair.
            col_tfidf = tfidf_d[col]
            pair_products = col_tfidf[indexs].multiply(col_tfidf[match_indexs])
            df[f'{col}_sim'] = np.asarray(pair_products.sum(axis = 1)).ravel()

        # set current POI values and values of matched POI
        col_values = data.loc[ids, col].values.astype(str)
        match_col_values = data.loc[match_ids, col].values.astype(str)

        if f"{col}_gesh" not in CFG.bad_features:
            df[f"{col}_gesh"] = [*map(seq_match_distance, col_values, match_col_values)]
        if f"{col}_leven" not in CFG.bad_features:
            df[f"{col}_leven"] = [*map(lev_distance, col_values, match_col_values)]
        if f"{col}_jaro" not in CFG.bad_features:
            df[f"{col}_jaro"] = [*map(jw_distance, col_values, match_col_values)]
        if f"{col}_lcs" not in CFG.bad_features:
            df[f"{col}_lcs"] = [*map(lcs_distance, col_values, match_col_values)]

        if col not in ['phone', 'country', 'zip']:
            col_len = pd.Series(list(map(len, col_values)), index = df.index)
            match_col_len = pd.Series(list(map(len, match_col_values)), index = df.index)

            if f'{col}_len_diff' not in CFG.bad_features:
                df[f'{col}_len_diff'] = np.abs(col_len - match_col_len)

            if f'{col}_nleven' not in CFG.bad_features and f'{col}_leven' not in CFG.bad_features:
                df[f'{col}_nleven'] = df[f'{col}_leven'] / np.maximum(col_len, match_col_len)

            if f'{col}_nlcsk' not in CFG.bad_features and f'{col}_lcs' not in CFG.bad_features:
                df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / match_col_len

            if f'{col}_nlcs' not in CFG.bad_features and f'{col}_lcs' not in CFG.bad_features:
                df[f'{col}_nlcs'] = df[f'{col}_lcs'] / col_len

    return df

## Check maximum IOU score

In [20]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every record id to its point_of_interest."""
    return {
        record_id: poi
        for record_id, poi in zip(input_df['id'], input_df['point_of_interest'])
    }

def get_poi2id(input_df: pd.DataFrame) -> dict:
    """Map every point_of_interest to the set of record ids that belong to it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(ids) for poi, ids in ids_by_poi}

def get_score(input_df, id2poi, poi2id):
    ''' Mean IOU between the predicted matches of every row and the true id set of its POI '''
    def _iou(record_id, matches):
        targets = poi2id[id2poi[record_id]]
        preds = set(matches.split())
        return len(targets & preds) / len(targets | preds)

    row_pairs = zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy())
    per_row = [_iou(record_id, matches) for record_id, matches in row_pairs]
    return np.array(per_row).mean()

def get_max_score(data, train_data):
    ''' Upper bound of the IOU score reachable with the candidate pairs.

        Every id of `data` is matched with itself and with all of its
        candidates in `train_data` that carry label 1. The matches are grouped
        per id and scored against the complete id set of the id's POI in `data`. '''
    data = data.reset_index()

    id2poi, poi2id = get_id2poi(data), get_poi2id(data)

    # every id matches itself
    unique_ids = data['id'].unique().tolist()
    eval_df = pd.DataFrame({'id': unique_ids, 'match_id': unique_ids})
    print('Unique id: %s' % len(eval_df))

    # plus all candidate pairs that are true matches
    positive_pairs = train_data.loc[train_data['label'] == 1, ['id', 'match_id']]
    eval_df = (
        pd.concat([eval_df, positive_pairs])
        .groupby('id')['match_id']
        .apply(list)
        .reset_index()
    )
    eval_df['matches'] = eval_df['match_id'].apply(lambda found: ' '.join(set(found)))
    eval_df = eval_df[['id', 'matches']]
    print('Unique id: %s' % len(eval_df))

    return get_score(eval_df, id2poi, poi2id)

# get_max_score(data, train_data)

- KNN 20 - max IOU 0.9231233344021228
- KNN 25 - max IOU 0.9290987088025121
- KNN 30 - max IOU 0.9336762986665911
- KNN 35 - max IOU 0.9372980642092062
- KNN 40 - max IOU 0.9402722991889505
- KNN 45 - max IOU 0.9427770977806317
- KNN 50 - max IOU 0.9449001281362694

## Create train and validation datasets

In [21]:
def create_dataset(df, id2index_d, tfidf_d, tv_fit, label='train'):
    """
    Compute the pair features chunk by chunk and save every chunk as parquet.

    The unique ids of `df` are cut into CFG.n_splits consecutive chunks (the
    last chunk takes the remainder). For each chunk the rows of those ids get
    their features from add_distance_features, are downcast with
    shrink_mem_new_cols and are written to
    ``<CFG.train_path>/<label>_data<k>.parquet`` (without the ``<k>`` suffix
    when there is a single chunk).

    Changes compared with the previous implementation:
    * the accidental chained assignment ``cur_data = cur_data, _ = ...`` is
      replaced by a plain tuple unpacking;
    * the output directory is created if it does not exist yet;
    * the duplicated slicing branches are merged.

    Parameters
    ----------
    df : pd.DataFrame
        Candidate pairs with at least 'id' and 'match_id' columns.
    id2index_d, tfidf_d, tv_fit
        Forwarded to add_distance_features.
    label : str
        File name prefix of the written parquet files.
    """
    if CFG.train_path:
        os.makedirs(CFG.train_path, exist_ok=True)

    count = 0
    unique_id = df['id'].unique().tolist()
    num_split_id = len(unique_id) // CFG.n_splits
    for k in range(1, CFG.n_splits + 1):
        print('Current split: %s' % k)
        start_row = (k - 1) * num_split_id
        # The last chunk runs to the end so the remainder is not lost.
        end_row = k * num_split_id if k < CFG.n_splits else None
        cur_id = unique_id[start_row:end_row]
        cur_data = df[df['id'].isin(cur_id)]

        cur_data = add_distance_features(cur_data, id2index_d, tfidf_d, tv_fit)
        cur_data, _ = shrink_mem_new_cols(cur_data, oldcols=None, allow_categorical=True, silent=True)

        if CFG.n_splits == 1:
            cur_data.to_parquet(f'{CFG.train_path}/{label}_data.parquet', index = False)
        else:
            cur_data.to_parquet(f'{CFG.train_path}/{label}_data{k}.parquet', index = False)

        count += len(cur_data)

        del cur_data
        gc.collect()

    print(f'Total len is {count}')
    
# Index `data` by id: add_distance_features looks up the column values with data.loc[<ids>].
data = data.set_index('id')
# Compute the pair features and write them to parquet, first for the train half, then for the valid half.
create_dataset(train_data, train_id2index_d, train_tfidf_d, train_tv_fit, label='train')
create_dataset(valid_data, valid_id2index_d, valid_tfidf_d, valid_tv_fit, label='valid')

Current split: 1


  0%|          | 0/13 [00:00<?, ?it/s]

Current split: 2


  0%|          | 0/13 [00:00<?, ?it/s]

Current split: 3


  0%|          | 0/13 [00:00<?, ?it/s]

Current split: 4


  0%|          | 0/13 [00:00<?, ?it/s]

Current split: 5


  0%|          | 0/13 [00:00<?, ?it/s]

Total len is 13928057
Current split: 1


  0%|          | 0/13 [00:00<?, ?it/s]

Current split: 2


  0%|          | 0/13 [00:00<?, ?it/s]

Current split: 3


  0%|          | 0/13 [00:00<?, ?it/s]

Current split: 4


  0%|          | 0/13 [00:00<?, ?it/s]

Current split: 5


  0%|          | 0/13 [00:00<?, ?it/s]

Total len is 13917577


# Further ideas

- train lgbm, validate, submit (add 'scale_pos_weight' argument, use auc metric)
- check all your previous improvements on validation and test

- add nltk.edit_distance to your features
- change KNN to the variant, that was proposed in this notebook: https://www.kaggle.com/code/ragnar123/flm-xlmroberta-inference-baseline
- add Manhattan distance and Euclidean distance
- increase number of nearest neighbours to a very high value (like 50-100-200), so you will be able to find more matches
- add kdist_diff and kdist mean


- how to handle missing data https://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python
- mean/median/std encode features
- use feature generation and selection from this notebook https://www.kaggle.com/code/aerdem4/foursquare-gpu-accelerated-lofo-feature-importance
- use Cat2Vec to calculate categories similarity https://www.kaggle.com/code/aerdem4/foursquare-cat2vec/notebook


- Optuna!



- try XLMRoberta


- you can use dict to store key-poi_id pairs and store only keys to save the memory