In [1]:
import pandas as pd
import numpy as np

from Levenshtein import distance as lev
from sklearn.neighbors import BallTree
import pickle

import itertools
from tqdm.auto import tqdm
tqdm.pandas()
import time

In [2]:
# Wall-clock reference point; the final cell prints the seconds elapsed since here.
start_time = time.time()

In [3]:
# Load the competition tables from the Kaggle input mount.
# NOTE(review): test_data is not referenced again in the cells visible here.
train_data = pd.read_csv("/kaggle/input/foursquare-location-matching/train.csv")
test_data = pd.read_csv("/kaggle/input/foursquare-location-matching/test.csv")

# True labels

In [4]:
# Self-join on point_of_interest: one row per ordered pair of records sharing a POI
# (left-hand columns get the '_1' suffix, right-hand columns '_2').
train_merged = train_data.merge(
    train_data,
    how='inner',
    on='point_of_interest',
    suffixes=('_1', '_2'),
)
# A record joined with itself is not a pair, so keep only rows whose ids differ,
# then drop the join key and label every remaining pair as a match.
train_pairs_true = (
    train_merged
    .loc[train_merged['id_1'].ne(train_merged['id_2'])]
    .drop(columns=['point_of_interest'])
    .assign(match=True)
)
train_pairs_true.shape

(1901006, 25)

In [5]:
# Sanity check: every pair produced by the self-join was labelled match == True.
train_pairs_true['match'].value_counts()

True    1901006
Name: match, dtype: int64

# Closest location

In [6]:
def create_match_loc(test, neighbour = 11):
    """Pair every row of ``test`` with its geographically nearest other rows.

    A haversine ``BallTree`` is built over the ``latitude`` / ``longitude``
    columns (converted from degrees to radians) and queried with every row.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns.
    neighbour : int, default 11
        ``k`` for the tree query. The query row itself counts as one of the
        ``k`` hits, so each row yields ``neighbour - 1`` pairs. Clamped to
        ``len(test)``.

    Returns
    -------
    pandas.DataFrame
        ``len(test) * (k - 1)`` rows. The columns are every column of ``test``
        suffixed ``_1`` (the query row) followed by every column suffixed
        ``_2`` (the neighbour row). Values come from a single concatenated
        array, so original column dtypes are not preserved. An empty frame
        with those columns is returned when fewer than two rows (or
        ``neighbour < 2``) leave nothing to pair.
    """
    # The query row is one of the k hits, so k can never exceed the row count.
    k = min(neighbour, len(test))
    source_cols = test.columns.tolist()
    combine_col = [f'{col}_1' for col in source_cols] + [f'{col}_2' for col in source_cols]
    if k < 2:
        # Nothing to pair with (empty / single-row input, or neighbour < 2).
        return pd.DataFrame(columns=combine_col)

    coords = np.deg2rad(test[['latitude', 'longitude']].values)
    tree = BallTree(coords, metric='haversine')
    ind = tree.query(coords, k=k, return_distance=False)

    # Drop exactly one hit per row: the row itself when it was returned, else
    # the farthest hit. With duplicate coordinates the self hit is not
    # guaranteed to be in column 0, so blindly slicing off column 0 could emit
    # self-pairs and lose a real neighbour.
    keep = ind != np.arange(len(test))[:, None]
    keep[keep.all(axis=1), -1] = False
    # Boolean indexing flattens row by row, so the result lines up with
    # np.repeat below, and it stays well-formed when k - 1 == 1.
    neighbour_idx = ind[keep]

    df_combine = pd.DataFrame(np.concatenate([
        np.repeat(np.array(test), k - 1, axis=0),
        test.iloc[neighbour_idx, :],
    ], axis=1))
    df_combine.columns = combine_col
    return df_combine

In [7]:
# Candidate pairs by location: neighbour=15 is the k handed to the tree query;
# create_match_loc discards one hit per row, so each row gets up to 14 partners.
train_pairs_close = create_match_loc(train_data, neighbour = 15)

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

In [8]:
# Split the location-based candidate pairs by whether both sides share a POI.
poi_pair_cols = ['point_of_interest_1', 'point_of_interest_2']
shares_poi = train_pairs_close['point_of_interest_1'].eq(train_pairs_close['point_of_interest_2'])

# Same POI on both sides -> positive pair; the POI columns are dropped once the label is derived.
train_pairs_close_True = (
    train_pairs_close.loc[shares_poi]
    .drop(columns=poi_pair_cols)
    .assign(match=True)
)
# Different POI -> negative pair.
train_pairs_close_False = (
    train_pairs_close.loc[~shares_poi]
    .drop(columns=poi_pair_cols)
    .assign(match=False)
)

In [9]:
# (rows, columns) of the nearby pairs that share a point_of_interest.
train_pairs_close_True.shape

(736918, 25)

In [10]:
# (rows, columns) of the nearby pairs whose point_of_interest differs.
train_pairs_close_False.shape

(15206450, 25)

In [11]:
# Final training pairs: positives from the POI self-join stacked on top of the
# location-based negatives. train_pairs_close_True is not included here.
train_pairs = pd.concat(
    (train_pairs_true, train_pairs_close_False),
    axis='index',
)
train_pairs.shape

(17107456, 25)

In [12]:
# Renumber rows 0..n-1. Without drop=True the previous row labels are kept as a
# new 'index' column, so the frame gains one column.
# NOTE(review): that 'index' column is presumably unintended. Confirm no consumer
# of the saved files relies on it before switching to drop=True.
train_pairs = train_pairs.reset_index()

In [13]:
# Persist the pairs as CSV in the working directory (row index not written).
train_pairs.to_csv('train_pairs_raw.csv', index = False)

In [16]:
# Also persist as a pickle, alongside the CSV written by the previous cell.
train_pairs.to_pickle('./train_pairs_raw.pkl')

In [17]:
train_pairs.shape

(17107456, 26)

In [None]:
# Total wall-clock seconds since start_time was captured near the top of the notebook.
print("--- %s seconds ---" % (time.time() - start_time))

# download data
<a href="train_pairs_raw.csv"> train_pairs csv </a>

<a href="./train_pairs_raw.pkl"> train_pairs pickle </a>