In [1]:
import pandas as pd
import numpy as np

from Levenshtein import distance as lev
from sklearn.neighbors import BallTree
import pickle

import itertools
from tqdm.auto import tqdm
tqdm.pandas()
import time
import gc

In [2]:
start_time = time.time()

In [3]:
train_data = pd.read_csv("/kaggle/input/foursquare-location-matching/train.csv")

# True labels

In [4]:
train_merged = pd.merge(train_data, train_data, on='point_of_interest', suffixes=('_1', '_2'), how='inner')
train_pairs_true = train_merged[train_merged['id_1'] != train_merged['id_2']]
train_pairs_true = train_pairs_true.drop(['point_of_interest'], axis=1)
train_pairs_true['match'] = True
train_pairs_true.shape

(1901006, 25)

In [5]:
train_pairs_true['match'].value_counts()

True    1901006
Name: match, dtype: int64

# cloest location

In [6]:
def create_match_loc(test, neighbour = 11):
    # minimum neighbour: 3 (include itself)
    if len(test) < neighbour:
        neighbour = len(test)
    tree = BallTree(np.deg2rad(test[['latitude', 'longitude']].values), metric='haversine')
    dist, ind = tree.query(np.deg2rad(test[['latitude', 'longitude']].values), k=neighbour)
    dist = dist[:,1:].squeeze()
    ind = ind[:,1:].squeeze()
    test_col = test.columns.tolist()
    combine_col = [str + '_1' for str in tqdm(test_col)] + [str + '_2' for str in tqdm(test_col)]
    df_combine = pd.DataFrame(np.concatenate([
                np.repeat(np.array(test), neighbour-1, axis = 0),
                test.iloc[list(itertools.chain.from_iterable(ind.tolist())),:]
               ], axis=1))    
    df_combine.columns = combine_col
    return df_combine                          

In [7]:
train_pairs_close = create_match_loc(train_data, neighbour = 15)

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

In [8]:
train_pairs_close_True = train_pairs_close[train_pairs_close['point_of_interest_1'] == train_pairs_close['point_of_interest_2']]
train_pairs_close_False = train_pairs_close[train_pairs_close['point_of_interest_1'] != train_pairs_close['point_of_interest_2']]

train_pairs_close_True = train_pairs_close_True.drop(['point_of_interest_1','point_of_interest_2'], axis=1)
train_pairs_close_False = train_pairs_close_False.drop(['point_of_interest_1','point_of_interest_2'], axis=1)

train_pairs_close_True['match'] = True
train_pairs_close_False['match'] = False

In [9]:
train_pairs_close_True.shape

(736918, 25)

In [10]:
train_pairs_close_False.shape

(15206450, 25)

In [11]:
train_pairs = pd.concat([train_pairs_true,train_pairs_close_False],axis = 0)
train_pairs.shape

(17107456, 25)

In [12]:
train_pairs.reset_index(inplace = True)

# change back to original data type

In [13]:
pairs_sample = pd.read_csv('../input/foursquare-location-matching/pairs.csv').iloc[0:2,:]
# change original data type
del train_pairs['index']
dtype_dict = pairs_sample.dtypes.apply(lambda x: x.name).to_dict()
del pairs_sample
gc.collect()
train_pairs = train_pairs.astype(dtype_dict)

In [14]:
#train_pairs.to_csv('train_pairs_raw.csv', index = False)

In [15]:
train_pairs.shape

(17107456, 25)

In [16]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in tqdm(df.columns):
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [17]:
train_pairs = reduce_mem_usage(train_pairs)

Memory usage of dataframe is 3148.78 MB


  0%|          | 0/25 [00:00<?, ?it/s]

Memory usage after optimization is: 1585.74 MB
Decreased by 49.6%


In [18]:
word_columns = ['name_1','address_1','city_1','state_1','url_1','categories_1','name_2','address_2','city_2','state_2','url_2','categories_2']
for col in tqdm(word_columns):
    train_pairs[col] = train_pairs[col].astype('object')

  0%|          | 0/12 [00:00<?, ?it/s]

In [19]:
train_pairs.memory_usage().sum() / 1024**2

2294.1519775390625

In [20]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 1471.40793466568 seconds ---


In [21]:
train_pairs.to_pickle('./train_pairs_raw.pkl')

# download data
<a href="train_pairs_raw.csv"> train_pairs csv </a>

<a href="./train_pairs_raw.pkl"> train_pairs pickle </a>