In [68]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# show every row and every column whenever a frame is displayed
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [85]:
def find_nearest_neighbors(
    df,
    n_neighbors = 5,
    train=False
):
    '''
    Pair every row of ``df`` with the rows closest to it by location.

    Latitude and longitude are standardised, a k-nearest-neighbours index is
    fitted on them, and one output row is produced per (row, neighbour) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude``, ``longitude`` and ``id`` columns, plus
        ``point_of_interest`` when ``train`` is True. It is not modified.
        Its index may be in any order; rows are matched by position.
    n_neighbors : int, default 5
        Number of neighbours requested per row. One extra neighbour is
        queried so the row itself can be discarded; the query size is capped
        at ``len(df)``.
    train : bool, default False
        When True, add a boolean ``match`` column that is True where both
        sides share the same ``point_of_interest``, and drop the two
        ``point_of_interest_*`` columns.

    Returns
    -------
    pandas.DataFrame
        Every input column suffixed with ``_1`` (the row), then the
        neighbour's original index label (column ``index``, or the index's
        own name if it has one), then every input column suffixed with ``_2``
        (the neighbour), and ``match`` when ``train`` is True.

    Notes
    -----
    Self-pairs are removed by comparing ``id_1`` with ``id_2``. A row whose
    own entry is not among its returned neighbours (possible when several
    rows share the same coordinates) therefore keeps all ``n_neighbors + 1``
    partners.
    '''

    n_samples = len(df)
    # one extra neighbour, because a row is normally its own closest match
    n_query = min(n_neighbors + 1, n_samples)

    # define and scale the location data
    location_data = df[['latitude', 'longitude']].values
    location_data = StandardScaler().fit_transform(location_data)

    # define and fit the knn
    knn = NearestNeighbors(n_neighbors = n_query,
                           algorithm = 'kd_tree',
                           n_jobs=-1)
    knn.fit(location_data)

    # positions of the closest n_query samples for each sample in the
    # location data array, normally including that sample itself
    neighbors_array = knn.kneighbors(location_data,
                                     return_distance=False)

    # column renaming maps for the two sides of each pair
    cols_1 = {col: f'{col}_1' for col in df.columns}
    cols_2 = {col: f'{col}_2' for col in df.columns}

    # Both sides are selected by *position*: row i is repeated once per
    # neighbour so that it lines up with row i of neighbors_array. Ordering
    # the left side by index label instead would misalign the pairs whenever
    # the index is not unique and ascending.
    left_positions = np.repeat(np.arange(n_samples), neighbors_array.shape[1])
    right_positions = neighbors_array.ravel()

    df_1 = (df.iloc[left_positions]
              .reset_index(drop=True)
              .rename(columns=cols_1))
    # reset_index() without drop keeps the neighbour's original index label
    df_2 = (df.iloc[right_positions]
              .reset_index()
              .rename(columns=cols_2))

    # create final df and discard the pairs of a sample with itself
    pairs = pd.concat([df_1, df_2],
                      axis=1)
    pairs = pairs.loc[pairs['id_1'] != pairs['id_2']].reset_index(drop=True)

    # add final match column
    if train:
        pairs['match'] = pairs['point_of_interest_1'] == pairs['point_of_interest_2']
        pairs = pairs.drop(columns=['point_of_interest_1', 'point_of_interest_2'])

    return pairs

In [86]:
# NOTE(review): `train` is not defined in any cell visible here — presumably loaded in an earlier cell; confirm this survives Restart & Run All
train.shape

(1138812, 13)

In [90]:
# labelled candidate pairs for the training frame, 5 neighbours requested per row
test_pairs = find_nearest_neighbors(df=train, n_neighbors=5, train=True)

In [88]:
# size of the pair table, then a preview of its first 20 rows
# NOTE(review): this cell's execution count (88) is lower than that of the cell
# that builds `test_pairs` (90), so the output shown may come from an earlier
# run — re-run top to bottom to confirm
print(test_pairs.shape)
test_pairs.head(20)

(6834597, 26)


Unnamed: 0,id_1,name_1,latitude_1,longitude_1,address_1,city_1,state_1,zip_1,country_1,url_1,phone_1,categories_1,index,id_2,name_2,latitude_2,longitude_2,address_2,city_2,state_2,zip_2,country_2,url_2,phone_2,categories_2,match
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,704744,E_9e4a5adfe5b1c5,Frituur De Groten Honger,50.858896,3.633336,Abdijstraat 83,Nederename,Oost-Vlaanderen,,BE,,+3255309661,Friteries,False
1,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,280465,E_3ef360d8d73dcf,Huisarts De Lille Renaat,50.858821,3.633495,Abdijstraat 64,Oudenaarde,Oost-Vlaanderen,9700,BE,,+3255304444,Doctor's Offices,False
2,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,366990,E_52716b9f498e53,Dokter De Lille,50.858576,3.633424,,Ename,Oost-Vlaanderen,,BE,,,Doctor's Offices,False
3,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,1090830,E_f5238107545107,Feeste t' Ename,50.855537,3.631917,Lotharingenstraat,Ename,Oost-Vlaanderen,9700,BE,http://www.enameleeft.be,,Festivals,False
4,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,9905,E_0240ed6c6cfa9b,Estetika,50.855023,3.629794,,Oudenaarde,Flemish Region,,BE,,,Cosmetics Shops,False
5,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,509434,E_728785cf2fead4,Schoonheidsinstituut Estetika,50.8548,3.630315,,,,,BE,,,Cosmetics Shops,False
6,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,860109,E_c12fb799a8a0e4,Claro,-22.907135,-43.178162,Shopping Avenida Central,Rio De Janeiro,RJ,20040-901,BR,http://www.claro.com.br,,Mobile Phone Shops,False
7,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,1043520,E_ea777cbcbc9a19,Techno Game,-22.907257,-43.177703,Shopping Avenida Central,Rio de Janeiro,Rio de Janeiro,20040-901,BR,,+552125321799,Video Game Stores,False
8,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,1012744,E_e384ed31f2c434,Largo da Carioca,-22.907069,-43.177968,Lrg. da Carioca,Rio de Janeiro,RJ,20050-020,BR,,,Plazas,False
9,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,609064,E_88ead605bf6276,Sensorium,-22.907215,-43.177556,Ed. Av. Central,Rio de Janeiro,RJ,20040-901,BR,http://www.sensorium.com.br,552122403848,Doctor's Offices,False


In [89]:
# positive sample portion: share of candidate pairs flagged as a match.
# Reads `test_pairs` (the labelled pair table built above) rather than a
# global `df`, which no visible cell defines.
float(test_pairs['match'].mean())

0.09870847688125688