# Generate "NonMatch" pairs for train

In [1]:
import pandas as pd
from tqdm import tqdm
from additional_funcs import coordDistance,similar
import os
import random
import time
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [2]:
# Read the training entries and discard the address/contact columns in one step.
data = (
    pd.read_csv('datasets/train.csv')
    .drop(columns=['address','city','state','zip','country','url','phone'])
)

In [3]:
# Preview the trimmed frame (the notebook display is truncated to the first and last rows).
data

Unnamed: 0,id,name,latitude,longitude,categories,point_of_interest
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,Brazilian Restaurants,P_d82910d8382a83
2,E_000007f24ebc95,ร้านตัดผมการาเกด,13.780813,100.484900,Salons / Barbershops,P_b1066599e78477
3,E_000008a8ba4f48,Turkcell,37.844510,27.844202,Mobile Phone Shops,P_b2ed86905a4cd3
4,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,Spanish Restaurants,P_809a884d4407fb
...,...,...,...,...,...,...
1138807,E_ffffb80854f713,青ガエル,35.659020,139.700780,,P_7ccbeab96cd82e
1138808,E_ffffbf9a83e0ba,Deshon Place,40.872116,-79.945343,Housing Developments,P_db0abc418e7365
1138809,E_ffffc572b4d35b,İzmir Adnan Menderes Havaalanı,38.423733,27.142826,Airport Services,P_ae96252a6a9380
1138810,E_ffffca745329ed,焼肉 和家,35.710712,139.775000,BBQ Joints,P_146662f246d418


In [4]:
# Load the lookup of known matches: entry id -> set of ids listed in its
# whitespace-separated 'matches' string.
answers_df = pd.read_csv('train_df_for_scoring.csv')
train_scoring_dict = {
    entry_id: set(matches.split())
    for entry_id, matches in zip(answers_df['id'], answers_df['matches'])
}

In [5]:
def generatePairsIds(init_data, train_scoring_dict, n_neighbours = 7 ):
    """Build candidate "non-match" pairs from spatial nearest neighbours.

    Every row of ``init_data`` is paired with its nearest rows in standardised
    (latitude, longitude) space. A pair is dropped when the second entry's id
    is listed among the first entry's matches in ``train_scoring_dict``, and
    each remaining unordered pair is kept only once (if ``b`` is stored for
    ``a``, ``a`` is not stored for ``b``).

    Parameters
    ----------
    init_data : pandas.DataFrame
        Needs ``id``, ``latitude`` and ``longitude`` columns. Not modified.
    train_scoring_dict : dict
        Maps an entry id to the collection of ids it matches. Looking up an id
        that is absent raises ``KeyError``.
    n_neighbours : int, default 7
        Forwarded to ``NearestNeighbors(n_neighbors=...)``. The query points are
        the fitted points themselves, so a row is normally returned as one of
        its own neighbours and then discarded, leaving up to
        ``n_neighbours - 1`` candidates per row before filtering.

    Returns
    -------
    list of set
        Element ``i`` holds the positional row indices paired with row ``i``.
    """
    # Standardise only the coordinates; the caller's frame is left untouched
    # and no full copy of it is needed.
    coord_columns = ['latitude', 'longitude']
    X = StandardScaler().fit_transform(init_data[coord_columns])

    # apply KNN
    nbrs = NearestNeighbors(n_neighbors = n_neighbours, algorithm='kd_tree').fit(X)
    indices = nbrs.kneighbors(X, return_distance=False)
    indices_set = list(map(set, indices))

    # Pull the ids out once: indexing a plain array per pair is far cheaper
    # than building a row Series with init_data.iloc[...] inside the loop.
    ids = init_data['id'].to_numpy()

    # remove self-pairs, known matches and mirrored pairs (keep a-b, drop b-a)
    for primary_id in tqdm(range(len(indices_set))):
        ind_set = indices_set[primary_id]
        ind_set.discard(primary_id)
        if not ind_set:
            # Nothing to compare, so the match lookup is skipped entirely.
            continue
        primary_matches = train_scoring_dict[ids[primary_id]]
        ids_to_remove = []
        for secondary_id in ind_set:
            # entries that belong to one POI are true matches, not non-matches
            if ids[secondary_id] in primary_matches:
                ids_to_remove.append(secondary_id)
            # secondary_id != primary_id here, so this never touches ind_set
            indices_set[secondary_id].discard(primary_id)
        # Removal is deferred so ind_set is not resized while being iterated.
        ind_set.difference_update(ids_to_remove)

    count = sum(map(len, indices_set))
    print(f'{count} pairs ids generated')
    return indices_set

In [6]:
# Generate the candidate pairs from 3 nearest neighbours per entry. A point is normally
# returned as its own neighbour and removed, so this typically leaves at most two per row.
ind_set = generatePairsIds(data, train_scoring_dict, n_neighbours = 3)

100%|██████████████████████████████████████████████████████████████████████| 1138812/1138812 [05:47<00:00, 3273.09it/s]


1291452 pairs ids generated


In [7]:
def generatePairsDf(data, ind_set):
    """Expand per-row neighbour sets into a side-by-side frame of pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with exactly six columns, in the order id, name, latitude,
        longitude, categories, point of interest (the output column names
        below are assigned positionally).
    ind_set : sequence of iterables of int
        ``ind_set[i]`` holds the positional row indices paired with row ``i``.

    Returns
    -------
    pandas.DataFrame
        One row per pair: the six columns of the first entry (suffix ``_1`` /
        ``poi1``), the six columns of the second entry (suffix ``_2`` /
        ``poi2``) and a ``match`` column that is ``False`` everywhere. When
        ``ind_set`` contains no pairs the frame is empty but still has all
        thirteen columns.
    """
    pairs_locations = [
        (primary_id, neighbour_id)
        for primary_id, neighbours in enumerate(ind_set)
        for neighbour_id in neighbours
    ]

    # reshape(-1, 2) keeps the array two-dimensional even when there are no
    # pairs; a bare np.array([]) is 1-D and cannot be sliced with [:, 0].
    pairs_locations = np.array(pairs_locations, dtype=np.intp).reshape(-1, 2)

    first = data.iloc[pairs_locations[:, 0]].reset_index(drop=True)
    second = data.iloc[pairs_locations[:, 1]].reset_index(drop=True)
    result_df = pd.concat([first, second], axis = 1)
    result_df.columns = ['id_1','name_1','latitude_1','longitude_1','categories_1', 'poi1',
                         'id_2','name_2','latitude_2','longitude_2', 'categories_2','poi2']
    # Every generated pair is labelled as a non-match.
    result_df['match'] = False
    return result_df

In [8]:
# Turn the per-row neighbour index sets into a DataFrame with one row per
# candidate pair (both entries side by side, match = False).
pairs_df = generatePairsDf(data, ind_set)

In [9]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing; otherwise the write fails after the long pair generation.
os.makedirs('pairs', exist_ok=True)
pairs_df.to_csv('pairs/nonmatching_pairs.csv', index = False)