# Generate 'Match' pairs from train

In [1]:
import pandas as pd
import itertools
import time
from tqdm import tqdm
import os

In [2]:
data = pd.read_csv('datasets/train.csv')
data = data.drop(columns=['address','city','state','zip','country','url','phone'])

Let's find points of interest what have at least 2 entries.

In [3]:
poi_count = data.groupby('point_of_interest')['point_of_interest'].count().reset_index(name='count')
pois_to_match = poi_count[poi_count['count']>1]
pois_to_match.head()

Unnamed: 0,point_of_interest,count
1,P_00001c309a5e0a,4
3,P_0000561fe92bed,2
4,P_0000bccb92573c,2
5,P_0000c58a53df6d,2
7,P_00010455487fcf,2


In [4]:
len(pois_to_match)

314948

Now we should leave rows of the dataset that located in pois_to_match.

In [5]:
data = data[data['point_of_interest'].isin(pois_to_match['point_of_interest'])]
data.reset_index(drop = True, inplace = True)
data.head()

Unnamed: 0,id,name,latitude,longitude,categories,point_of_interest
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,Brazilian Restaurants,P_d82910d8382a83
2,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,Spanish Restaurants,P_809a884d4407fb
3,E_000023d8f4be44,Island Spa,14.51897,121.018702,Spas,P_020de174484ec6
4,E_0000764d65557e,McDonald's,-7.265894,112.749382,Fast Food Restaurants,P_be89c778befb23


In [6]:
#data = data.iloc[:10000]

In [7]:
#grouping objects by pois to create pairs
grouped_data = data.groupby(['point_of_interest'])
pois_dict = grouped_data.groups

In [8]:
def generatePairs(data, pois_dict):
    custom_pairs = pd.DataFrame(columns = ['id_1','name_1','latitude_1','longitude_1','categories_1',
                                       'id_2','name_2','latitude_2','longitude_2', 'categories_2','poi','match'])
    output_path='custom_pairs.csv'
    counter = 0
    for poi,ids in tqdm(pois_dict.items()):
        counter += 1
        if len(ids)>1:
            temp_pairs = pd.DataFrame(columns = ['id_1','name_1','latitude_1','longitude_1','categories_1',
                                           'id_2','name_2','latitude_2','longitude_2', 'categories_2','poi','match'])
            for (id1,id2) in itertools.combinations(ids,2):
                match_to_add = pd.concat([data.iloc[id1].drop(labels = ['point_of_interest']), 
                                          data.iloc[id2].drop(labels = ['point_of_interest']),
                                          pd.Series([poi,True],index=['poi', 'match'])], 
                                         axis = 0).to_frame().transpose()
                match_to_add.columns = ['id_1','name_1','latitude_1','longitude_1','categories_1',
                                        'id_2','name_2','latitude_2','longitude_2', 'categories_2','poi','match']
                temp_pairs = pd.concat([temp_pairs,match_to_add], ignore_index = True)
            custom_pairs = pd.concat([custom_pairs,temp_pairs], ignore_index = True)
        
            if counter%100 == 0:
                custom_pairs.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index = False)
                custom_pairs = pd.DataFrame(columns = ['id_1','name_1','latitude_1','longitude_1','categories_1',
                                           'id_2','name_2','latitude_2','longitude_2', 'categories_2','poi','match'])
    
    if counter%100!=0:
        custom_pairs.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index = False)
    print("PAIRS GENERATED")
            

In [9]:
%%time
generatePairs(data,pois_dict)

100%|█████████████████████████████████████████████████████████████████████████| 314948/314948 [40:31<00:00, 129.54it/s]

PAIRS GENERATED
Wall time: 40min 31s





In [11]:
pd.read_csv('custom_pairs.csv')

Unnamed: 0,id_1,name_1,latitude_1,longitude_1,categories_1,id_2,name_2,latitude_2,longitude_2,categories_1.1,poi,match
0,E_3b8a876a004093,Taichung International Airport (RMQ) (臺中國際機場),24.254227,120.599732,"Airports, Military Bases",E_7b12ba037d7af8,Taichung Airport Int'l Terminal,24.254209,120.599844,Airport Terminals,P_00001c309a5e0a,True
1,E_3b8a876a004093,Taichung International Airport (RMQ) (臺中國際機場),24.254227,120.599732,"Airports, Military Bases",E_a66af820515384,Taichung International Airport,24.255480,120.600316,Airports,P_00001c309a5e0a,True
2,E_3b8a876a004093,Taichung International Airport (RMQ) (臺中國際機場),24.254227,120.599732,"Airports, Military Bases",E_c1f62f2aa41ff6,台中清泉機場,24.214837,120.622577,Airports,P_00001c309a5e0a,True
3,E_7b12ba037d7af8,Taichung Airport Int'l Terminal,24.254209,120.599844,Airport Terminals,E_a66af820515384,Taichung International Airport,24.255480,120.600316,Airports,P_00001c309a5e0a,True
4,E_7b12ba037d7af8,Taichung Airport Int'l Terminal,24.254209,120.599844,Airport Terminals,E_c1f62f2aa41ff6,台中清泉機場,24.214837,120.622577,Airports,P_00001c309a5e0a,True
...,...,...,...,...,...,...,...,...,...,...,...,...
950498,E_a7aa308dd4169b,Ppp-Aps Group,51.143466,4.558219,"Factories, Offices",E_ec26c606217b82,PPP,51.143397,4.558140,Offices,P_fffef398f6f50c,True
950499,E_581a04f7b94fda,Centrum Bar Zaandam,52.433176,4.829669,Bars,E_d2eb367d863c79,Centrum Bar Zaandam,52.438678,4.824190,Bars,P_ffff001f71e321,True
950500,E_47d063fddd9115,Darty,48.191717,6.474805,Electronics Stores,E_ca42d07c618f1c,DARTY Epinal,48.191275,6.474963,"Electronics Stores, Furniture / Home Stores",P_ffff287983c417,True
950501,E_12c3aee0fd46bf,Doggis,-23.580388,-46.594376,Hot Dog Joints,E_62e191be55a951,Doggis,-23.580677,-46.594296,"Hot Dog Joints, Fast Food Restaurants",P_ffffa5ef8f6b07,True
