In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

from geopy.geocoders import Nominatim

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer

import Levenshtein as lev
import math
from collections import Counter

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier
from pickle import dump, load
import time
from sklearn.neighbors import BallTree


import itertools
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
start_time = time.time()

# Generate test pairs dataset

In [3]:
test = pd.read_csv('../input/foursquare-location-matching/test.csv')
#test = pd.read_csv('../input/foursquare-location-matching/train.csv').iloc[:,:-1]
#test = pd.read_csv('../input/foursquare-location-matching/train.csv').iloc[:100000,:-1]

In [4]:
test.shape

(5, 12)

In [5]:
# %load_ext Cython

In [6]:
def create_match_loc(test, neighbour = 11):
    # minimum neighbour: 3 (include itself)
    if len(test) < neighbour:
        neighbour = len(test)
    tree = BallTree(np.deg2rad(test[['latitude', 'longitude']].values), metric='haversine')
    dist, ind = tree.query(np.deg2rad(test[['latitude', 'longitude']].values), k=neighbour)
    dist = dist[:,1:].squeeze()
    ind = ind[:,1:].squeeze()
    test_col = test.columns.tolist()
    combine_col = [str + '_1' for str in tqdm(test_col)] + [str + '_2' for str in tqdm(test_col)]
    df_combine = pd.DataFrame(np.concatenate([
                np.repeat(np.array(test), neighbour-1, axis = 0),
                test.iloc[list(itertools.chain.from_iterable(ind.tolist())),:]
               ], axis=1))    
    df_combine.columns = combine_col
    return df_combine                          

# old function

def create_match_by_pos(df, train = False, neighbour = 10):
    # create data by nearest 10 neighbours, by train or test dataset
    
    if len(df) < neighbour:
        neighbour = len(df)-1
    neighbor = NearestNeighbors(n_neighbors=neighbour+1, metric="haversine", n_jobs=-1) # haversine: 2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))
    position = df.iloc[:,2:4]
    neighbor.fit(position)
    #distance = neighbor.kneighbors(position)[0]
    cloest_pos = neighbor.kneighbors(position)[1]
    
    df_combine = pd.DataFrame()
    for i in range(1,neighbour+1):
        df1 = df.loc[cloest_pos[:,0]]
        df2 = df.loc[cloest_pos[:,i]]
        for col in df1.columns:
            df1.rename(columns={col: f"{col}_1"}, inplace = True)
            df2.rename(columns={col: f"{col}_2"}, inplace = True)    
        df2.reset_index(drop=True,inplace = True)
        df_new = pd.concat([df1,df2], axis = 1)
        if train == True:
            df_new['match'] = df_new['point_of_interest_1'] == df_new['point_of_interest_2']
            df_new.drop(columns=['point_of_interest_1', 'point_of_interest_2'], inplace = True)
        df_combine = pd.concat([df_combine,df_new], axis = 0)
    df_combine.reset_index(drop=True, inplace = True)
    return df_combine

In [7]:
test_pairs = create_match_loc(test)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [8]:
test_pairs.shape

(20, 24)

# Data Cleanning: categorical filling with '', numerical filling with 0

In [9]:
# because my computer's RAM is not enough, replace column with more than 70% missing values to 0 and 1
del_list = ['url_1','url_2','phone_1','phone_2']
for col in tqdm(del_list):
    test_pairs[col] = test_pairs[col].notnull().astype('int')

  0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
cat_col = test_pairs.select_dtypes(include = ['object']).columns
test_pairs[cat_col] = test_pairs[cat_col].fillna('')

Data preprocessing & Feature transformation:
1. location (latitude, longtitude): finding the distance between two variables
2. word preprocessing: remove url symbol, stop words removal

In [11]:
# 1. location
def distance(lat1, lon1, lat2, lon2):
    R = 6373.0
    d_lon = lon2 - lon1; d_lat = lat2 - lat1
    a = (np.sin(d_lat/2)) ** 2 + np.cos(lat1) * np.cos(lat2) * (np.sin(d_lon/2)) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distance = R * c
    return distance

# remove url
def remove_URL(df,cols):
    for col in tqdm(cols):
        df[col] = df[col].fillna('').progress_apply(lambda x: re.sub(r"http\S+", "", x))
    return df

# stop words removal

def list_to_string(lis):
    string = ''
    for i in tqdm(lis):
        string += i
        string += ' '
    return string[:-1]

def stop(string):
    stops = set(stopwords.words('english'))
    tokens = word_tokenize(string)
    result = [i for i in tqdm(tokens) if not i in stops]
    return result
    
def stop_remove(df,cols):
    stops = set(stopwords.words('english'))
    for col in tqdm(cols):
        df[col] = df[col].progress_apply(lambda x: ' '.join([word for word in x.split() if word not in stops]))
    return df

In [12]:
stop_columns = ['name_1','address_1','name_2','address_2']
url_columns = ['url_1','url_2']
test_pairs['distance'] = distance(test_pairs.latitude_1,test_pairs.longitude_1,test_pairs.latitude_2,test_pairs.longitude_2)

In [13]:
#test_pairs[url_columns] = remove_URL(test_pairs, url_columns)[url_columns]

In [14]:
test_pairs[stop_columns] = stop_remove(test_pairs, stop_columns)[stop_columns]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [15]:
def fuzzy_similarity(df, cols_1, cols_2):
    # length for cols_1 and cols_2 must be the same.
    for i in tqdm(range(len(cols_1))):
        df[f"{cols_1[i]}_fuzzy"] = df.progress_apply(lambda x: lev.ratio(x[cols_1[i]],x[cols_2[i]]), axis = 1)
    return df    

In [16]:
col_1 = ['name_1','address_1','categories_1']
col_2 = ['name_2','address_2','categories_2']
test_pairs = fuzzy_similarity(test_pairs, col_1, col_2)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Feature Selection

In [17]:
cols = ['latitude_1','longitude_1','country_1','latitude_2','longitude_2','country_2','name_1_fuzzy', 'address_1_fuzzy', 'categories_1_fuzzy','url_1','url_2','phone_1','phone_2']
ids = ['id_1','id_2']

In [18]:
test_id = test_pairs[ids]
test_X = test_pairs[cols]
test_X['country_same'] = np.where(test_X['country_1'] == test_X['country_2'],1,0)
del test_X['country_1'] 
del test_X['country_2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# model

In [19]:
# load robust scaler
col_scaler = ['latitude_1','longitude_1','latitude_2','longitude_2']
X_transform = test_X[col_scaler]
scaler = load(open('../input/fourpoints-location-matching/scaler.pkl', 'rb'))
test_X[col_scaler] = scaler.transform(X_transform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [20]:
# load xgboost
xgb = XGBClassifier()
xgb.load_model("../input/fourpoints-location-matching/model.json")

In [21]:
# predict
test_y = xgb.predict(test_X)

In [22]:
test_pairs_final = test_id
test_pairs_final['match'] = test_y
test_id_unique = test['id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
base = pd.concat([test_id_unique,test_id_unique,pd.Series(np.repeat(1,len(test_id_unique)))],axis = 1)
base.columns = test_pairs_final.columns
test_pairs_final = pd.concat([test_pairs_final,base],axis = 0)
test_pairs_final.reset_index(inplace = True)

In [24]:
match_list = []
match_id = test_pairs_final[test_pairs_final['match']==1]
grouped = match_id.groupby('id_1')
for name, group in tqdm(grouped):
    #if len(success_pairs) == 0:
    matches = ' '.join(group.id_2.tolist())
    match_list += [matches]

  0%|          | 0/5 [00:00<?, ?it/s]

In [25]:
submission = pd.DataFrame({'id':test['id'],'matches':match_list})
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,id,matches
0,E_00001118ad0191,E_00002f98667edf E_000020eb6fed40 E_00001118ad...
1,E_000020eb6fed40,E_00002f98667edf E_00001118ad0191 E_001b6bad66...
2,E_00002f98667edf,E_00001118ad0191 E_001b6bad66eb98 E_0283d9f61e...
3,E_001b6bad66eb98,E_0283d9f61e569d E_00001118ad0191 E_00002f9866...
4,E_0283d9f61e569d,E_001b6bad66eb98 E_00001118ad0191 E_00002f9866...


In [26]:
submission.matches[0]

'E_00002f98667edf E_000020eb6fed40 E_00001118ad0191'

In [27]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 1.5962259769439697 seconds ---
