In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

from geopy.geocoders import Nominatim

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer

import Levenshtein as lev
import math
from collections import Counter

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier

In [2]:
train = pd.read_csv('../input/foursquare-location-matching/train.csv')
#train = train.head()
#train.head()

# Generate test pairs dataset

In [3]:
test = pd.read_csv('../input/foursquare-location-matching/test.csv')
#test = test.head()
#test.head()

In [4]:
def create_match_by_pos(df, train = False, neighbour = 10):
    # create data by nearest 10 neighbours, by train or test dataset
    
    if len(df) < neighbour:
        neighbour = len(df)-1
    neighbor = NearestNeighbors(n_neighbors=neighbour+1, metric="haversine", n_jobs=-1) # haversine: 2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))
    position = df.iloc[:,2:4]
    neighbor.fit(position)
    #distance = neighbor.kneighbors(position)[0]
    cloest_pos = neighbor.kneighbors(position)[1]
    
    df_combine = pd.DataFrame()
    for i in range(1,neighbour+1):
        df1 = df.loc[cloest_pos[:,0]]
        df2 = df.loc[cloest_pos[:,i]]
        for col in df1.columns:
            df1.rename(columns={col: f"{col}_1"}, inplace = True)
            df2.rename(columns={col: f"{col}_2"}, inplace = True)    
        df2.reset_index(drop=True,inplace = True)
        df_new = pd.concat([df1,df2], axis = 1)
        if train == True:
            df_new['match'] = df_new['point_of_interest_1'] == df_new['point_of_interest_2']
            df_new.drop(columns=['point_of_interest_1', 'point_of_interest_2'], inplace = True)
        df_combine = pd.concat([df_combine,df_new], axis = 0)
    df_combine.reset_index(drop=True, inplace = True)
    return df_combine

In [5]:
test_pairs = create_match_by_pos(test)
#test_pairs

In [6]:
pairs = pd.read_csv('../input/foursquare-location-matching/pairs.csv')
#pairs = pairs.head()
pairs = pairs.iloc[0:10000,:]
#pairs.head()

In [7]:
loc_name_1 = ['city_1','state_1','zip_1']
loc_name_2 = ['city_2','state_2','zip_2']
name = ['city','state','postcode']

def find_loc(lat,lon,name):
    geolocator = Nominatim(user_agent="geoapiExercises")
    lat = str(lat); lon = str(lon)
    location = geolocator.reverse(lat+","+lon)
    address = location.raw['address']
    return address.get(name)
    
def find_loc_all(df,lat,lon,loc_name,name):

    for i in range(len(loc_name)):
        new_col = df.apply(lambda x: find_loc(x[lat],x[lon],name[i]), axis = 1)
        df[loc_name[i]].fillna(new_col, inplace=True)
    return df

In [8]:
# impute data base on location
# pairs = find_loc_all(pairs,'latitude_1','longitude_1',loc_name_1,name)
# pairs = find_loc_all(pairs,'latitude_2','longitude_2',loc_name_2,name)
# pairs.head()

In [9]:
#pairs.iloc[:,0:12]

In [10]:
#pairs.iloc[:,12:]

In [11]:
#pairs.info()

In [12]:
#pairs.match.value_counts()

# Data preprocessing & Feature transformation:

1. location (latitude, longtitude): finding the distance between two variables
2. word preprocessing: remove url symbol, lowercasing, number removing, punctuation removal, white spaces removal, stop words removal, stemming and lemmatization


In [13]:
# 1. location
def distance(lat1, lon1, lat2, lon2):
    R = 6373.0
    d_lon = lon2 - lon1; d_lat = lat2 - lat1
    a = (np.sin(d_lat/2)) ** 2 + np.cos(lat1) * np.cos(lat2) * (np.sin(d_lon/2)) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distance = R * c
    return distance

In [14]:
pairs['distance'] = distance(pairs.latitude_1,pairs.longitude_1,pairs.latitude_2,pairs.longitude_2)
test_pairs['distance'] = distance(test_pairs.latitude_1,test_pairs.longitude_1,test_pairs.latitude_2,test_pairs.longitude_2)

In [15]:
# 2 word preprocessing

# lowercase
def lower(df, cols):
    for col in cols:
        df[col] = df[col].fillna('').apply(lambda x: x.lower())
    return df

# number removing
def num_remove(df, cols):
    for col in cols:
        df[col] = df[col].fillna('').apply(lambda x: re.sub(r'\d+', '', x))
    return df

# punctuation removal
def punc_remove(df,cols):
    for col in cols:
        df[col] = df[col].fillna('').apply(lambda x: x.translate(str.maketrans("","", string.punctuation)))
    return df

# white spaces removal
def space_remove(df,cols):
    for col in cols:
        df[col] = df[col].fillna('').apply(lambda x: x.strip())
    return df

# stop words removal

def list_to_string(lis):
    string = ''
    for i in lis:
        string += i
        string += ' '
    return string

def stop(string):
    stops = set(stopwords.words('english'))
    tokens = word_tokenize(string)
    result = [i for i in tokens if not i in stops]
    return result
    
def stop_remove(df,cols):
    for col in cols:
        df[col] = df[col].fillna('').apply(lambda x: stop(x))
    return df

# stemming and lemmatization
def lemma_stem(lis):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    new_lis = []
    for word in lis:
        word_new = lemmatizer.lemmatize(word)
        word_new = stemmer.stem(word_new)
        new_lis += [word_new]
    return new_lis

def lemma_stemming(df,cols):
    for col in cols:
        df[col] = df[col].fillna('').apply(lambda x: lemma_stem(x))
    return df

In [16]:
# remove url
def remove_URL(df,cols):
    for col in cols:
        df[col] = df[col].fillna('').apply(lambda x: re.sub(r"http\S+", "", x))
    return df

In [17]:
word_columns = ['name_1','address_1','city_1','state_1','url_1','categories_1','name_2','address_2','city_2','state_2','url_2','categories_2']
url_columns = ['url_1','url_2']
for data in [pairs, test_pairs]:
    data[url_columns] = remove_URL(data, url_columns)[url_columns]
    data[word_columns] = lower(data, word_columns)[word_columns]
    data[word_columns] = num_remove(data, word_columns)[word_columns]
    data[word_columns] = punc_remove(data, word_columns)[word_columns]
    data[word_columns] = space_remove(data, word_columns)[word_columns]
    data[word_columns] = stop_remove(data, word_columns)[word_columns]
    data[word_columns] = lemma_stemming(data, word_columns)[word_columns]

text similarity

In [18]:
def fuzzy_similarity(df, cols_1, cols_2):
    # length for cols_1 and cols_2 must be the same.
    for i in range(len(cols_1)):
        df[f"{cols_1[i]}_fuzzy"] = df.apply(lambda x: lev.ratio(x[cols_1[i]],x[cols_2[i]]), axis = 1)
    return df    

def counter_cosine_similarity(c1, c2):
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    return dotprod / (magA * magB)

In [19]:
col_1 = ['name_1','address_1','city_1','state_1','url_1','categories_1']
col_2 = ['name_2','address_2','city_2','state_2','url_2','categories_2']

def fuzzy(data):
    pairs_new = data.copy()
    for col in word_columns:
        pairs_new[col] = pairs_new[col].apply(lambda x: list_to_string(x))
    pairs_fuzzy = fuzzy_similarity(pairs_new, col_1,col_2)
    pairs_fuzzy = pairs_fuzzy.iloc[:,-len(col_1):]
    data = pd.concat([data, pairs_fuzzy], axis = 1)
    return data
pairs = fuzzy(pairs); test_pairs = fuzzy(test_pairs)


In [20]:
# wordnet similarity method
def similarity(word1, word2):
    max_sim = []
    syn1 = wn.synsets(word1)
    syn2 = wn.synsets(word2)
    for s1 in syn1:
        for s2 in syn2:
            max_sim.append(s1.path_similarity(s2))
    max_sim = [z for z in max_sim if z!=None]
    if max_sim == []:
        return 0
    return max(max_sim)
    
def similarity_lis(lis_1, lis_2):
    score_lis = []
    for word in lis_1:
        score_lis2 = []
        for word_2 in lis_2:
            sim = similarity(word, word_2)
            if sim != 0:
                score_lis2 += [sim]
        if score_lis2 == []:
            score_lis += [0]
        else:
            score_lis+=[max(score_lis2)]
    if score_lis == []:
        return 0
    return sum(score_lis)/len(score_lis)

def sim_lis(lis_1, lis_2):
    return (similarity_lis(lis_1, lis_2) + similarity_lis(lis_2, lis_1))/2

def nltk_similarity(df, cols_1, cols_2):
    # length for cols_1 and cols_2 must be the same.
    for i in range(len(cols_1)):
        df[f"{cols_1[i]}_nltk"] = df.apply(lambda x: sim_lis(x[cols_1[i]],x[cols_2[i]]), axis = 1)
    return df   

In [21]:
col_1 = ['name_1','address_1','city_1','state_1','url_1','categories_1']
col_2 = ['name_2','address_2','city_2','state_2','url_2','categories_2']
for data in [pairs, test_pairs]:
    data = nltk_similarity(data, col_1,col_2)

In [22]:
cols = ['latitude_1','longitude_1','country_1','latitude_2','longitude_2','country_2','name_1_fuzzy', 'address_1_fuzzy', 'city_1_fuzzy',
       'state_1_fuzzy', 'url_1_fuzzy', 'categories_1_fuzzy','name_1_nltk', 'address_1_nltk', 'city_1_nltk', 'state_1_nltk',
       'url_1_nltk', 'categories_1_nltk','match']
test_cols = ['latitude_1','longitude_1','country_1','latitude_2','longitude_2','country_2','name_1_fuzzy', 'address_1_fuzzy', 'city_1_fuzzy',
       'state_1_fuzzy', 'url_1_fuzzy', 'categories_1_fuzzy','name_1_nltk', 'address_1_nltk', 'city_1_nltk', 'state_1_nltk',
       'url_1_nltk', 'categories_1_nltk']
ids = ['id_1','id_2']
#pairs[cols].head()

In [23]:
train_id = pairs[ids]; test_id = pairs[ids]
train_X = pairs[cols]; test_X = test_pairs[test_cols]
for X in [train_X,test_X]:
    X['country_same'] = np.where(X['country_1'] == X['country_2'],1,0)
    del X['country_1'] 
    del X['country_2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [24]:
col_scaler = ['latitude_1','longitude_1','latitude_2','longitude_2']
X_transform = train_X[col_scaler]; X_transform_test = test_X[col_scaler]
transformer = RobustScaler().fit(X_transform)
train_X[col_scaler] = transformer.transform(X_transform)
test_X[col_scaler] = transformer.transform(X_transform_test)
train_y = pairs['match']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [25]:
train_X.to_csv("train_X.csv", index=False)
pd.DataFrame({'match':train_y}).to_csv("train_y.csv", index=False)

In [26]:
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=3,
              min_child_weight=2, monotone_constraints='()',
              n_estimators=1600, n_jobs=-1, nthread=-1, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.9, tree_method='exact', validate_parameters=1,
              verbosity=None)
xgb.fit(train_X, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=2,
              missing=nan, monotone_constraints='()', n_estimators=1600,
              n_jobs=-1, nthread=-1, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, ...)

In [27]:
test_y = xgb.predict(train_X)

In [28]:
test_pairs_final = test_id
test_pairs_final['match'] = test_y
test_id_unique = test['id']

def id_match(test_id_unique, test_pairs_final):
    match_list = []
    for id in test_id_unique:
        id_pairs = test_pairs_final[test_pairs_final.id_1 == id]
        success_pairs = id_pairs[id_pairs['match'] == 1]
        if len(success_pairs) == 0:
            matches = id
        else:
            matches = ' '.join([id] + success_pairs.id_2.tolist())
        match_list += [matches]
    return match_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [29]:
match = id_match(test_id_unique, test_pairs_final)
submission = pd.DataFrame({'id':test['id'],'matches':match})

In [30]:
submission.to_csv("submission.csv", index=False)

In [31]:
submission.head()

Unnamed: 0,id,matches
0,E_00001118ad0191,E_00001118ad0191
1,E_000020eb6fed40,E_000020eb6fed40
2,E_00002f98667edf,E_00002f98667edf
3,E_001b6bad66eb98,E_001b6bad66eb98
4,E_0283d9f61e569d,E_0283d9f61e569d E_5ec7420a228da8 E_9c05396793...
