In [29]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

from geopy.geocoders import Nominatim

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer

import Levenshtein as lev
import math
from collections import Counter

In [30]:
# Load the competition's place records (train) and the candidate place pairs (pairs).
# The files are read in the order listed: train.csv first, then pairs.csv.
train, pairs = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/foursquare-location-matching/train.csv',
        '../input/foursquare-location-matching/pairs.csv',
    )
)
# Uncomment to iterate on a small subset of the pairs:
#pairs = pairs.iloc[0:10000,:]

In [31]:
#pairs.info()

# Data Cleaning: fill categorical columns with '', numerical columns with 0

In [32]:
# Replace missing values in every object-dtype (text) column with the empty string.
# Only the columns listed in cat_col are filled; numeric columns are not modified here.
cat_col = pairs.select_dtypes(include='object').columns
pairs = pairs.fillna(dict.fromkeys(cat_col, ''))

Data preprocessing & feature transformation:
1. Location (latitude, longitude): compute the distance between the two points of each pair
2. Word preprocessing: remove URLs, remove stop words

In [33]:
# 1. location
def distance(lat1, lon1, lat2, lon2, radius_km=6373.0):
    """Great-circle (haversine) distance between two points on a sphere.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere in kilometres. Defaults to 6373.0, the value
        this notebook has always used for the Earth.

    Returns
    -------
    float or array-like
        Distance in kilometres, with the same shape as the broadcast inputs
        (a pandas Series in, a pandas Series out). NaN inputs yield NaN.
    """
    # The trigonometric functions expect radians; coordinates arrive in degrees.
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    d_lat = lat2 - lat1
    d_lon = lon2 - lon1

    a = np.sin(d_lat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(d_lon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1], which would make
    # sqrt(1 - a) NaN for (near-)antipodal points.
    a = np.clip(a, 0.0, 1.0)

    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius_km * central_angle

# remove url
def remove_URL(df,cols):
    """Delete every ``http...`` token from the given text columns of ``df``.

    Missing values in each listed column are first replaced by ``''``; then
    every substring that starts with ``http`` and runs up to the next
    whitespace is removed. The columns are overwritten on ``df`` itself,
    and the same ``df`` object is returned.
    """
    url_pattern = re.compile(r"http\S+")
    for column in cols:
        filled = df[column].fillna('')
        df[column] = filled.apply(lambda text: url_pattern.sub("", text))
    return df

# stop words removal

def list_to_string(lis):
    """Join the strings in ``lis`` into one string separated by single spaces.

    An empty input gives ``''``.
    """
    return ' '.join(lis)

# Lazily-built cache of the English stopword set; filled on the first call to stop().
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopword set, loading it from the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stop(string):
    """Tokenize ``string`` and drop English stopwords.

    Parameters
    ----------
    string : str
        Text to tokenize with ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        The tokens, in their original order, that are not in NLTK's English
        stopword list.

    Notes
    -----
    The stopword set is cached after the first call, because this function
    is applied once per row and re-reading the corpus each time is
    loop-invariant work.
    NOTE(review): tokens are compared as-is (case-sensitive), so a
    capitalised token such as 'The' is kept if the stopword list is
    lowercase -- confirm this is intended.
    """
    stops = _english_stopwords()
    return [token for token in word_tokenize(string) if token not in stops]
    
def stop_remove(df,cols):
    """Strip stopwords from the given text columns of ``df``.

    For each listed column, missing values become ``''``, the text is passed
    through ``stop`` to get the remaining tokens, and the tokens are glued
    back into one string with ``list_to_string``. The columns are overwritten
    on ``df`` itself, and the same ``df`` object is returned.
    """
    for column in cols:
        filled = df[column].fillna('')
        df[column] = filled.apply(lambda text: list_to_string(stop(text)))
    return df

In [34]:
# Columns that get stopword removal, and the subset that first has URLs stripped.
stop_columns = [
    'name_1', 'address_1', 'url_1', 'categories_1',
    'name_2', 'address_2', 'url_2', 'categories_2',
]
url_columns = ['url_1', 'url_2']

# Distance between the two coordinates of each pair.
pairs['distance'] = distance(
    pairs['latitude_1'], pairs['longitude_1'],
    pairs['latitude_2'], pairs['longitude_2'],
)

# Both helpers overwrite the listed columns on the frame they receive and
# return that same frame, so rebinding `pairs` is enough.
pairs = remove_URL(pairs, url_columns)
pairs = stop_remove(pairs, stop_columns)

Text similarity: fuzzy (Levenshtein ratio), cosine similarity

In [35]:
def fuzzy_similarity(df, cols_1, cols_2):
    """Add a Levenshtein-ratio similarity column for each pair of text columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``cols_1`` and ``cols_2``.
        It is modified in place.
    cols_1, cols_2 : sequence of str
        Column names compared position by position (``cols_1[i]`` against
        ``cols_2[i]``). Both sequences must have the same length.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one new column ``"<cols_1[i]>_fuzzy"``
        per pair holding ``lev.ratio`` of the two values in each row.

    Raises
    ------
    ValueError
        If ``cols_1`` and ``cols_2`` differ in length.
    """
    if len(cols_1) != len(cols_2):
        raise ValueError(
            f"cols_1 and cols_2 must have the same length, "
            f"got {len(cols_1)} and {len(cols_2)}"
        )

    for left_col, right_col in zip(cols_1, cols_2):
        # Iterating the two columns directly avoids building a Series per row
        # (as df.apply(axis=1) does) and also works on an empty frame.
        df[f"{left_col}_fuzzy"] = [
            lev.ratio(left, right) for left, right in zip(df[left_col], df[right_col])
        ]
    return df

In [36]:
# Each tuple is one (first place, second place) column pair to compare;
# unzipping yields the two parallel lists fuzzy_similarity expects.
col_1, col_2 = map(list, zip(
    ('name_1', 'name_2'),
    ('address_1', 'address_2'),
    ('city_1', 'city_2'),
    ('state_1', 'state_2'),
    ('url_1', 'url_2'),
    ('categories_1', 'categories_2'),
))
pairs = fuzzy_similarity(pairs, col_1, col_2)

In [37]:
# Persist the engineered pair features to the working directory, without the index column.
pairs.to_csv(path_or_buf="train_pairs.csv", index=False)