In [28]:
import json
import csv
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support as prf_score
from sklearn.metrics import accuracy_score as accuracy_score
import Levenshtein as lv
from difflib import SequenceMatcher
import affinegap
import scipy
import timeit
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from scipy.stats import randint as sp_randint
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)

In [34]:
def clean_foursquare(dataset):
        #OVERALL (NaN and None)
        dataset.replace('', np.nan, inplace=True)
        dataset.fillna(value=np.nan, inplace=True)
        # Dropping country and region (non-original)
        dataset.drop(['country','region', 'phone', 'website', 'street_address', 'locality', 'postal_code', 'latitude', 'longitude'],axis=1, inplace=True)

        #FEATURE: NAME
        dataset['name'] = dataset['name'].str.upper()
        dataset['name'] = dataset['name'].str.replace(' ', '').str.replace('É', 'E').str.replace('&', 'AND').str.replace('[^a-zA-Z0-9]', '')
        return dataset

In [35]:
def clean_locu(dataset):
        #OVERALL (NaN and None)
        dataset.replace('', np.nan, inplace=True)
        dataset.fillna(value=np.nan, inplace=True)
        # Dropping country and region (non-original)
        dataset.drop(['country','region', 'phone', 'website', 'street_address', 'locality', 'postal_code', 'latitude', 'longitude'],axis=1, inplace=True)

        #FEATURE: NAME
        # SHOOULD WE ALIGN THINGS LIKE PIZZERIA VS PIZZA AS PART OF DATA CLEANING?
        dataset['name'] = dataset['name'].str.upper()
        dataset['name'] = dataset['name'].str.replace(' ', '').str.replace('É', 'E').str.replace('&', 'AND').str.replace('[^a-zA-Z0-9]', '')

        return dataset

In [36]:
foursquare_train_path = pd.read_json('data/training/fs.json')
locu_train_path = pd.read_json('data/training/locu.json')
foursquare_test_path = pd.read_json('data/training/fs.json')
locu_test_path = pd.read_json('data/training/locu.json')

foursquare_train = clean_foursquare(foursquare_train_path)
locu_train = clean_locu(locu_train_path)
foursquare_test = clean_foursquare(foursquare_test_path)
locu_test = clean_locu(locu_test_path)

In [37]:
def create_dataframe(foursquare, locu):
        foursquare_ids_list = list(foursquare['id'])
        locu_ids_list = list(locu['id'])
        length_fids = len(foursquare_ids_list)
        length_lids = len(locu_ids_list)

        locu_ids_repeated = np.repeat(locu_ids_list,length_fids)
        #ex. [0,0,0,1,1,1,2,2,2]
        foursquare_ids_tiled = np.tile(foursquare_ids_list,length_lids)
        #ex. [0,1,2,0,1,2,0,1,2]
        df = pd.DataFrame({'locu_id': locu_ids_repeated,'foursquare_id':foursquare_ids_tiled})
        #ex. [0,0] [0,1] [0,2] [1,0] [1,1] [1,2] [2,0] [2,1] [2,2]
        foursquare = foursquare.add_suffix('_F')
        locu = locu.add_suffix('_L')
        df = df.merge(foursquare,left_on='foursquare_id',right_on=['id_F'],how='left').merge(locu,left_on='locu_id',right_on='id_L', how='left')
        df['unique_id'] = df['foursquare_id'] + df['locu_id']
        return df

In [38]:
df_train = create_dataframe(foursquare_train, locu_train)
df_test = create_dataframe(foursquare_test, locu_test)

In [39]:
df_train.head()

Unnamed: 0,locu_id,foursquare_id,id_F,name_F,id_L,name_L,unique_id
0,cc9e8f40230c6ead2873,4f328ea619836c91c7e3714a,4f328ea619836c91c7e3714a,CHENJINDIAORESTAURANT,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,4f328ea619836c91c7e3714acc9e8f40230c6ead2873
1,cc9e8f40230c6ead2873,4c37b5f6ae2da593a56affc5,4c37b5f6ae2da593a56affc5,WESTSIDESTEAKHOUSE,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,4c37b5f6ae2da593a56affc5cc9e8f40230c6ead2873
2,cc9e8f40230c6ead2873,4b41060df964a52098bf25e3,4b41060df964a52098bf25e3,PEARLSCHINESEANDSZECHUANCUISINE,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,4b41060df964a52098bf25e3cc9e8f40230c6ead2873
3,cc9e8f40230c6ead2873,3fd66200f964a520ece41ee3,3fd66200f964a520ece41ee3,SUSPENDERS,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,3fd66200f964a520ece41ee3cc9e8f40230c6ead2873
4,cc9e8f40230c6ead2873,52064aab11d284f64d088329,52064aab11d284f64d088329,GRANDCENTURYCAFE,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,52064aab11d284f64d088329cc9e8f40230c6ead2873


In [40]:
def either_string_is_null(str1,str2):
        if pd.isnull(str1) or pd.isnull(str2):
            return True
        else:
            return False

In [41]:
def aff(str1,str2):
        if either_string_is_null(str1,str2):
            return np.nan
        else:
            return affinegap.affineGapDistance(str1,str2)

In [42]:
def lev(str1,str2):
        if either_string_is_null(str1,str2):
            return np.nan
        else:
            return lv.distance(str1, str2)

In [43]:
def sim(str1, str2):
        if either_string_is_null(str1,str2):
            return np.nan
        else:
            return SequenceMatcher(None, str1, str2).ratio()

In [44]:
def lenlongcommon(str1,str2):
        if either_string_is_null(str1,str2):
            return np.nan
        else:
        # initialize SequenceMatcher object with
         # input string
            seqMatch = SequenceMatcher(None,str1,str2)

             # find match of longest sub-string
             # output will be like Match(a=0, b=0, size=5)
            match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
        return match.size

In [45]:
def add_features(df):
        #FEATURE: PHONE

        #perfect match
        df['name_perfect_match'] = df['name_F'] == df['name_L']

        a = timeit.default_timer()

        for col in ['name']:
            print('aff,lev,sim,llc')
            print(col)
            df[col+'_aff'] = df[[col+'_F',col+'_L']].apply(lambda x: aff(*x), axis=1)
            #Your statements here

            b = timeit.default_timer()
            print(b-a)

            df[col+'_lev'] = df[[col+'_F',col+'_L']].apply(lambda x: lev(*x), axis=1)
            c = timeit.default_timer()
            print(c-b)

            df[col+'_sim'] = df[[col+'_F',col+'_L']].apply(lambda x: sim(*x), axis=1)
            d = timeit.default_timer()
            print(d-c)

            df[col+'_llc'] = df[[col+'_F',col+'_L']].apply(lambda x: lenlongcommon(*x), axis=1)
            e = timeit.default_timer()
            print(e-d)

        return df

In [46]:
df_train_with_created_features = add_features(df_train)
df_test_with_created_features = add_features(df_test)

aff,lev,sim,llc
name
7.985593176999828
6.836593752988847
22.67723712400766
13.841632766998373
aff,lev,sim,llc
name
8.110456251000869
7.048826088997885
24.33852089800348
13.30525373898854


In [47]:
#Add matches column
matches_path = pd.read_csv('data/training/match.csv')
matches_train_path, matches_test_path = train_test_split(matches_path, test_size=0.2)
matches_train = matches_train_path.rename(index=str,columns={'foursquare_id':'true_foursquare_id'})
df_train_with_created_features_with_FID = df_train_with_created_features.merge(matches_train, on='locu_id', how='inner')


In [60]:
df_test_with_created_features = df_test_with_created_features.merge(matches_test_path[['locu_id']], on='locu_id', how='inner')

In [64]:
df_test_with_created_features.head()
df_test_with_created_features.count()

locu_id               43200
foursquare_id         43200
id_F                  43200
name_F                43200
id_L                  43200
name_L                43200
unique_id             43200
name_perfect_match    43200
name_aff              43200
name_lev              43200
name_sim              43200
name_llc              43200
dtype: int64

In [48]:
df_train_with_created_features_with_FID.head()

Unnamed: 0,locu_id,foursquare_id,id_F,name_F,id_L,name_L,unique_id,name_perfect_match,name_aff,name_lev,name_sim,name_llc,true_foursquare_id
0,cc9e8f40230c6ead2873,4f328ea619836c91c7e3714a,4f328ea619836c91c7e3714a,CHENJINDIAORESTAURANT,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,4f328ea619836c91c7e3714acc9e8f40230c6ead2873,False,200.125,17,0.292683,2,4a5382d2f964a5205cb21fe3
1,cc9e8f40230c6ead2873,4c37b5f6ae2da593a56affc5,4c37b5f6ae2da593a56affc5,WESTSIDESTEAKHOUSE,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,4c37b5f6ae2da593a56affc5cc9e8f40230c6ead2873,False,191.0,17,0.157895,1,4a5382d2f964a5205cb21fe3
2,cc9e8f40230c6ead2873,4b41060df964a52098bf25e3,4b41060df964a52098bf25e3,PEARLSCHINESEANDSZECHUANCUISINE,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,4b41060df964a52098bf25e3cc9e8f40230c6ead2873,False,208.875,24,0.313725,3,4a5382d2f964a5205cb21fe3
3,cc9e8f40230c6ead2873,3fd66200f964a520ece41ee3,3fd66200f964a520ece41ee3,SUSPENDERS,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,3fd66200f964a520ece41ee3cc9e8f40230c6ead2873,False,100.0,17,0.266667,1,4a5382d2f964a5205cb21fe3
4,cc9e8f40230c6ead2873,52064aab11d284f64d088329,52064aab11d284f64d088329,GRANDCENTURYCAFE,cc9e8f40230c6ead2873,CHIPOTLEMEXICANGRILL,52064aab11d284f64d088329cc9e8f40230c6ead2873,False,160.75,18,0.111111,2,4a5382d2f964a5205cb21fe3


In [65]:
#add a target
df_train_with_created_features_with_FID['target'] = df_train_with_created_features_with_FID['foursquare_id']==df_train_with_created_features_with_FID['true_foursquare_id']

features_to_keep = ['name_perfect_match']+[col for col in df_train_with_created_features_with_FID.columns if 'aff' in col or 'lev' in col or 'sim' in col or 'llc' in col]

#clean train data
X_train = df_train_with_created_features_with_FID[features_to_keep]
id_mapping_train = df_train_with_created_features_with_FID[['foursquare_id', 'locu_id']]
# clean test data
X_test = df_test_with_created_features[features_to_keep]
id_mapping_test = df_test_with_created_features[['foursquare_id', 'locu_id']]
#Create target
target = df_train_with_created_features_with_FID['target']

#deal with NaN to pass into the random forest
X_train = X_train.fillna(-10000, axis =1).astype(int)
X_test = X_test.fillna(-10000, axis =1).astype(int)

In [50]:
X_train.head()

Unnamed: 0,name_perfect_match,name_aff,name_lev,name_sim,name_llc
0,0,200,17,0,2
1,0,191,17,0,1
2,0,208,24,0,3
3,0,100,17,0,1
4,0,160,18,0,2


In [66]:
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

In [67]:
from sklearn.feature_selection import SelectFromModel
clf1 = RandomForestClassifier()

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf1, threshold='.15*mean')
sfm.fit(X_train, target)
X_train = sfm.transform(X_train)
X_test = sfm.transform(X_test)
param_grid = {"max_depth": [None],
              "max_features": ['log2','sqrt'],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
             "class_weight": [None]}


In [68]:
grid = GridSearchCV(RandomForestClassifier(random_state=0),param_grid=param_grid, cv=StratifiedKFold())
grid.fit(X_train, target)
print(grid.cv_results_)

predict_train = grid.predict_proba(X_train)
predict_test = grid.predict_proba(X_test)
predict1 = pd.DataFrame(predict_train)
predict1_test = pd.DataFrame(predict_test)


X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
X_train.head()



{'mean_fit_time': array([0.35785596, 0.33656406, 0.26431346, 0.22284524, 0.40148107,
       0.38055992, 0.24303873, 0.24189385]), 'std_fit_time': array([0.03198127, 0.02045886, 0.02812229, 0.01639256, 0.05012084,
       0.04947943, 0.01319134, 0.0135726 ]), 'mean_score_time': array([0.02717932, 0.02875106, 0.0251263 , 0.02176229, 0.02559288,
       0.02410603, 0.021034  , 0.02115552]), 'std_score_time': array([0.00181796, 0.00348801, 0.00293528, 0.00171011, 0.00197031,
       0.00253554, 0.00059573, 0.00030244]), 'param_bootstrap': masked_array(data=[True, True, True, True, False, False, False, False],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_class_weight': masked_array(data=[None, None, None, None, None, None, None, None],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_criterion': masked_array(data=['gini', 'g

Unnamed: 0,0,1,2,3,4
0,0,200,17,0,2
1,0,191,17,0,1
2,0,208,24,0,3
3,0,100,17,0,1
4,0,160,18,0,2


In [69]:

X_train['predict_proba'] = predict1[1]
X_test['predict_proba'] = predict1_test[1]

# add id mapping
X_train_with_id = pd.concat([X_train,id_mapping_train],axis=1)
X_test_with_id = pd.concat([X_test,id_mapping_test],axis=1)

#add predicted probability
X_train_with_id['y_pred'] = (X_train_with_id['predict_proba']>.3).astype(int)
X_test_with_id['y_pred'] = (X_test_with_id['predict_proba']>.3).astype(int)

#create the final test output
final_test_output = X_test_with_id[X_test_with_id['y_pred']==1][['locu_id', 'foursquare_id']]

In [70]:
X_test_with_id.head()

Unnamed: 0,0,1,2,3,4,predict_proba,foursquare_id,locu_id,y_pred
0,0,210,18,0,2,0.0,4f328ea619836c91c7e3714a,5be7ca603ffa653eea64,0
1,0,172,15,0,2,0.0,4c37b5f6ae2da593a56affc5,5be7ca603ffa653eea64,0
2,0,200,23,0,3,0.0,4b41060df964a52098bf25e3,5be7ca603ffa653eea64,0
3,0,102,15,0,2,0.0,3fd66200f964a520ece41ee3,5be7ca603ffa653eea64,0
4,0,160,15,0,4,0.0,52064aab11d284f64d088329,5be7ca603ffa653eea64,0


In [71]:
final_test_output.to_csv('matches_test.csv', index = False)

In [80]:
X_predicted = X_test_with_id[['locu_id', 'foursquare_id', 'predict_proba', 'y_pred']]
X_predicted.head()

Unnamed: 0,locu_id,foursquare_id,predict_proba,y_pred
0,5be7ca603ffa653eea64,4f328ea619836c91c7e3714a,0.0,0
1,5be7ca603ffa653eea64,4c37b5f6ae2da593a56affc5,0.0,0
2,5be7ca603ffa653eea64,4b41060df964a52098bf25e3,0.0,0
3,5be7ca603ffa653eea64,3fd66200f964a520ece41ee3,0.0,0
4,5be7ca603ffa653eea64,52064aab11d284f64d088329,0.0,0


In [81]:
matches_test = matches_test_path.rename(columns = {'foursquare_id':'actual_foursquare_id',}, inplace = True)
compare_output = X_predicted.merge(matches_test_path, on='locu_id',how='inner')
compare_output['y_actual'] = (compare_output['foursquare_id'] == compare_output['actual_foursquare_id']).astype(int)
compare_output.head()




Unnamed: 0,locu_id,foursquare_id,predict_proba,y_pred,actual_foursquare_id,y_actual
0,5be7ca603ffa653eea64,4f328ea619836c91c7e3714a,0.0,0,4d39d3a4039eb60c288fe59c,0
1,5be7ca603ffa653eea64,4c37b5f6ae2da593a56affc5,0.0,0,4d39d3a4039eb60c288fe59c,0
2,5be7ca603ffa653eea64,4b41060df964a52098bf25e3,0.0,0,4d39d3a4039eb60c288fe59c,0
3,5be7ca603ffa653eea64,3fd66200f964a520ece41ee3,0.0,0,4d39d3a4039eb60c288fe59c,0
4,5be7ca603ffa653eea64,52064aab11d284f64d088329,0.0,0,4d39d3a4039eb60c288fe59c,0


In [75]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [82]:
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(compare_output['y_pred'], compare_output['y_actual'])
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(compare_output['y_pred'], compare_output['y_actual'])
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(compare_output['y_pred'], compare_output['y_actual'])
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(compare_output['y_pred'], compare_output['y_actual'])
print('F1 score: %f' % f1)

Accuracy: 0.999375
Precision: 0.888889
Recall: 0.771084
F1 score: 0.825806
