In [46]:
# Load data and set pandas options to display the full dataset if needed
import re
import pandas as pd
from Notebooks import utils

pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None)  
pd.set_option('display.max_colwidth', -1)  
restaurant_data = pd.read_csv('../Data/restaurants.tsv', sep='\t')
restaurant_data = restaurant_data.set_index(['id'], drop=False)
restaurant_data.head()
# Load the gold standard duplicates to calculate metrics
true_duplicates = pd.read_csv('../Data/restaurants_DPL.tsv', sep='\t')

In [47]:
restaurant_data.type.unique()

array(['american', 'steakhouses', 'delis', 'californian', 'french',
       'french bistro', 'pacific new wave', 'french (new)',
       'american (traditional)', 'asian', 'japanese', 'french (classic)',
       'italian', 'seafood', 'cafeterias', 'nuova cucina italian',
       'chinese', 'american (new)', 'continental', 'scandinavian',
       'coffee bar', 'coffeehouses', 'delicatessen', 'indian',
       'tel caribbean', 'caribbean', 'southwestern', 'mexican', 'russian',
       'mediterranean', 'or 212/632-5100 american', 'greek', 'bbq',
       'pacific rim', 'steak houses', 'international', 'eclectic',
       'southern', 'southern/soul', 'ext 6108 international', 'thai',
       'health food', 'dive american', 'cajun', 'latin american',
       'middle eastern', 'or 212/941-0772 american', 'eastern european',
       'east european', 'and 212/614-9345 asian', 'barbecue', 'fusion',
       'buffets', 'coffee shops/diners', 'only in las vegas',
       'old san francisco', 'mexican/latin ameri

In [48]:
# Get info about the data and datatypes of the DataFrame
restaurant_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 864 entries, 1 to 864
Data columns (total 6 columns):
id         864 non-null int64
name       864 non-null object
address    864 non-null object
city       864 non-null object
phone      864 non-null object
type       863 non-null object
dtypes: int64(1), object(5)
memory usage: 47.2+ KB


In [49]:
# Remove special characters from phone, address, name and city to detect duplicates
restaurant_data = utils.remove_special_characters(restaurant_data)
restaurant_data = utils.trim_multiple_blanks(restaurant_data)

In [52]:
# Remove the direction in address and name because it's inconsistent over the dataset
direction_regex = re.compile('(( |^)((south)|(east)|(west)|(north)|(ne)|(se)|(nw)|(sw)|s|w|e|n)( |$))')
restaurant_data.address = restaurant_data.address.map(lambda x: direction_regex.sub(' ', x))
restaurant_data.name = restaurant_data.name.map(lambda x: direction_regex.sub(' ', x))
restaurant_data.city = restaurant_data.city.map(lambda x: direction_regex.sub(' ', x))
restaurant_data = utils.trim_multiple_blanks(restaurant_data)

In [54]:
print('Duplicates after removing generic clearing')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after removing direction from address and name
Duplicates in address, city and phone0
Duplicates in address name and phone: 51
Duplicates in address city and name: 26
Duplicates in name city and phone41
Duplicates in address, city, name and phone: 25
duplicates in 3 columns of address, city, name and phone68


In [55]:
print('Metrics after generic clearing')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after removing direction from address and name
All entries in original dataset: 864
Detected duplicates (all): 68
Real duplicates (from gold standard): 112
True positives: 68
True negatives: 752
False positives: 0
False negatives: 44
Accuracy 0.9490740740740741
Precision: 1.0
Recall: 0.6071428571428571


In [56]:
# Map multiple occurrences of the same city in different writing
# It could also be a good idea to map all city parts to one city (i.e. hollywood -> los angeles)
city_map = {'la': 'los angeles', 'new york city': 'new york'}
restaurant_data.city = restaurant_data.city.replace(city_map)

In [57]:
print('Duplicates after mapping multiple occurrences of the same city name')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after mapping multiple occurrences of the same city name
Duplicates in address, city and phone0
Duplicates in address name and phone: 51
Duplicates in address city and name: 49
Duplicates in name city and phone78
Duplicates in address, city, name and phone: 48
duplicates in 3 columns of address, city, name and phone82


In [58]:
print('Metrics after mapping multiple occurrences of the same city name')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after mapping multiple occurrences of the same city name
All entries in original dataset: 864
Detected duplicates (all): 82
Real duplicates (from gold standard): 112
True positives: 82
True negatives: 752
False positives: 0
False negatives: 30
Accuracy 0.9652777777777778
Precision: 1.0
Recall: 0.7321428571428571


In [59]:
print('length of unique addresses before clearing ' + str(len(restaurant_data.address.unique())))

length of unique addresses before clearing 761


In [60]:
# Remove unnecessary explaination parts from the address string for a more accurate duplicate detection
restaurant_data['address'] = restaurant_data['address'].str.split(r' between| off| near| at| in').str[0]

In [61]:
# Remove appendixes of numbers because they are inconsistent over the dataset
restaurant_data.address = restaurant_data.address.map(lambda x: re.sub(r"(?<=\d)(st|nd|rd|th)\b", '', x))

In [62]:
# Standardize the address even more
address_num_map = {'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6',
                   'seventh': '7','eighth' : '8', 'ninth': '9', 'tenth': '10', 'eleventh': '11', 'twelfth': '12'}
address_name_map = {'la': 'los angeles', 'ave': 'avenue',  'rd': 'road', 'blv': 'boulevard', 
                    'blvd': 'boulevard', 'st': 'street'}
address_map = {**address_name_map, **address_num_map}
restaurant_data.address = restaurant_data.address.map(lambda x: ' '.join([address_map.get(i, i) for i in x.split()]))

In [63]:
print('length of unique addresses after clearing ' + str(len(restaurant_data.address.unique())))

length of unique addresses after clearing 739


In [64]:
print('Duplicates after remapping the address')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after remapping the address
Duplicates in address, city and phone0
Duplicates in address name and phone: 68
Duplicates in address city and name: 67
Duplicates in name city and phone78
Duplicates in address, city, name and phone: 65
duplicates in 3 columns of address, city, name and phone83


In [65]:
print('Metrics after remapping the address')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after remapping the address
All entries in original dataset: 864
Detected duplicates (all): 83
Real duplicates (from gold standard): 112
True positives: 83
True negatives: 752
False positives: 0
False negatives: 29
Accuracy 0.9664351851851852
Precision: 1.0
Recall: 0.7410714285714286


In [66]:
restaurant_data.name = restaurant_data.name.str.split(r' between| off| near| at| in| of').str[0]
restaurant_data.name = restaurant_data.name.str.replace(re.compile(r'(los angeles)|(las vegas)|(new york city)|(new york)|(the)|(restaurant)|(and)'), '')

In [67]:
print('Duplicates after removing unnecessary words from name')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after removing unnecessary words from name
Duplicates in address, city and phone0
Duplicates in address name and phone: 70
Duplicates in address city and name: 68
Duplicates in name city and phone79
Duplicates in address, city, name and phone: 66
duplicates in 3 columns of address, city, name and phone85


In [68]:
print('Metrics after removing unnecessary words in name')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after removing unnecessary words in name
All entries in original dataset: 864
Detected duplicates (all): 85
Real duplicates (from gold standard): 112
True positives: 85
True negatives: 752
False positives: 0
False negatives: 27
Accuracy 0.96875
Precision: 1.0
Recall: 0.7589285714285714


In [69]:
restaurant_data.name = restaurant_data.name.apply(lambda x: ' '.join(sorted(x.split(' '))))
restaurant_data.name = restaurant_data.name.str.strip()

In [70]:
print('Duplicates after sorting the words in name')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after sorting the words in name
Duplicates in address, city and phone0
Duplicates in address name and phone: 85
Duplicates in address city and name: 81
Duplicates in name city and phone92
Duplicates in address, city, name and phone: 79
duplicates in 3 columns of address, city, name and phone100


In [71]:
print('Metrics after sorting the words in name')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after sorting the words in name
All entries in original dataset: 864
Detected duplicates (all): 100
Real duplicates (from gold standard): 112
True positives: 100
True negatives: 752
False positives: 0
False negatives: 12
Accuracy 0.9861111111111112
Precision: 1.0
Recall: 0.8928571428571429


In [72]:
# Check the dataset without duplicates
restaurant_data[restaurant_data['id'].isin(detected_duplicates['id'])== False]

Unnamed: 0_level_0,id,name,address,city,phone,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,arnie morton,435 los angeles cienega boulevard,los angeles,3102461501,american
3,3,art delicatessen,12224 ventura boulevard,studio city,8187621221,american
4,4,art deli,12224 ventura boulevard,studio city,8187621221,delis
5,5,belair hotel,701 stone canyon road,bel air,3104721211,californian
7,7,bizou cafe,14016 ventura boulevard,sherman oaks,8187883536,french
9,9,campanile,624 los angeles brea avenue,los angeles,2139381447,american
11,11,chinois main on,2709 main street,santa monica,3103929025,french
13,13,citrus,6703 melrose avenue,los angeles,2138570034,californian
15,15,fenix,8358 sunset boulevard,hollywood,2138486677,american
17,17,granita,23725 malibu road,malibu,3104560488,californian


In [73]:
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix

binary = np.array([[true_positives_count, false_negatives],
                   [false_positives_count, true_negatives]])

fig, ax = plot_confusion_matrix(conf_mat=binary,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
d'uplicate_indices = {}
duplicates_str_1 = ['address', 'city', 'phone']
duplicates_str_2 = ['address', 'name', 'phone']
duplicates_str_3 = ['address', 'city', 'name']
duplicates_str_4 = ['name', 'city', 'phone']

for row1_idx, row1 in restaurant_data.iterrows(): 
    cur_index = str(row1_idx)
    duplicate_indices[cur_index] = []
    row_1_imp_cols_1 = row1[duplicates_str_1]
    row_1_imp_cols_2 = row1[duplicates_str_2]
    row_1_imp_cols_3 = row1[duplicates_str_3]
    row_1_imp_cols_4 = row1[duplicates_str_4]
    for row2 in range(row1_idx + 1, len(restaurant_data)):
        row_2_imp_cols_1 = restaurant_data.loc[row2, duplicates_str_1]
        row_2_imp_cols_2 = restaurant_data.loc[row2, duplicates_str_2]
        row_2_imp_cols_3 = restaurant_data.loc[row2, duplicates_str_3]
        row_2_imp_cols_4 = restaurant_data.loc[row2, duplicates_str_4]
        if (row_1_imp_cols_1.equals(row_2_imp_cols_1) or 
                row_1_imp_cols_2.equals(row_2_imp_cols_2) or    
                row_1_imp_cols_3.equals(row_2_imp_cols_3) or 
                row_1_imp_cols_4.equals(row_2_imp_cols_4)):
                    duplicate_indices[cur_index].append(row2)
                    print('dup found')

In [None]:
duplicate_indices


In [None]:
restaurant_data.loc[[555, 553, 784]]
