In [27]:
# Load data and set pandas options to display the full dataset if needed
import re
import pandas as pd
from Notebooks import utils

pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None)  
pd.set_option('display.max_colwidth', -1)  
restaurant_data = pd.read_csv('../Data/restaurants.tsv', sep='\t')
restaurant_data = restaurant_data.set_index(['id'], drop=False)
restaurant_data.head()
# Load the gold standard duplicates to calculate metrics
true_duplicates = pd.read_csv('../Data/restaurants_DPL.tsv', sep='\t')

In [28]:
# Get info about the data and datatypes of the DataFrame
restaurant_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 864 entries, 1 to 864
Data columns (total 6 columns):
id         864 non-null int64
name       864 non-null object
address    864 non-null object
city       864 non-null object
phone      864 non-null object
type       863 non-null object
dtypes: int64(1), object(5)
memory usage: 47.2+ KB


In [29]:
# Remove special characters from phone, address, name and city to detect duplicates
restaurant_data = utils.remove_special_characters(restaurant_data)

In [30]:
print('Duplicates after removing special characters')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after removing special characters
Duplicates in address, city and phone44
Duplicates in address name and phone: 50
Duplicates in address city and name: 26
Duplicates in name city and phone41
Duplicates in address, city, name and phone: 25
duplicates in 3 columns of address, city, name and phone86


In [31]:
print('Metrics after removing special characters')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after removing special characters
All entries in original dataset: 864
Detected duplicates (all): 86
Real duplicates (from gold standard): 112
True positives: 80
True negatives: 746
False positives: 6
False negatives: 32
Accuracy 0.9560185185185185
Precision: 0.9302325581395349
Recall: 0.7142857142857143


In [32]:
# Check the unique values of city
restaurant_data.city.unique()

array(['los angeles', 'studio city', 'bel air', 'sherman oaks',
       'santa monica', 'hollywood', 'w hollywood', 'malibu',
       'beverly hills', 'los feliz', 'chinatown', 'pasadena', 'new york',
       'new york city', 'brooklyn', 'las vegas', 'atlanta',
       'san francisco', 'pacific palisades', 'toluca lake', 'west la',
       'westlake village', 'northridge', 'mar vista', 'venice', 'la',
       'redondo beach', 'westwood', 'culver city', 'long beach',
       'century city', 'st boyle hts', 'rancho park', 'st hermosa beach',
       'marina del rey', 'encino', 'monterey park', 'burbank',
       'seal beach', 'brentwood', 'manhattan beach', 'glendale', 'queens',
       'marietta', 'roswell', 'smyrna', 'duluth', 'decatur',
       'college park'], dtype=object)

In [33]:
# Map multiple occurrences of the same city in different writing
# It could also be a good idea to map all city parts to one city (i.e. hollywood -> los angeles)
city_map = {'la': 'los angeles', 'new york city': 'new york', 'west la': 'los angeles', 'w hollywood': 'hollywood'}
restaurant_data.city = restaurant_data.city.replace(city_map)

In [34]:
print('Duplicates after mapping multiple occurrences of the same city name')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after mapping multiple occurrences of the same city name
Duplicates in address, city and phone71
Duplicates in address name and phone: 50
Duplicates in address city and name: 49
Duplicates in name city and phone78
Duplicates in address, city, name and phone: 48
duplicates in 3 columns of address, city, name and phone104


In [35]:
print('Metrics after mapping multiple occurrences of the same city name')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after mapping multiple occurrences of the same city name
All entries in original dataset: 864
Detected duplicates (all): 104
Real duplicates (from gold standard): 112
True positives: 98
True negatives: 746
False positives: 6
False negatives: 14
Accuracy 0.9768518518518519
Precision: 0.9423076923076923
Recall: 0.875


In [36]:
# Remove unnecessary explaination parts from the address string for a more accurate duplicate detection
restaurant_data['address_cleared'] = restaurant_data['address'].str.split(r' between| off| near| at| in').str[0]

In [37]:
# Print the new and old address to ensure that there was an improvement in the data quality
restaurant_data[['address_cleared', 'address', 'city']].head(20)

Unnamed: 0_level_0,address_cleared,address,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,435 s la cienega blv,435 s la cienega blv,los angeles
2,435 s la cienega blvd,435 s la cienega blvd,los angeles
3,12224 ventura blvd,12224 ventura blvd,studio city
4,12224 ventura blvd,12224 ventura blvd,studio city
5,701 stone canyon rd,701 stone canyon rd,bel air
6,701 stone canyon rd,701 stone canyon rd,bel air
7,14016 ventura blvd,14016 ventura blvd,sherman oaks
8,14016 ventura blvd,14016 ventura blvd,sherman oaks
9,624 s la brea ave,624 s la brea ave,los angeles
10,624 s la brea ave,624 s la brea ave,los angeles


In [38]:
# Overwrite address and remove address_cleared for a more consistent dataframe
restaurant_data.address = restaurant_data.address_cleared
restaurant_data = restaurant_data.drop(['address_cleared'], axis=1)

In [39]:
print('Duplicates after removing appendixes from the address column')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after removing appendixes from the address column
Duplicates in address, city and phone73
Duplicates in address name and phone: 52
Duplicates in address city and name: 51
Duplicates in name city and phone78
Duplicates in address, city, name and phone: 50
duplicates in 3 columns of address, city, name and phone104


In [40]:
print('Metrics after appendixes from the address column')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)


Metrics after appendixes from the address column
All entries in original dataset: 864
Detected duplicates (all): 104
Real duplicates (from gold standard): 112
True positives: 98
True negatives: 746
False positives: 6
False negatives: 14
Accuracy 0.9768518518518519
Precision: 0.9423076923076923
Recall: 0.875


In [41]:
# Remove appendixes of numbers because they are inconsistent over the dataset
restaurant_data.address = restaurant_data.address.map(lambda x: re.sub(r"(?<=\d)(st|nd|rd|th)\b", '', x))


In [42]:
print('Duplicates after removing appendixes from the street number')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after removing appendixes from the street number
Duplicates in address, city and phone73
Duplicates in address name and phone: 52
Duplicates in address city and name: 51
Duplicates in name city and phone78
Duplicates in address, city, name and phone: 50
duplicates in 3 columns of address, city, name and phone104


In [43]:
print('Metrics after appendixes from the street number')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after appendixes from the street number
All entries in original dataset: 864
Detected duplicates (all): 104
Real duplicates (from gold standard): 112
True positives: 98
True negatives: 746
False positives: 6
False negatives: 14
Accuracy 0.9768518518518519
Precision: 0.9423076923076923
Recall: 0.875


In [44]:
# Remove the direction in address and name because it's inconsistent over the dataset
restaurant_data.address = restaurant_data.address.map(lambda x: re.sub('(( |^)((south)|(east)|(west)|(north)|(ne)|(se)|(nw)|(sw)|s|w|e|n)( |$))', ' ', x))
restaurant_data.name = restaurant_data.name.map(lambda x: re.sub('(( |^)((south)|(east)|(west)|(north)|(ne)|(se)|(nw)|(sw)|s|w|e|n)( |$))', ' ', x))

In [45]:
print('Duplicates after removing direction from address and name')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after removing direction from address and name
Duplicates in address, city and phone75
Duplicates in address name and phone: 52
Duplicates in address city and name: 51
Duplicates in name city and phone78
Duplicates in address, city, name and phone: 50
duplicates in 3 columns of address, city, name and phone106


In [46]:
print('Metrics after removing direction from address and name')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after removing direction from address and name
All entries in original dataset: 864
Detected duplicates (all): 106
Real duplicates (from gold standard): 112
True positives: 98
True negatives: 744
False positives: 8
False negatives: 14
Accuracy 0.9745370370370371
Precision: 0.9245283018867925
Recall: 0.875


In [47]:
# Standardize the address even more
address_num_map = {'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6',
                   'seventh': '7','eighth' : '8', 'ninth': '9', 'tenth': '10', 'eleventh': '11', 'twelfth': '12'}
address_name_map = {'la': 'los angeles', 'ave': 'avenue',  'rd': 'road', 'blv': 'boulevard', 
                    'blvd': 'boulevard', 'st': 'street'}
address_map = {**address_name_map, **address_num_map}
restaurant_data.address = restaurant_data.address.map(lambda x: ' '.join([address_map.get(i, i) for i in x.split()]))

In [48]:
print('Duplicates after remapping the address')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after remapping the address
Duplicates in address, city and phone94
Duplicates in address name and phone: 67
Duplicates in address city and name: 67
Duplicates in name city and phone78
Duplicates in address, city, name and phone: 65
duplicates in 3 columns of address, city, name and phone111


In [49]:
print('Metrics after remapping the address')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after remapping the address
All entries in original dataset: 864
Detected duplicates (all): 111
Real duplicates (from gold standard): 112
True positives: 103
True negatives: 744
False positives: 8
False negatives: 9
Accuracy 0.9803240740740741
Precision: 0.9279279279279279
Recall: 0.9196428571428571


In [50]:
detected_duplicates.head(20)

Unnamed: 0_level_0,id,name,address,city,phone,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,2,arnie mortons of chicago,435 los angeles cienega boulevard,los angeles,3102461501,steakhouses
4,4,arts deli,12224 ventura boulevard,studio city,8187621221,delis
6,6,belair hotel,701 stone canyon road,bel air,3104721211,californian
8,8,cafe bizou,14016 ventura boulevard,sherman oaks,8187883536,french bistro
10,10,campanile,624 los angeles brea avenue,los angeles,2139381447,californian
12,12,chinois on main,2709 main street,santa monica,3103929025,pacific new wave
14,14,citrus,6703 melrose avenue,los angeles,2138570034,californian
16,16,fenix at the argyle,8358 sunset boulevard,hollywood,2138486677,french (new)
18,18,granita,23725 malibu road,malibu,3104560488,californian
24,24,lorangerie,903 los angeles cienega boulevard,hollywood,3106529770,french (classic)


In [51]:
# Check the dataset without duplicates
restaurant_data[restaurant_data['id'].isin(detected_duplicates['id'])== False].head(20)

Unnamed: 0_level_0,id,name,address,city,phone,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,arnie mortons of chicago,435 los angeles cienega boulevard,los angeles,3102461501,american
3,3,arts delicatessen,12224 ventura boulevard,studio city,8187621221,american
5,5,hotel belair,701 stone canyon road,bel air,3104721211,californian
7,7,cafe bizou,14016 ventura boulevard,sherman oaks,8187883536,french
9,9,campanile,624 los angeles brea avenue,los angeles,2139381447,american
11,11,chinois on main,2709 main street,santa monica,3103929025,french
13,13,citrus,6703 melrose avenue,los angeles,2138570034,californian
15,15,fenix,8358 sunset boulevard,hollywood,2138486677,american
17,17,granita,23725 malibu road,malibu,3104560488,californian
19,19,grill on the alley,9560 dayton way,los angeles,3102760615,american


In [52]:
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix

binary = np.array([[true_positives_count, false_negatives],
                   [false_positives_count, true_negatives]])

fig, ax = plot_confusion_matrix(conf_mat=binary,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.show()

NameError: name 'true_positives_count' is not defined

In [53]:
d'uplicate_indices = {}
duplicates_str_1 = ['address', 'city', 'phone']
duplicates_str_2 = ['address', 'name', 'phone']
duplicates_str_3 = ['address', 'city', 'name']
duplicates_str_4 = ['name', 'city', 'phone']

for row1_idx, row1 in restaurant_data.iterrows(): 
    cur_index = str(row1_idx)
    duplicate_indices[cur_index] = []
    row_1_imp_cols_1 = row1[duplicates_str_1]
    row_1_imp_cols_2 = row1[duplicates_str_2]
    row_1_imp_cols_3 = row1[duplicates_str_3]
    row_1_imp_cols_4 = row1[duplicates_str_4]
    for row2 in range(row1_idx + 1, len(restaurant_data)):
        row_2_imp_cols_1 = restaurant_data.loc[row2, duplicates_str_1]
        row_2_imp_cols_2 = restaurant_data.loc[row2, duplicates_str_2]
        row_2_imp_cols_3 = restaurant_data.loc[row2, duplicates_str_3]
        row_2_imp_cols_4 = restaurant_data.loc[row2, duplicates_str_4]
        if (row_1_imp_cols_1.equals(row_2_imp_cols_1) or 
                row_1_imp_cols_2.equals(row_2_imp_cols_2) or    
                row_1_imp_cols_3.equals(row_2_imp_cols_3) or 
                row_1_imp_cols_4.equals(row_2_imp_cols_4)):
                    duplicate_indices[cur_index].append(row2)
                    print('dup found')

SyntaxError: EOL while scanning string literal (<ipython-input-53-af769fc16c28>, line 1)

In [None]:
duplicate_indices


In [54]:
restaurant_data.loc[[555, 553, 784]]


Unnamed: 0_level_0,id,name,address,city,phone,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
555,555,lillie langtrys,129 fremont street,las vegas,7023857111,asian
553,553,golden nugget hotel,129 fremont street,las vegas,7023857111,buffets
784,784,stefanos,129 fremont street,las vegas,7023857111,italian
