In [1]:
# Load data and set pandas options to display the full dataset if needed
import re
import pandas as pd
from Scripts import utils
from Scripts import Properties


properties = Properties.Properties
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None)  
pd.set_option('display.max_colwidth', -1)  
restaurant_data = pd.read_csv(properties.ORIGINAL_DATA_DIR, sep='\t')
restaurant_data = restaurant_data.set_index(['id'], drop=False)
restaurant_data.head()
restaurant_data_original = restaurant_data.copy()
# Load the gold standard duplicates to calculate metrics
true_duplicates = pd.read_csv(properties.GOLD_STANDARD_DIR, sep='\t')

In [2]:
print('Duplicates without any preprocessing')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates without any preprocessing
Duplicates in address: 92
Duplicates in name: 88
Duplicates in phone: 7
Duplicates in city: 815
Duplicates in address name and phone: 0
Duplicates in address city and name: 24
Duplicates in name city and phone0
Duplicates in address, city, name and phone: 0
duplicates in 3 columns of address, city, name and phone24


In [3]:
print('Metrics without any preprocessing')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics without any preprocessing
All entries in original dataset: 864
Detected duplicates (all): 24
Real duplicates (from gold standard): 112
True positives: 24
True negatives: 752
False positives: 0
False negatives: 88
Accuracy 0.8981481481481481
Precision: 1.0
Recall: 0.21428571428571427


In [4]:
# Get info about the data and datatypes of the DataFrame
restaurant_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 864 entries, 1 to 864
Data columns (total 6 columns):
id         864 non-null int64
name       864 non-null object
address    864 non-null object
city       864 non-null object
phone      864 non-null object
type       863 non-null object
dtypes: int64(1), object(5)
memory usage: 47.2+ KB


In [5]:
# Remove special characters from phone, address, name and city to detect duplicates
restaurant_data = utils.remove_special_characters(restaurant_data)
restaurant_data = utils.trim_multiple_blanks(restaurant_data)

In [6]:
# Remove the direction in address and name because it's inconsistent over the dataset
direction_regex = re.compile('(( |^)((south)|(east)|(west)|(north)|(ne)|(se)|(nw)|(sw)|s|w|e|n)( |$))')
restaurant_data.address = restaurant_data.address.map(lambda x: direction_regex.sub(' ', x))
restaurant_data.name = restaurant_data.name.map(lambda x: direction_regex.sub(' ', x))
restaurant_data.city = restaurant_data.city.map(lambda x: direction_regex.sub(' ', x))
restaurant_data = utils.trim_multiple_blanks(restaurant_data)

In [7]:
print('Duplicates after removing generic clearing')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after removing generic clearing
Duplicates in address: 106
Duplicates in name: 91
Duplicates in phone: 116
Duplicates in city: 817
Duplicates in address name and phone: 51
Duplicates in address city and name: 26
Duplicates in name city and phone41
Duplicates in address, city, name and phone: 25
duplicates in 3 columns of address, city, name and phone68


In [8]:
print('Metrics after generic clearing')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after generic clearing
All entries in original dataset: 864
Detected duplicates (all): 68
Real duplicates (from gold standard): 112
True positives: 68
True negatives: 752
False positives: 0
False negatives: 44
Accuracy 0.9490740740740741
Precision: 1.0
Recall: 0.6071428571428571


In [9]:
# Map multiple occurrences of the same city in different writing
# It could also be a good idea to map all city parts to one city (i.e. hollywood -> los angeles)
city_map = {'la': 'los angeles', 'new york city': 'new york'}
restaurant_data.city = restaurant_data.city.replace(city_map)
restaurant_data = utils.trim_multiple_blanks(restaurant_data)

In [10]:
print('Duplicates after clearing city')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after clearing city
Duplicates in address: 106
Duplicates in name: 91
Duplicates in phone: 116
Duplicates in city: 819
Duplicates in address name and phone: 51
Duplicates in address city and name: 49
Duplicates in name city and phone78
Duplicates in address, city, name and phone: 48
duplicates in 3 columns of address, city, name and phone82


In [11]:
print('Metrics after clearing city')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after clearing city
All entries in original dataset: 864
Detected duplicates (all): 82
Real duplicates (from gold standard): 112
True positives: 82
True negatives: 752
False positives: 0
False negatives: 30
Accuracy 0.9652777777777778
Precision: 1.0
Recall: 0.7321428571428571


In [12]:
print('length of unique addresses before clearing ' + str(len(restaurant_data.address.unique())))

length of unique addresses before clearing 758


In [13]:
# Remove unnecessary explaination parts from the address string for a more accurate duplicate detection
restaurant_data['address'] = restaurant_data['address'].str.split(r' between| off| near| at| in').str[0]

In [14]:
# Remove appendixes of numbers because they are inconsistent over the dataset
restaurant_data.address = restaurant_data.address.map(lambda x: re.sub(r"(?<=\d)(st|nd|rd|th)\b", '', x))

In [15]:
# Standardize the address even more
address_num_map = {'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6',
                   'seventh': '7','eighth' : '8', 'ninth': '9', 'tenth': '10', 'eleventh': '11', 'twelfth': '12'}
address_name_map = {'la': 'los angeles', 'ave': 'avenue',  'rd': 'road', 'blv': 'boulevard', 
                    'blvd': 'boulevard', 'st': 'street'}
address_map = {**address_name_map, **address_num_map}
restaurant_data.address = restaurant_data.address.map(lambda x: ' '.join([address_map.get(i, i) for i in x.split()]))
restaurant_data = utils.trim_multiple_blanks(restaurant_data)

In [16]:
print('Unique addresses after clearing ' + str(len(restaurant_data.address.unique())))

Unique addresses after clearing 739


In [17]:
print('Duplicates after clearing the address')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after clearing the address
Duplicates in address: 125
Duplicates in name: 91
Duplicates in phone: 116
Duplicates in city: 819
Duplicates in address name and phone: 68
Duplicates in address city and name: 67
Duplicates in name city and phone78
Duplicates in address, city, name and phone: 65
duplicates in 3 columns of address, city, name and phone83


In [18]:
print('Metrics after clearing the address')
detected_duplicates = restaurant_data[duplicates_bool == True]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after clearing the address
All entries in original dataset: 864
Detected duplicates (all): 83
Real duplicates (from gold standard): 112
True positives: 83
True negatives: 752
False positives: 0
False negatives: 29
Accuracy 0.9664351851851852
Precision: 1.0
Recall: 0.7410714285714286


In [19]:
print('Unique names before clearing name: ' + str(len(restaurant_data.name.unique())))

Unique names before clearing name: 773


In [20]:
unique_city_list = restaurant_data.city.unique()
restaurant_data.name = restaurant_data.name.str.split(r' between| off| near| at| in| of').str[0]
city_regex = re.compile('the|restaurant|and|new york city|' +'|'.join(map(re.escape, unique_city_list)))
restaurant_data.name = [city_regex.sub('', name) for name in restaurant_data.name]

In [21]:
restaurant_data.name = restaurant_data.name.apply(lambda x: ' '.join(sorted(x.split(' '))))
restaurant_data = utils.trim_multiple_blanks(restaurant_data)

In [22]:
print('Unique names after clearing name: ' + str(len(restaurant_data.name.unique())))

Unique names after clearing name: 746


In [23]:
print('Duplicates after clearing name')
duplicates_bool = utils.get_duplicates_bool(restaurant_data)

Duplicates after clearing name
Duplicates in address: 125
Duplicates in name: 118
Duplicates in phone: 116
Duplicates in city: 819
Duplicates in address name and phone: 86
Duplicates in address city and name: 83
Duplicates in name city and phone94
Duplicates in address, city, name and phone: 81
duplicates in 3 columns of address, city, name and phone101


In [24]:
print('Metrics after clearing')
detected_duplicates = restaurant_data[duplicates_bool == True]
detected_non_duplicates = restaurant_data[duplicates_bool == False]
utils.print_metrics(restaurant_data, detected_duplicates, true_duplicates)

Metrics after clearing
All entries in original dataset: 864
Detected duplicates (all): 101
Real duplicates (from gold standard): 112
True positives: 101
True negatives: 752
False positives: 0
False negatives: 11
Accuracy 0.9872685185185185
Precision: 1.0
Recall: 0.9017857142857143


In [25]:
# Check the dataset without duplicates
restaurant_data[restaurant_data['id'].isin(detected_duplicates['id'])== False].head()

Unnamed: 0_level_0,id,name,address,city,phone,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,arnie morton,435 los angeles cienega boulevard,los angeles,3102461501,american
3,3,art delicatessen,12224 ventura boulevard,studio city,8187621221,american
4,4,art deli,12224 ventura boulevard,studio city,8187621221,delis
5,5,belair hotel,701 stone canyon road,bel air,3104721211,californian
7,7,bizou cafe,14016 ventura boulevard,sherman oaks,8187883536,french


In [26]:
final_data = utils.get_final_dataset(restaurant_data_original, restaurant_data)

212/315-1726, 
 212-245-7992
702/731-7547, 
 702-731-7110


In [27]:
final_data.head()

Unnamed: 0_level_0,id,name,address,city,phone,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,arnie morton's of chicago,435 s. la cienega blv.,los angeles,310/246-1501,american
3,3,art's delicatessen,12224 ventura blvd.,studio city,818/762-1221,american
4,4,art's deli,12224 ventura blvd.,studio city,818-762-1221,delis
5,5,hotel bel-air,701 stone canyon rd.,bel air,310/472-1211,californian
7,7,cafe bizou,14016 ventura blvd.,sherman oaks,818/788-3536,french


In [29]:
from pymongo import MongoClient
client = MongoClient(properties.MONGODB_CONNECTION_STRING, serverSelectionTimeoutMS=60)
db = client[properties.MONGODB_DB_NAME]
collection = db[properties.MONGODB_COLLECTION_NAME]

data = final_data.to_dict(orient='records') 
collection.insert_many(data)
client.close()