In [1]:
from domino.utils.jupyter import notebook_init
notebook_init()

In [105]:
import os.path
import re

from Levenshtein import distance as lev
import pandas as pd
np = pd.np

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from multiprocessing import Pool

from rdflib import Graph, URIRef, Literal
from rdflib.namespace import OWL, RDF, XSD

In [3]:
path = lambda filename, sep='/': os.path.join(*filename.split(sep))

In [4]:
restaurants = pd.read_csv(path('data/entity_resolution/restaurant.csv'))
transports = pd.read_csv(path('data/entity_resolution/transport.csv'))

In [5]:
restaurants['source'] = restaurants.restaurant.str.startswith('http://schema.org/resource/eatery_')

In [6]:
restaurants = restaurants.set_index('restaurant')

In [7]:
names = list(restaurants.name.str.lower())
coords = list(restaurants[['latitude', 'longitude']].values)
source = list(restaurants.source)

In [8]:
def compute(n1):
    name1, coords1, src1 = names[n1], coords[n1], source[n1]
    
    results = []
    
    for n2, (name2, coords2, src2) in enumerate(
        zip(names[n1 + 1:], coords[n1 + 1:], source[n1 + 1:])
    ):
        # don't compare two entities of the same source
        if src1 == src2: continue 
            
        n2 += n1 + 1
        
        d_lev = lev(name1, name2) / max(len(name1), len(name2))
        d_euc = ((coords1 - coords2)**2).sum()
        
        results.append((n1, n2, d_lev, d_euc))
    
    return results

In [9]:
pool = Pool(3)

In [10]:
n1s = list(
    np.random.choice(
        np.arange(len(names)),
        len(names),
        replace=False
    )
)

In [11]:
results = pool.map(compute, n1s)

In [12]:
sum(map(len, results))

5230827

In [13]:
dists = pd.concat(list(map(pd.DataFrame, results)))
dists.columns = ['n1', 'n2', 'lev', 'euc']

In [14]:
dists.shape

(5230827, 4)

In [15]:
dists.head()

Unnamed: 0,n1,n2,lev,euc
0,493,711,0.88,6.8e-05
1,493,712,0.92,0.000718
2,493,713,0.88,0.000854
3,493,714,0.82,0.001477
4,493,715,0.86,7.3e-05


In [16]:
dists.to_csv(os.path.join('tmp_data', 'dists.csv'), index=False)

In [17]:
dists['n1_name'] = np.array(names)[dists.n1]
dists['n2_name'] = np.array(names)[dists.n2]

In [18]:
dists = dists.sort_values(['lev', 'euc'])

If lev = 0, they are the same. The first pair (sorting by euc) is the one assigned.

In [19]:
dists[dists.lev > 0].head(50)

Unnamed: 0,n1,n2,lev,euc,n1_name,n2_name
6367,649,7078,0.037037,3.164942e-08,restaurant vegetalia *raval,restaurant vegetalia raval
7213,73,7924,0.041667,1.518312e-09,restaurant el paraguayo,restaurante el paraguayo
6774,288,7485,0.041667,1.860292e-09,restaurant tantarantana,restaurante tantarantana
2157,568,2868,0.043478,1.414656e-08,restaurant bistrot bcn,restaurante bistrot bcn
4165,613,4876,0.045455,2.005852e-09,restaurant margherita,restaurante margherita
7138,70,7849,0.045455,0.0001090557,restaurant pa i trago,restaurante pa i trago
4920,93,5631,0.047619,5.387999e-10,restaurant biocenter,restaurante biocenter
5681,629,6392,0.047619,1.060777e-09,restaurant mano rota,restaurante mano rota
6040,87,6751,0.047619,2.739807e-09,restaurant cal boter,restaurante cal boter
7114,213,7825,0.047619,3.038132e-09,restaurant silvestre,restaurante silvestre


lev = 0.1 is the first threshold in which they're not the same

In [20]:
same_as = {
    (row['n1'], row['n2'])
    for _, row in dists[dists.lev <= 0.1].iterrows()
}

In [21]:
paired = {x for x, _ in same_as}.union({y for _, y in same_as})

In [22]:
dists = dists[~(dists.n1.isin(paired) | dists.n2.isin(paired))]

In [23]:
# Now let's try sorting the other way around
dists.sort_values(['euc', 'lev']).head(20)

# except for some few pairs, there's no more coincidences
# let's leave it like that

Unnamed: 0,n1,n2,lev,euc,n1_name,n2_name
5909,197,6620,0.76,8.250224e-12,restaurant reina selecció,tap de suro
1477,635,2188,0.823529,1.068346e-11,restaurant zeruko,crudo bar
7265,277,7976,0.529412,1.537408e-11,restaurant el rebujito de moncho's,restaurante al austriaco
7265,246,7976,0.581395,1.537408e-11,restaurant barnabier-la cerveseria del port,restaurante al austriaco
181,277,892,0.764706,1.537408e-11,restaurant el rebujito de moncho's,mexican steak house
181,246,892,0.813953,1.537408e-11,restaurant barnabier-la cerveseria del port,mexican steak house
997,246,1708,0.860465,1.537408e-11,restaurant barnabier-la cerveseria del port,talaia
997,277,1708,0.882353,1.537408e-11,restaurant el rebujito de moncho's,talaia
2348,222,3059,0.625,1.830136e-11,bar restaurant horiginal,horiginal
2764,71,3475,0.891304,2.28179e-11,restaurant mesón morriña - marisqueria gallega,peix fresc


In [24]:
len(same_as)

55

In [25]:
str(OWL)

'http://www.w3.org/2002/07/owl#'

In [26]:
graph = Graph()

graph.bind('owl', str(OWL))
graph.bind('', 'http://schema.org/resource/')
graph.bind('ont', 'http://schema.org/ontology/')

for x, y in same_as:
    x = restaurants.index[x]
    y = restaurants.index[y]
    
    graph.add((URIRef(x), OWL.sameAs, URIRef(y)))

In [27]:
graph.serialize(os.path.join('mapped_sources', 'restaurant_er.ttl'), format='turtle')