# Entity Resolution- Preliminary Modeling with PySolr

Import Statements

In [1]:
# Make sure all the necessary packages have been installed, and that Solr is running on port 8984
import numpy as np
import pandas as pd
import pysolr
import csv
import operator

Import the data and configure a dictionary of address fields

In [2]:
dict_list = []
fields = ['id', 'name', 'addr', 'city', 'ctry', 'code']
with open('companies_final.csv', mode='r') as infile:
    reader = csv.reader(infile)
    next(reader)
    for rows in reader:
        dictionary = {}
        for i in range(6):
            dictionary[fields[i]] = rows[i]
        dict_list.append(dictionary)

Establishing Solr Connection and Indexing

In [3]:
solr = pysolr.Solr
conn = solr('http://localhost:8984/solr/new_core')
# Clear what is currently in the index and add the dictionary of addresses
conn.delete(q="*:*")
conn.add(dict_list)

'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n\n<lst name="responseHeader">\n  <int name="status">0</int>\n  <int name="QTime">29</int>\n</lst>\n</response>\n'

# User Queries

### Exact Search

In [7]:
# Results is a list of all the potential matches, the closest match being the first in the list
results = conn.search('name:"1 MOBILE LIMITED" addr:"30 CITY ROAD"')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: 1 MOBILE LIMITED 30 CITY ROAD")
print("Response:", suggestion)

User entry: 1 MOBILE LIMITED 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


### Insensitive to Capitalization

In [8]:
results = conn.search('name:"1 MobILE LimITED" addr:"30 CITY ROAD"')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: 1 MobILE LimITED 30 CITY ROAD")
print("Response:", suggestion)

User entry: 1 MobILE LimITED 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


### Insensitive to Excessive White Space

In [9]:
results = conn.search('name:"1 MOBILE            LIMITED" addr:"30 CITY ROAD"')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: 1 MOBILE            LIMITED 30 CITY ROAD")
print("Response:", suggestion)

User entry: 1 MOBILE            LIMITED 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


### Substrings

In [10]:
results = conn.search('name:"MOBILE" addr:"30 CITY ROAD"')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: MOBILE 30 CITY ROAD")
print("Response:", suggestion)

User entry: MOBILE 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


### Special Characters

In [11]:
results = conn.search('name:"#1 MOBILE LIMITED" addr:"30 CITY ROAD"')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: #1 MOBILE LIMITED 30 CITY ROAD")
print("Response:", suggestion)

User entry: #1 MOBILE LIMITED 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


### Extra wording, prefixes, suffixes

In [12]:
results = conn.search('name:"1 AMOBILE LIMITEDs Company" addr:"30 CITY RD"')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: 1 AMOBILE LIMITED Company 30 CITY ROADS")
print("Response:", suggestion)

User entry: 1 AMOBILE LIMITED Company 30 CITY ROADS
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


### Exact, Unique Addresses

In [13]:
results = conn.search('name:"" addr:"COURTYARD SUITE 100 HATTON GARDEN"')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: COURTYARD SUITE 100 HATTON GARDEN")
print("Response:", suggestion)

User entry: COURTYARD SUITE 100 HATTON GARDEN
Response: ACTURIS LIMITED COURTYARD SUITE 100 HATTON GARDEN LONDON UK EC1N 8NX


# Time Operations

In [5]:
import time

In [8]:
start = time.time()
results = conn.search('name:"1 MOBILE LIMITED" addr:"30 CITY ROAD" city:LONDON')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: 1 MOBILE LIMITED 30 CITY ROAD")
print("Response:", suggestion)
end = time.time()
print((end - start)*1000)

User entry: 1 MOBILE LIMITED 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB
3.7238597869873047


In [20]:
start = time.time()
results = conn.search('name:"1 MOBILE LIMTED" addr:"30 CITY STREET" city:"LONDON"')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: 1 MOBILE LIMITED 30 CITY ROAD")
print("Response:", suggestion)
end = time.time()
print(end - start)

User entry: 1 MOBILE LIMITED 30 CITY ROAD
Response: DIGITAL HUB MEDIA PRIVATE LIMITED KEMP HOUSE 160 CITY ROAD LONDON|LONDON UK EC1V 2NX
0.0035712718963623047


In [21]:
start = time.time()
results = conn.search('name:"1 mobil lim" addr:"300 city rd" city:london ctry:"uk"')

i = 1
for result in results:
    if i == 1:
        suggestion = (" ".join(result['name'] + result['addr'] + result['city'] + result['ctry'] + result['code']))
    i += 1
    
print("User entry: 1 MOBILE LIMITED 30 CITY ROAD")
print("Response:", suggestion)
end = time.time()
print(end - start)

User entry: 1 MOBILE LIMITED 30 CITY ROAD
Response: DIGITAL HUB MEDIA PRIVATE LIMITED KEMP HOUSE 160 CITY ROAD LONDON|LONDON UK EC1V 2NX
0.004781246185302734
