# Entity Resolution - Preliminary Modeling with PySolr

Import Statements

In [2]:
import numpy as np
import pandas as pd
import pysolr
import csv
import operator

Import the data and build a list of dictionaries (one per company record) keyed by the address fields

In [3]:
# Column names, in the order the values appear in every CSV row.
fields = ['id', 'name', 'addr', 'city', 'ctry', 'code']

# newline='' hands line-ending handling to the csv module, which is required
# for quoted fields that contain embedded newlines to be parsed correctly.
with open('companies_final.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # skip the header row (tolerates a completely empty file)
    # One dict per record, mapping each field name to the value at the same
    # position. Blank lines (which csv.reader yields as []) are skipped; a
    # non-blank row with too few columns still raises IndexError.
    dict_list = [
        {field: row[pos] for pos, field in enumerate(fields)}
        for row in reader
        if row
    ]

Establishing Solr Connection and Indexing

In [4]:
# `solr` aliases the pysolr client class; `conn` is the client bound to the core
# that holds the company records.
solr = pysolr.Solr
conn = solr('http://localhost:8984/solr/new_core')
# Clear what is currently in the index and add the list of address records.
# commit=True is passed explicitly so the changes are searchable as soon as this
# cell finishes: newer pysolr releases do not commit by default, which would
# leave the queries below running against a stale (or empty) index.
conn.delete(q="*:*", commit=True)
conn.add(dict_list, commit=True)

'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n\n<lst name="responseHeader">\n  <int name="status">0</int>\n  <int name="QTime">96</int>\n</lst>\n</response>\n'

# User Queries

Exact Search

In [15]:
# Results is a list of all the potential matches, the closest match being the first in the list
results = conn.search('name:"1 MOBILE LIMITED" addr:"30 CITY ROAD"')

# Keep only the top-ranked hit. `suggestion` is assigned on every path, so an
# empty result set can never raise NameError or print a value left over from a
# previously executed cell.
top_hit = next(iter(results), None)
if top_hit is None:
    suggestion = "(no match found)"
else:
    # assumes each field comes back as a list of strings (a plain string is
    # also accepted) - TODO confirm against the core's schema. Fields missing
    # from the returned document are skipped.
    parts = []
    for field in ('name', 'addr', 'city', 'ctry', 'code'):
        value = top_hit.get(field, [])
        parts.extend([value] if isinstance(value, str) else value)
    suggestion = " ".join(parts)

print("User entry: 1 MOBILE LIMITED 30 CITY ROAD")
print("Response:", suggestion)

User entry: 1 MOBILE LIMITED 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


Insensitive to Capitalization

In [18]:
# Same record as the exact search, but with mixed-case letters in the name.
results = conn.search('name:"1 MobILE LimITED" addr:"30 CITY ROAD"')

# Keep only the top-ranked hit. `suggestion` is assigned on every path, so an
# empty result set can never raise NameError or print a value left over from a
# previously executed cell.
top_hit = next(iter(results), None)
if top_hit is None:
    suggestion = "(no match found)"
else:
    # assumes each field comes back as a list of strings (a plain string is
    # also accepted) - TODO confirm against the core's schema. Fields missing
    # from the returned document are skipped.
    parts = []
    for field in ('name', 'addr', 'city', 'ctry', 'code'):
        value = top_hit.get(field, [])
        parts.extend([value] if isinstance(value, str) else value)
    suggestion = " ".join(parts)

print("User entry: 1 MobILE LimITED 30 CITY ROAD")
print("Response:", suggestion)

User entry: 1 MobILE LimITED 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


Insensitive to Excessive White Space

In [20]:
# Same record as the exact search, but with a long run of spaces inside the name.
results = conn.search('name:"1 MOBILE            LIMITED" addr:"30 CITY ROAD"')

# Keep only the top-ranked hit. `suggestion` is assigned on every path, so an
# empty result set can never raise NameError or print a value left over from a
# previously executed cell.
top_hit = next(iter(results), None)
if top_hit is None:
    suggestion = "(no match found)"
else:
    # assumes each field comes back as a list of strings (a plain string is
    # also accepted) - TODO confirm against the core's schema. Fields missing
    # from the returned document are skipped.
    parts = []
    for field in ('name', 'addr', 'city', 'ctry', 'code'):
        value = top_hit.get(field, [])
        parts.extend([value] if isinstance(value, str) else value)
    suggestion = " ".join(parts)

print("User entry: 1 MOBILE            LIMITED 30 CITY ROAD")
print("Response:", suggestion)

User entry: 1 MOBILE            LIMITED 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


Substrings

In [21]:
# Only part of the company name is supplied. The name term is kept identical to
# the "User entry" echoed below so the cell reports exactly what was searched.
results = conn.search('name:"1 MOBILE" addr:"30 CITY ROAD"')

# Keep only the top-ranked hit. `suggestion` is assigned on every path, so an
# empty result set can never raise NameError or print a value left over from a
# previously executed cell.
top_hit = next(iter(results), None)
if top_hit is None:
    suggestion = "(no match found)"
else:
    # assumes each field comes back as a list of strings (a plain string is
    # also accepted) - TODO confirm against the core's schema. Fields missing
    # from the returned document are skipped.
    parts = []
    for field in ('name', 'addr', 'city', 'ctry', 'code'):
        value = top_hit.get(field, [])
        parts.extend([value] if isinstance(value, str) else value)
    suggestion = " ".join(parts)

print("User entry: 1 MOBILE 30 CITY ROAD")
print("Response:", suggestion)

User entry: 1 MOBILE 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB


Special Characters

In [24]:
# Same record as the exact search, but with a leading '#' in the name.
results = conn.search('name:"#1 MOBILE LIMITED" addr:"30 CITY ROAD"')

# Keep only the top-ranked hit. `suggestion` is assigned on every path, so an
# empty result set can never raise NameError or print a value left over from a
# previously executed cell.
top_hit = next(iter(results), None)
if top_hit is None:
    suggestion = "(no match found)"
else:
    # assumes each field comes back as a list of strings (a plain string is
    # also accepted) - TODO confirm against the core's schema. Fields missing
    # from the returned document are skipped.
    parts = []
    for field in ('name', 'addr', 'city', 'ctry', 'code'):
        value = top_hit.get(field, [])
        parts.extend([value] if isinstance(value, str) else value)
    suggestion = " ".join(parts)

print("User entry: #1 MOBILE LIMITED 30 CITY ROAD")
print("Response:", suggestion)

User entry: #1 MOBILE LIMITED 30 CITY ROAD
Response: 1 MOBILE LIMITED 30 CITY ROAD LONDON UK EC1Y 2AB
