## 1. Import the Libraries

In [1]:
import pandas as pd
import numpy as np

import recordlinkage

## 2. Load the dictionary and user-search datasets

In [2]:
# Reference dictionary of companies and the single user search entry.
df_dictionary = pd.read_csv("dictionary_companies.csv")
df_search_entries = pd.read_csv("search_entry.csv")

# Table the result helpers look rows up in (via dfD.loc[...]) to build
# the strings shown to the user.
dfD = pd.read_csv("companies_dict.csv")

# Short aliases used by every record-linkage step below.
dfA, dfB = df_dictionary, df_search_entries

In [3]:
dfA

Unnamed: 0,id,name,addr,city,ctry,code
0,1,1 MOBILE LIMITED,30 CITY ROAD,LONDON,UK,EC1Y 2AB
1,2,1 TECH LTD,57 CHARTERHOUSE STREET,LONDON,UK,EC1M 6HA
2,3,23 SNAPS LIMITED,16 BOWLING GREEN LANE,LONDON,UK,EC1R 0BD
3,4,2E2 SERVICES LIMITED,200 200 ALDERSGATE ALDERSGATE STREET,LONDON,UK,EC1A 4HD
4,5,2E2 UK LIMITED,200 ALDERSGATE ALDERSGATE STREET,LONDON,UK,EC1A 4HD
5,6,40 50 MEDIA LTD,145-157 ST JOHN STREET,LONDON,UK,EC1V 4PW
6,7,4D DATA CENTRES LIMITED,30 CITY ROAD,LONDON,UK,EC1 2AB
7,8,4GETMOBILE LIMITED,152 KEMP HOUSE CITY ROAD,LONDON,UK,EC1V 2NX
8,9,4SL CONSULTING LIMITED,4 SNOW HILL,LONDON,UK,EC1A 2DJ
9,10,4TC SERVICES LIMITED,355 GOSWELL ROAD,LONDON,UK,EC1V 7JL


In [4]:
dfB

Unnamed: 0,id,name,addr,city,ctry,code
0,1,company,30 CITY ROAD,LONDON,UK,EC1Y 2AB


## 3. Make Record Pairs

### 3.1 Block Indexing for Record Pairs

In [5]:
#############   BLOCK INDEXING
def runBlock():
    """Try exact-match ("block") indexing between dfA and dfB on name and addr.

    Three block indexes are built: on ``name``, on ``addr`` and on
    ``name`` + ``addr`` together. The outcome is dispatched as follows:

    * a combined name+addr pair exists -> resolve those pairs;
    * name and addr both hit, but no combined pair exists (the hits belong
      to different records) -> only a message is printed, nothing is resolved;
    * only name hits -> resolve the name pairs;
    * only addr hits -> resolve the addr pairs;
    * nothing hits -> fall back to sorted-neighbourhood indexing (runSort).

    Returns
    -------
    The value returned by returnResultsMI() or runSort() for the branch
    taken, or None in the "different records" case.
    """
    # Exact match on name
    blockNamePairs = recordlinkage.BlockIndex(on=['name']).index(dfA, dfB)
    # Exact match on addr
    blockAddrPairs = recordlinkage.BlockIndex(on=['addr']).index(dfA, dfB)
    # Exact match on name AND addr for the same record
    blockNAPairs = recordlinkage.BlockIndex(on=['name', 'addr']).index(dfA, dfB)

    # A combined hit takes precedence over the single-field hits.
    if len(blockNAPairs) > 0:
        print("block name and addr match")
        return returnResultsMI(blockNAPairs)

    nameHit = len(blockNamePairs) > 0
    addrHit = len(blockAddrPairs) > 0

    if nameHit and addrHit:
        # Name matched one record and addr matched another.
        # NOTE(review): nothing is added to `results` here, so the user sees
        # "no results" although exact hits exist - confirm this is intended.
        print("block different name and addr match")
        return None
    if nameHit:
        print("block name match")
        return returnResultsMI(blockNamePairs)
    if addrHit:
        print("block addr match")
        return returnResultsMI(blockAddrPairs)

    # No exact hit at all: continue with sorted-neighbourhood indexing and
    # hand its outcome back, like every other branch does.
    return runSort()
###############

### 3.2 Sorted Neighborhood Indexing

In [6]:
#############   SORTED NEIGHBORHOOD INDEXING
def runSort():
    """Build sorted-neighbourhood candidate pairs on name and on addr.

    The two pair sets are stored in the module-level names
    ``sortedNamePairs`` and ``sortedAddrPairs`` because runCompare() looks
    them up at module level. Dispatch:

    * exactly one of the two sets is non-empty -> runCompare();
    * both are non-empty, or both are empty   -> runFull().

    Returns
    -------
    Whatever runCompare() / runFull() returns for the branch taken.
    """
    # Published as globals: runCompare() reads these names at module level.
    global sortedNamePairs, sortedAddrPairs

    sortedNamePairs = recordlinkage.SortedNeighbourhoodIndex(on='name').index(dfA, dfB)
    sortedAddrPairs = recordlinkage.SortedNeighbourhoodIndex(on='addr').index(dfA, dfB)

    nameHit = len(sortedNamePairs) > 0
    addrHit = len(sortedAddrPairs) > 0

    if nameHit and addrHit:
        # Both fields produced candidates; the original design defers to the
        # full index here ("compare with both pairs?").
        print("sort both match")
        return runFull()
    if nameHit:
        # compare with name pairs
        print("sort name match")
        return runCompare()
    if addrHit:
        # compare with addr pairs
        print("sort addr match")
        return runCompare()

    # Neither field produced candidates.
    print("full index NEXT")
    return runFull()

## 4. Compare Record Pairs

In [7]:
######### COMPARE
def runCompare(pairs=None):
    """Score candidate pairs field by field, then hand over to runMatch().

    Five comparisons are configured: Jaro-Winkler on name and addr
    (threshold 0.95), exact match on city and ctry, and Jaro-Winkler on
    code (threshold 0.90).

    Parameters
    ----------
    pairs : optional
        Candidate pairs to score. When given, both the name and the addr
        feature tables are computed from it. When omitted, the module-level
        ``sortedNamePairs`` and ``sortedAddrPairs`` are used instead.

    Returns
    -------
    Whatever runMatch() returns.
    """
    # runMatch() reads these two names at module level, so they must be
    # published as globals rather than kept local to this function.
    global featuresName, featuresAddr

    compare = recordlinkage.Compare()

    compare.string('name', 'name', method='jarowinkler', threshold=0.95)
    compare.string('addr', 'addr', method='jarowinkler', threshold=0.95)
    compare.exact('city', 'city')
    compare.exact('ctry', 'ctry')
    compare.string('code', 'code', method='jarowinkler', threshold=0.90)

    if pairs is not None:
        # An explicit candidate set wins and is used for both tables.
        namePairs = addrPairs = pairs
    else:
        # Fall back to the pair sets stored at module level.
        namePairs, addrPairs = sortedNamePairs, sortedAddrPairs

    # The comparison vectors for name
    featuresName = compare.compute(namePairs, dfA, dfB)

    # The comparison vectors for addr
    featuresAddr = compare.compute(addrPairs, dfA, dfB)

    return runMatch()

## 5. Classification and Matching

In [8]:
# Classification step
def _bestMatches(features):
    """Return the rows whose comparison sum is > 4, or else those with sum > 3."""
    agreement = features.sum(axis=1)
    strict = features[agreement > 4]
    if len(strict) > 0:
        return strict
    return features[agreement > 3]


def runMatch():
    """Classify the compared pairs and resolve the best ones to result strings.

    Reads the module-level comparison tables ``featuresName`` and
    ``featuresAddr``. For each table the rows with a comparison sum > 4 are
    kept; if there are none, the rows with a sum > 3 are kept instead.

    Dispatch:

    * addr matches exist (with or without name matches) -> resolve addr
      matches (addr is preferred over name);
    * only name matches exist -> resolve name matches;
    * no matches -> fall back to runFull().

    Returns
    -------
    Whatever returnResultsDF() / runFull() returns for the branch taken.
    """
    matchesName = _bestMatches(featuresName)
    print(len(matchesName))

    matchesAddr = _bestMatches(featuresAddr)
    print(len(matchesAddr))

    if len(matchesAddr) > 0:
        # grab the id and contents of addr match; preference towards addr
        print("sort addr match")
        return returnResultsDF(matchesAddr)
    if len(matchesName) > 0:
        # grab the id and contents of name match
        print("sort name match")
        return returnResultsDF(matchesName)

    # Nothing matched: run the full index and return its outcome, like the
    # other branches do.
    print("Full Index NEXT")
    return runFull()

## 6. Full Index Pairs and Classification

In [9]:
####### If nothing else finds matches, run FULL INDEX
def runFull():
    """Last resort: compare every dfA record with every dfB record.

    A full index over dfA x dfB is scored with the same five comparisons
    used elsewhere in the notebook (Jaro-Winkler on name, addr and code;
    exact match on city and ctry). Pairs with a comparison sum > 4 are
    resolved if any exist; otherwise the pairs with a sum > 3 are resolved
    (possibly none).

    Returns
    -------
    Whatever returnResultsDF() returns for the selected pairs.
    """
    candidatePairs = recordlinkage.FullIndex().index(dfA, dfB)

    comparer = recordlinkage.Compare()
    for field in ('name', 'addr'):
        comparer.string(field, field, method='jarowinkler', threshold=0.95)
    for field in ('city', 'ctry'):
        comparer.exact(field, field)
    comparer.string('code', 'code', method='jarowinkler', threshold=0.90)

    scores = comparer.compute(candidatePairs, dfA, dfB)
    agreement = scores.sum(axis=1)

    # Try the strictest cut (> 4) first, then relax to > 3.
    strictMatches = scores[agreement > 4]
    if len(strictMatches) > 0:
        print("full match all")
        return returnResultsDF(strictMatches)

    print("full match")
    return returnResultsDF(scores[agreement > 3])
    

## 7. Return Results

In [10]:
######## RETURN FROM MULTIINDEX

def returnResultsMI(pairs):
    """Resolve a MultiIndex of record pairs to display strings.

    For every pair, the first-level label is looked up in ``dfD`` and the
    row is rendered with ``to_string(header=False, index=False)``. Each
    string is appended to the module-level ``results`` list.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        Candidate pairs; level 0 is used as the row label into ``dfD``
        (presumably the dfA-side record label - assumes dfD shares dfA's
        row labels; TODO confirm).

    Returns
    -------
    list
        The module-level ``results`` list, after appending.
    """
    # Read level 0 directly instead of going through to_frame()[0], which
    # only works when the index levels are unnamed.
    for grab_id in pairs.get_level_values(0):
        result = dfD.loc[grab_id].to_string(header=False, index=False)
        results.append(result)
    return results

def returnResultsDF(pairs):
    """Resolve the rows of a comparison-feature DataFrame to display strings.

    The record labels are taken from level 0 of the frame's index. Each
    label is looked up in ``dfD``, rendered with
    ``to_string(header=False, index=False)`` and appended to the
    module-level ``results`` list.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Comparison features indexed by a pairs MultiIndex; level 0 is used
        as the row label into ``dfD`` (assumes dfD shares dfA's row labels -
        TODO confirm).

    Returns
    -------
    list
        The module-level ``results`` list, after appending.
    """
    # The ids live in the index. `pairs[0]` would select the first feature
    # column (similarity scores), not the record labels.
    for grab_id in pairs.index.get_level_values(0):
        result = dfD.loc[grab_id].to_string(header=False, index=False)
        results.append(result)
    return results

## 8. Main Method to Run

In [11]:
### MAIN METHOD
# First, check for exact match overall FULL
# Shared accumulator: the returnResults* helpers append display strings here.
results = []

# Exact match on every field at once.
blockIndexer = recordlinkage.BlockIndex(on=['name', 'addr', 'city', 'ctry', 'code'])
blockIndexPairs = blockIndexer.index(dfA, dfB)
if len(blockIndexPairs) > 0:
    # blockIndexPairs is a pairs MultiIndex, so it is resolved with
    # returnResultsMI (there is no helper called `returnResults`).
    print(returnResultsMI(blockIndexPairs))
else:
    # No all-field exact match: start the staged search with per-field blocking.
    runBlock()

print(results)

block addr match
['1 MOBILE LIMITED 30 CITY ROAD LONDON EC1Y 2AB', '4D DATA CENTRES LIMITED 30 CITY ROAD LONDON EC...', 'NCSOFT EUROPE LIMITED 30 CITY ROAD LONDON EC1Y...']


## 9. Output to the UI (variable `outputRL`), based on result length

In [12]:
## OUTPUT TO UI
# Build the user-facing message from the number of record-linkage results:
# none, one to three suggestions (one per line), or "too many".
resultCount = len(results)

if resultCount == 0:
    outputRL = "No results were found using Record Linkage"
elif resultCount > 3:
    outputRL = "Number of results: " + str(resultCount) + ". Too many match results found. Please enter new data."
else:
    # One lead-in per suggestion, in display order.
    leadIns = ["Did you mean: ", "Or possibly: ", "Or possibly this: "]
    outputRL = "\n".join(leadIn + hit + " ?" for leadIn, hit in zip(leadIns, results))

print(outputRL)
    

Did you mean: 1 MOBILE LIMITED 30 CITY ROAD LONDON EC1Y 2AB ?
Or possibly: 4D DATA CENTRES LIMITED 30 CITY ROAD LONDON EC... ?
Or possibly this: NCSOFT EUROPE LIMITED 30 CITY ROAD LONDON EC1Y... ?
