In [1]:
import pandas as pd
import recordlinkage
from recordlinkage.preprocessing import clean, phonetic

In [2]:
left_dataset_path = './data/left_dataset.csv'
right_dataset_path = './data/right_dataset.csv'
left = pd.read_csv(left_dataset_path)
right = pd.read_csv(right_dataset_path)

In [3]:
left

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories
0,1,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123.0,"Shipping Centers, Local Services, Notaries, Ma..."
1,2,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
2,3,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054.0,"Brewpubs, Breweries, Food"
3,4,Sonic Drive-In,615 S Main St,Ashland City,TN,37015.0,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
4,5,Famous Footwear,"8522 Eager Road, Dierbergs Brentwood Point",Brentwood,MO,63144.0,"Sporting Goods, Fashion, Shoe Stores, Shopping..."
...,...,...,...,...,...,...,...
94580,94581,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147.0,"Restaurants, Mexican"
94581,94582,INSPcenter/Thai Clinical Massage,"2625 N Meridian St, Unit 50",Indianapolis,IN,46208.0,"Massage Therapy, Rolfing, Beauty & Spas, Refle..."
94582,94583,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204.0,"Pets, Nurseries & Gardening, Pet Stores, Hobby..."
94583,94584,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250.0,"Shopping, Jewelry, Piercing, Toy Stores, Beaut..."


In [4]:
right

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,SOURINI PAINTING INC.,12800 44th St N,Clearwater,FL,33762-4726,11.0
1,2,WOLFF DOLLA BILL LLC,1905 E 19th Ave,Tampa,FL,33605-2700,8.0
2,3,"COMPREHENSIVE SURGERY CENTER, LLC","1988 GULF TO BAY BLVD, Ste 1",CLEARWATER,FL,33765-3550,8.0
3,4,FRANK & ADAM APPAREL LLC,13640 Wright Cir,Tampa,FL,33626-3030,12.0
4,5,MORENO PLUS TRANSPORT INC,8608 Huron Court unite 58,Tampa,FL,33614,8.0
...,...,...,...,...,...,...,...
91786,91787,DEH TRANSPORT LLC,737 E Statue Ct,Franklin,TN,37067-5637,2.0
91787,91788,TM INC. LLC,4911 Georgia Ave,Nashville,TN,37209-2135,4.0
91788,91789,"PASSION TRANSPORTATION SERVICES, LLC",2005 QUAIL DR,NASHVILLE,TN,37207,2.0
91789,91790,ROSIE CLEANING SERVICE/COMMERCIAL,705 N 9TH ST,NASHVILLE,TN,37206-3907,5.0


In [5]:
def preprocess_data(df):
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = df[column].str.replace(r'[^a-zA-Z0-9]', '', regex=True).str.lower()
    return df

In [6]:
left = preprocess_data(left)
right = preprocess_data(right)

In [7]:
left

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories
0,1,theupsstore,87grassoplazashoppingcenter,affton,mo,63123.0,shippingcenterslocalservicesnotariesmailboxcen...
1,2,sthonorepastries,935racest,philadelphia,pa,19107.0,restaurantsfoodbubbleteacoffeeteabakeries
2,3,perkiomenvalleybrewery,101walnutst,greenlane,pa,18054.0,brewpubsbreweriesfood
3,4,sonicdrivein,615smainst,ashlandcity,tn,37015.0,burgersfastfoodsandwichesfoodicecreamfrozenyog...
4,5,famousfootwear,8522eagerroaddierbergsbrentwoodpoint,brentwood,mo,63144.0,sportinggoodsfashionshoestoresshoppingsportswe...
...,...,...,...,...,...,...,...
94580,94581,adelitataqueriarestaurant,1108s9thst,philadelphia,pa,19147.0,restaurantsmexican
94581,94582,inspcenterthaiclinicalmassage,2625nmeridianstunit50,indianapolis,in,46208.0,massagetherapyrolfingbeautyspasreflexologyheal...
94582,94583,wildbirdsunlimited,2813bransfordave,nashville,tn,37204.0,petsnurseriesgardeningpetstoreshobbyshopsbirds...
94583,94584,clairesboutique,6020e82ndstste46,indianapolis,in,46250.0,shoppingjewelrypiercingtoystoresbeautyspasacce...


In [8]:
right

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,sourinipaintinginc,1280044thstn,clearwater,fl,337624726,11.0
1,2,wolffdollabillllc,1905e19thave,tampa,fl,336052700,8.0
2,3,comprehensivesurgerycenterllc,1988gulftobayblvdste1,clearwater,fl,337653550,8.0
3,4,frankadamapparelllc,13640wrightcir,tampa,fl,336263030,12.0
4,5,morenoplustransportinc,8608huroncourtunite58,tampa,fl,33614,8.0
...,...,...,...,...,...,...,...
91786,91787,dehtransportllc,737estatuect,franklin,tn,370675637,2.0
91787,91788,tmincllc,4911georgiaave,nashville,tn,372092135,4.0
91788,91789,passiontransportationservicesllc,2005quaildr,nashville,tn,37207,2.0
91789,91790,rosiecleaningservicecommercial,705n9thst,nashville,tn,372063907,5.0


In [9]:
print(left.columns.tolist())

['entity_id', 'name', 'address', 'city', 'state', 'postal_code', 'categories']


In [10]:
print(right.columns.tolist())

['business_id', 'name', 'address', 'city', 'state', 'zip_code', 'size']


In [11]:
left.rename(columns={'postal_code': 'zip_code'}, inplace=True)

In [12]:
left

Unnamed: 0,entity_id,name,address,city,state,zip_code,categories
0,1,theupsstore,87grassoplazashoppingcenter,affton,mo,63123.0,shippingcenterslocalservicesnotariesmailboxcen...
1,2,sthonorepastries,935racest,philadelphia,pa,19107.0,restaurantsfoodbubbleteacoffeeteabakeries
2,3,perkiomenvalleybrewery,101walnutst,greenlane,pa,18054.0,brewpubsbreweriesfood
3,4,sonicdrivein,615smainst,ashlandcity,tn,37015.0,burgersfastfoodsandwichesfoodicecreamfrozenyog...
4,5,famousfootwear,8522eagerroaddierbergsbrentwoodpoint,brentwood,mo,63144.0,sportinggoodsfashionshoestoresshoppingsportswe...
...,...,...,...,...,...,...,...
94580,94581,adelitataqueriarestaurant,1108s9thst,philadelphia,pa,19147.0,restaurantsmexican
94581,94582,inspcenterthaiclinicalmassage,2625nmeridianstunit50,indianapolis,in,46208.0,massagetherapyrolfingbeautyspasreflexologyheal...
94582,94583,wildbirdsunlimited,2813bransfordave,nashville,tn,37204.0,petsnurseriesgardeningpetstoreshobbyshopsbirds...
94583,94584,clairesboutique,6020e82ndstste46,indianapolis,in,46250.0,shoppingjewelrypiercingtoystoresbeautyspasacce...


In [13]:
right

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,sourinipaintinginc,1280044thstn,clearwater,fl,337624726,11.0
1,2,wolffdollabillllc,1905e19thave,tampa,fl,336052700,8.0
2,3,comprehensivesurgerycenterllc,1988gulftobayblvdste1,clearwater,fl,337653550,8.0
3,4,frankadamapparelllc,13640wrightcir,tampa,fl,336263030,12.0
4,5,morenoplustransportinc,8608huroncourtunite58,tampa,fl,33614,8.0
...,...,...,...,...,...,...,...
91786,91787,dehtransportllc,737estatuect,franklin,tn,370675637,2.0
91787,91788,tmincllc,4911georgiaave,nashville,tn,372092135,4.0
91788,91789,passiontransportationservicesllc,2005quaildr,nashville,tn,37207,2.0
91789,91790,rosiecleaningservicecommercial,705n9thst,nashville,tn,372063907,5.0


In [14]:
string_columns = ['address', 'city', 'state', 'zip_code']
for column in string_columns:
    left[column] = left[column].astype(str)
    right[column] = right[column].astype(str)

In [15]:
left['address'] = left['address'].str.cat(left[['city', 'state', 'zip_code']], sep=' ')
right['address'] = right['address'].str.cat(right[['city', 'state', 'zip_code']], sep=' ')

In [16]:
left

Unnamed: 0,entity_id,name,address,city,state,zip_code,categories
0,1,theupsstore,87grassoplazashoppingcenter affton mo 63123.0,affton,mo,63123.0,shippingcenterslocalservicesnotariesmailboxcen...
1,2,sthonorepastries,935racest philadelphia pa 19107.0,philadelphia,pa,19107.0,restaurantsfoodbubbleteacoffeeteabakeries
2,3,perkiomenvalleybrewery,101walnutst greenlane pa 18054.0,greenlane,pa,18054.0,brewpubsbreweriesfood
3,4,sonicdrivein,615smainst ashlandcity tn 37015.0,ashlandcity,tn,37015.0,burgersfastfoodsandwichesfoodicecreamfrozenyog...
4,5,famousfootwear,8522eagerroaddierbergsbrentwoodpoint brentwood...,brentwood,mo,63144.0,sportinggoodsfashionshoestoresshoppingsportswe...
...,...,...,...,...,...,...,...
94580,94581,adelitataqueriarestaurant,1108s9thst philadelphia pa 19147.0,philadelphia,pa,19147.0,restaurantsmexican
94581,94582,inspcenterthaiclinicalmassage,2625nmeridianstunit50 indianapolis in 46208.0,indianapolis,in,46208.0,massagetherapyrolfingbeautyspasreflexologyheal...
94582,94583,wildbirdsunlimited,2813bransfordave nashville tn 37204.0,nashville,tn,37204.0,petsnurseriesgardeningpetstoreshobbyshopsbirds...
94583,94584,clairesboutique,6020e82ndstste46 indianapolis in 46250.0,indianapolis,in,46250.0,shoppingjewelrypiercingtoystoresbeautyspasacce...


In [17]:
right

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,sourinipaintinginc,1280044thstn clearwater fl 337624726,clearwater,fl,337624726,11.0
1,2,wolffdollabillllc,1905e19thave tampa fl 336052700,tampa,fl,336052700,8.0
2,3,comprehensivesurgerycenterllc,1988gulftobayblvdste1 clearwater fl 337653550,clearwater,fl,337653550,8.0
3,4,frankadamapparelllc,13640wrightcir tampa fl 336263030,tampa,fl,336263030,12.0
4,5,morenoplustransportinc,8608huroncourtunite58 tampa fl 33614,tampa,fl,33614,8.0
...,...,...,...,...,...,...,...
91786,91787,dehtransportllc,737estatuect franklin tn 370675637,franklin,tn,370675637,2.0
91787,91788,tmincllc,4911georgiaave nashville tn 372092135,nashville,tn,372092135,4.0
91788,91789,passiontransportationservicesllc,2005quaildr nashville tn 37207,nashville,tn,37207,2.0
91789,91790,rosiecleaningservicecommercial,705n9thst nashville tn 372063907,nashville,tn,372063907,5.0


In [18]:
columns_to_keep_left = ['entity_id', 'name', 'address']
columns_to_keep_right = ['business_id', 'name', 'address']
left_common = left[columns_to_keep_left]
right_common = right[columns_to_keep_right]

In [19]:
left_common

Unnamed: 0,entity_id,name,address
0,1,theupsstore,87grassoplazashoppingcenter affton mo 63123.0
1,2,sthonorepastries,935racest philadelphia pa 19107.0
2,3,perkiomenvalleybrewery,101walnutst greenlane pa 18054.0
3,4,sonicdrivein,615smainst ashlandcity tn 37015.0
4,5,famousfootwear,8522eagerroaddierbergsbrentwoodpoint brentwood...
...,...,...,...
94580,94581,adelitataqueriarestaurant,1108s9thst philadelphia pa 19147.0
94581,94582,inspcenterthaiclinicalmassage,2625nmeridianstunit50 indianapolis in 46208.0
94582,94583,wildbirdsunlimited,2813bransfordave nashville tn 37204.0
94583,94584,clairesboutique,6020e82ndstste46 indianapolis in 46250.0


In [20]:
right_common

Unnamed: 0,business_id,name,address
0,1,sourinipaintinginc,1280044thstn clearwater fl 337624726
1,2,wolffdollabillllc,1905e19thave tampa fl 336052700
2,3,comprehensivesurgerycenterllc,1988gulftobayblvdste1 clearwater fl 337653550
3,4,frankadamapparelllc,13640wrightcir tampa fl 336263030
4,5,morenoplustransportinc,8608huroncourtunite58 tampa fl 33614
...,...,...,...
91786,91787,dehtransportllc,737estatuect franklin tn 370675637
91787,91788,tmincllc,4911georgiaave nashville tn 372092135
91788,91789,passiontransportationservicesllc,2005quaildr nashville tn 37207
91789,91790,rosiecleaningservicecommercial,705n9thst nashville tn 372063907


In [21]:
left_common['block_key'] = left_common['name'].str[:5]
right_common['block_key'] = right_common['name'].str[:5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  left_common['block_key'] = left_common['name'].str[:5]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  right_common['block_key'] = right_common['name'].str[:5]


In [22]:
indexer = recordlinkage.Index()
indexer.block('block_key')
candidate_links = indexer.index(left_common, right_common)

In [23]:
comparer = recordlinkage.Compare()
comparer.string('name', 'name', method='jarowinkler', label='name_similarity')
comparer.string('address', 'address', method='jarowinkler', label='address_similarity')

<Compare>

In [24]:
features = comparer.compute(candidate_links, left_common, right_common)

In [25]:
features['confidence_score'] = features.mean(axis=1)

In [26]:
threshold = 0.8
matches = features[features['confidence_score'] >= threshold]

In [27]:
matches

Unnamed: 0,Unnamed: 1,name_similarity,address_similarity,confidence_score
7,81911,0.877419,0.747351,0.812385
7,82926,0.863158,0.928363,0.895760
7,84020,0.864865,0.921199,0.893032
3260,82926,0.863158,0.754873,0.809016
3260,83734,0.888889,0.728115,0.808502
...,...,...,...,...
94033,79875,1.000000,0.988235,0.994118
94034,72786,0.964706,0.970231,0.967469
94036,53965,0.925000,0.981602,0.953301
94167,69598,0.950000,0.970231,0.960116


In [28]:
matches['entity_id'] = matches.index.get_level_values(0)
matches['business_id'] = matches.index.get_level_values(1)
triplets = matches[['entity_id', 'business_id', 'confidence_score']].reset_index(drop=True)
triplets_list = [list(row) for row in triplets.values]
print(triplets_list)

[[7.0, 81911.0, 0.8123852124106793], [7.0, 82926.0, 0.8957602339181288], [7.0, 84020.0, 0.8930318476371109], [3260.0, 82926.0, 0.8090155945419104], [3260.0, 83734.0, 0.8085021694019996], [3260.0, 83848.0, 0.9171692745376956], [14768.0, 66427.0, 0.8370291693208359], [18149.0, 83954.0, 0.889871365423148], [18149.0, 84352.0, 0.9226278060743256], [27426.0, 82396.0, 0.9309582309582309], [27426.0, 82926.0, 0.8424564915792987], [27426.0, 83734.0, 0.8422737874350779], [34431.0, 83954.0, 0.8483571068036264], [34431.0, 84352.0, 0.823699930830661], [39881.0, 82926.0, 0.8042657289776165], [44101.0, 83954.0, 0.8069187956284731], [44101.0, 84352.0, 0.8012880931556654], [50104.0, 83954.0, 0.9017636926971864], [50104.0, 84020.0, 0.8008832002734443], [50104.0, 84352.0, 0.8458521401852388], [56401.0, 83848.0, 0.8370120120120121], [63579.0, 83954.0, 0.8380814175805686], [63579.0, 84352.0, 0.8078054938835924], [65096.0, 82396.0, 0.8327694527694528], [65096.0, 82926.0, 0.8211816409711147], [65096.0, 83734.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches['entity_id'] = matches.index.get_level_values(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches['business_id'] = matches.index.get_level_values(1)


In [29]:
columns=columns = ['entity_id', 'business_id', 'confidence_score']
matched = pd.DataFrame(matches[['entity_id', 'business_id', 'confidence_score']], columns=columns)

In [30]:
print(matched)

             entity_id  business_id  confidence_score
7     81911          7        81911          0.812385
      82926          7        82926          0.895760
      84020          7        84020          0.893032
3260  82926       3260        82926          0.809016
      83734       3260        83734          0.808502
...                ...          ...               ...
94033 79875      94033        79875          0.994118
94034 72786      94034        72786          0.967469
94036 53965      94036        53965          0.953301
94167 69598      94167        69598          0.960116
94538 79362      94538        79362          0.946438

[33298 rows x 3 columns]


In [31]:
matched.rename(columns={'entity_id':'left_dataset','business_id':'right_dataset','confidence_score':'confidence_score'})

Unnamed: 0,Unnamed: 1,left_dataset,right_dataset,confidence_score
7,81911,7,81911,0.812385
7,82926,7,82926,0.895760
7,84020,7,84020,0.893032
3260,82926,3260,82926,0.809016
3260,83734,3260,83734,0.808502
...,...,...,...,...
94033,79875,94033,79875,0.994118
94034,72786,94034,72786,0.967469
94036,53965,94036,53965,0.953301
94167,69598,94167,69598,0.960116
