In [1]:
pip install jellyfish

Collecting jellyfish
  Obtaining dependency information for jellyfish from https://files.pythonhosted.org/packages/e3/c3/17d56326246f5682c058894e3a73c6932550f0fe73cefc491fc9cd387f73/jellyfish-1.0.3-cp312-cp312-macosx_11_0_arm64.whl.metadata
  Downloading jellyfish-1.0.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.5 kB)
Downloading jellyfish-1.0.3-cp312-cp312-macosx_11_0_arm64.whl (342 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m343.0/343.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: jellyfish
Successfully installed jellyfish-1.0.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import jellyfish

In [3]:
# Load datasets 
left_df = pd.read_csv('left_dataset.csv')
right_df = pd.read_csv('right_dataset.csv')

In [4]:
left_df.head()

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories
0,1,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123.0,"Shipping Centers, Local Services, Notaries, Ma..."
1,2,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
2,3,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054.0,"Brewpubs, Breweries, Food"
3,4,Sonic Drive-In,615 S Main St,Ashland City,TN,37015.0,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
4,5,Famous Footwear,"8522 Eager Road, Dierbergs Brentwood Point",Brentwood,MO,63144.0,"Sporting Goods, Fashion, Shoe Stores, Shopping..."


In [5]:
right_df.head()

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,SOURINI PAINTING INC.,12800 44th St N,Clearwater,FL,33762-4726,11.0
1,2,WOLFF DOLLA BILL LLC,1905 E 19th Ave,Tampa,FL,33605-2700,8.0
2,3,"COMPREHENSIVE SURGERY CENTER, LLC","1988 GULF TO BAY BLVD, Ste 1",CLEARWATER,FL,33765-3550,8.0
3,4,FRANK & ADAM APPAREL LLC,13640 Wright Cir,Tampa,FL,33626-3030,12.0
4,5,MORENO PLUS TRANSPORT INC,8608 Huron Court unite 58,Tampa,FL,33614,8.0


In [6]:
# Fill NaN values with empty strings
left_df['address'] = left_df['address'].fillna('')
right_df['address'] = right_df['address'].fillna('')

# Convert 'postal_code' to string type to prevent the potential float issue
left_df['postal_code'] = left_df['postal_code'].astype(str)
right_df['zip_code'] = right_df['zip_code'].astype(str)

In [7]:
# Function to calculate similarity score
#Mainly think of comparing the rate of similarity between the name, and the address using the jellyfish package
#And return the average of the similarity rate
def calculate_similarity(row):
    left_name = row['name_x']
    left_address = row['address_x']
    right_name = row['name_y']
    right_address = row['address_y']
    
    name_similarity = jellyfish.jaro_winkler_similarity(left_name.lower(), right_name.lower())
    address_similarity = jellyfish.jaro_winkler_similarity(left_address.lower(), right_address.lower())
    
    return (name_similarity + address_similarity) / 2

In [8]:
# Merge datasets on first five digits of zip code
left_df['zip_prefix'] = left_df['postal_code'].str[:5]
right_df['zip_prefix'] = right_df['zip_code'].str[:5]
merged_df = pd.merge(left_df, right_df, how='inner', on='zip_prefix')

# Calculate similarity score
merged_df['similarity_score'] = merged_df.apply(calculate_similarity, axis=1)

# Filter high confidence outcomes, set the threshold to 0.82
high_confidence_matches = merged_df[merged_df['similarity_score'] > 0.80]


In [9]:
len(high_confidence_matches)

25659

In [12]:
high_confidence_matches

Unnamed: 0,entity_id,name_x,address_x,city_x,state_x,postal_code,categories,zip_prefix,business_id,name_y,address_y,city_y,state_y,zip_code,size,similarity_score
1183,2,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",19107,78912,NEW ST HONORE PASTRIES INC.,935 Race St,Philadelphia,PA,19107-1805,4.0,0.870370
1184,2,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",19107,78913,NEW ST HONORE PASTRIES INC,935 RACE ST,PHILADELPHIA,PA,19107-1805,2.0,0.874644
1697,7,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123.0,"Pubs, Restaurants, Italian, Bars, American (Tr...",63123,36752,BRAVE SUNRISE LLC,8025 MACKENZIE RD,SAINT LOUIS,MO,63123-3518,30.0,0.823996
2161,8,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207.0,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...",37207,82927,"SONIC DRIVE-IN, NASHVILLE, DICKERSON ROAD #2, LLC",3904 Dickerson Pike,NASHVILLE,TN,37207,27.0,0.875940
2192,8,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207.0,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...",37207,84021,"SONIC DRIVE-IN, NASHVILLE, DICKERSON ROAD LLC",2312 Dickerson Road,NASHVILLE,TN,37207,21.0,0.889006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34699424,94560,Tata Cafe,7201 Germantown Ave,Philadelphia,PA,19119.0,"Sandwiches, Restaurants, Italian",19119,74887,MORINA CAFE LLC,7201 Germantown Ave,Philadelphia,PA,19119,3.0,0.816667
34704127,94578,Town & Country Dental Care,"2821 N Ballas Rd, Ste 163",Saint Louis,MO,63131.0,"Pediatric Dentists, Dentists, General Dentistr...",63131,37588,MIDWEST THERAPEUTIC ENDOSCOPY CONSULTANTS LLC,2821 N Ballas Rd Ste 110,Saint Louis,MO,63131-2314,7.0,0.800659
34704206,94578,Town & Country Dental Care,"2821 N Ballas Rd, Ste 163",Saint Louis,MO,63131.0,"Pediatric Dentists, Dentists, General Dentistr...",63131,39631,TOWN AND COUNTRY PEDIATRICS PC,3009 N BALLAS RD SUITE 131 A,SAINT LOUIS,MO,63131,7.0,0.815330
34705583,94579,Shanti Yoga and Ayurveda,"1638 Pine St, Fl 1",Philadelphia,PA,19103.0,"Health & Medical, Yoga, Shopping, Naturopathic...",19103,80357,SHANTI YOGA AND AYURVEDA LLC,1638 PIne Sr 1st fl,Philadelphia,PA,19103,8.0,0.951985


In [13]:
high_confidence_matches[['entity_id', 'business_id', 'similarity_score']].to_csv('Jellyfish_matches.csv', index=False)