In [43]:
pip install jellyfish


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [44]:
import pandas as pd
import jellyfish

In [45]:
def _zip_prefix(codes):
    """Return the five-digit ZIP prefix for every value in ``codes``.

    Values are converted to text, the leading run of digits is taken and
    left-padded with zeros to five characters, so ``2134.0`` (a ZIP that was
    parsed as a float and lost its leading zero), ``"02134"`` and
    ``"02134-1234"`` all map to ``"02134"``. Missing values, and values that do
    not start with a digit, come back as missing (``<NA>``).
    """
    leading_digits = (
        codes.astype("string")
        .str.strip()
        .str.extract(r"^(\d+)", expand=False)
    )
    return leading_digits.str.zfill(5).str[:5]


def clean_and_merge(left_df, right_df):
    """Inner-join the two tables on the five-digit prefix of their ZIP codes.

    Parameters
    ----------
    left_df : pandas.DataFrame
        Must contain ``address`` and ``postal_code`` columns.
    right_df : pandas.DataFrame
        Must contain ``address`` and ``zip_code`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (left, right) pair sharing a ``zip_prefix``. Columns that
        exist in both inputs get pandas' default ``_x`` / ``_y`` suffixes.

    Notes
    -----
    * The inputs are not modified; all cleaning happens on copies.
    * Missing addresses are replaced by empty strings.
    * ``postal_code`` / ``zip_code`` are returned as strings, as before.
    * Rows without a usable ZIP code are dropped before the join. Otherwise
      every left row with a missing ZIP would be paired with every right row
      with a missing ZIP.
    """
    # assign() returns new frames, so the caller's DataFrames stay untouched.
    # The prefix is derived from the original values, not from the str() form.
    left = left_df.assign(
        address=left_df['address'].fillna(''),
        zip_prefix=_zip_prefix(left_df['postal_code']),
        postal_code=left_df['postal_code'].astype(str),
    )
    right = right_df.assign(
        address=right_df['address'].fillna(''),
        zip_prefix=_zip_prefix(right_df['zip_code']),
        zip_code=right_df['zip_code'].astype(str),
    )

    # pandas matches missing join keys with each other, so remove them first.
    left = left[left['zip_prefix'].notna()]
    right = right[right['zip_prefix'].notna()]

    # Merge datasets on 'zip_prefix'
    return pd.merge(left, right, how='inner', on='zip_prefix')

In [46]:
def _comparable_text(value):
    """Return ``value`` as lower-cased text; missing values become ''."""
    if isinstance(value, str):
        return value.lower()
    # None, NaN and pd.NA have no .lower(); treat them as "no text".
    if value is None or pd.isna(value):
        return ''
    return str(value).lower()


def calculate_similarity(row):
    """Score how closely the two records joined in one merged row agree.

    The names (``name_x`` vs ``name_y``) and the addresses (``address_x`` vs
    ``address_y``) are compared case-insensitively with
    ``jellyfish.jaro_winkler_similarity``; the result is the plain average of
    the two similarity rates.

    Missing values (None / NaN) are compared as empty strings and non-string
    values are converted with ``str()``, so a record without a name no longer
    raises ``AttributeError``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``name_x``, ``address_x``, ``name_y`` and ``address_y``.

    Returns
    -------
    float
        Mean of the name similarity and the address similarity.
    """
    left_name = _comparable_text(row['name_x'])
    left_address = _comparable_text(row['address_x'])
    right_name = _comparable_text(row['name_y'])
    right_address = _comparable_text(row['address_y'])

    name_similarity = jellyfish.jaro_winkler_similarity(left_name, right_name)
    address_similarity = jellyfish.jaro_winkler_similarity(left_address, right_address)

    return (name_similarity + address_similarity) / 2

In [47]:
def find_high_confidence_matches(merged_df, threshold=0.80):
    """Score every candidate pair and keep those scoring above ``threshold``.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Candidate pairs. Must contain ``entity_id`` and ``business_id`` plus
        the columns ``calculate_similarity`` reads from each row.
    threshold : float, default 0.80
        Only rows whose score is strictly greater than this value are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``entity_id``, ``business_id`` and ``similarity_score`` for
        the retained rows, keeping their index labels from ``merged_df``.

    Notes
    -----
    A ``similarity_score`` column is added to ``merged_df`` itself (existing
    behaviour, kept so callers can inspect all scores afterwards).
    """
    if len(merged_df) == 0:
        # Row-wise apply on an empty frame may hand back a DataFrame instead
        # of a Series, which cannot be assigned to a single column. Skip the
        # call and use an empty float column.
        scores = pd.Series(index=merged_df.index, dtype=float)
    else:
        scores = merged_df.apply(calculate_similarity, axis=1)
    merged_df['similarity_score'] = scores

    # Filter high confidence outcomes
    above_threshold = merged_df['similarity_score'] > threshold

    # Selecting only the desired columns
    return merged_df.loc[above_threshold, ['entity_id', 'business_id', 'similarity_score']]

In [48]:
# Read the two input tables (left first, then right) from the working directory
left_df, right_df = (
    pd.read_csv(csv_path)
    for csv_path in ('left_dataset.csv', 'right_dataset.csv')
)

In [50]:
# Preview the first five rows of the left table
left_df.head(5)

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories,zip_prefix
0,1,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123.0,"Shipping Centers, Local Services, Notaries, Ma...",63123
1,2,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",19107
2,3,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054.0,"Brewpubs, Breweries, Food",18054
3,4,Sonic Drive-In,615 S Main St,Ashland City,TN,37015.0,"Burgers, Fast Food, Sandwiches, Food, Ice Crea...",37015
4,5,Famous Footwear,"8522 Eager Road, Dierbergs Brentwood Point",Brentwood,MO,63144.0,"Sporting Goods, Fashion, Shoe Stores, Shopping...",63144


In [51]:
# Preview the first five rows of the right table
right_df.head(5)

Unnamed: 0,business_id,name,address,city,state,zip_code,size,zip_prefix
0,1,SOURINI PAINTING INC.,12800 44th St N,Clearwater,FL,33762-4726,11.0,33762
1,2,WOLFF DOLLA BILL LLC,1905 E 19th Ave,Tampa,FL,33605-2700,8.0,33605
2,3,"COMPREHENSIVE SURGERY CENTER, LLC","1988 GULF TO BAY BLVD, Ste 1",CLEARWATER,FL,33765-3550,8.0,33765
3,4,FRANK & ADAM APPAREL LLC,13640 Wright Cir,Tampa,FL,33626-3030,12.0,33626
4,5,MORENO PLUS TRANSPORT INC,8608 Huron Court unite 58,Tampa,FL,33614,8.0,33614


In [52]:
# Step 1: pair up records from both tables that share a ZIP prefix
merged_df = clean_and_merge(left_df=left_df, right_df=right_df)

# Step 2: score each pair and keep those whose score exceeds 0.80
high_confidence_matches = find_high_confidence_matches(merged_df=merged_df, threshold=0.80)

In [53]:
# Display high confidence matches.
# A bare last expression uses the notebook's rich table rendering instead of plain-text print().
high_confidence_matches

          entity_id  business_id  similarity_score
1183              2        78912          0.870370
1184              2        78913          0.874644
1697              7        36752          0.823996
2161              8        82927          0.875940
2192              8        84021          0.889006
...             ...          ...               ...
34699424      94560        74887          0.816667
34704127      94578        37588          0.800659
34704206      94578        39631          0.815330
34705583      94579        80357          0.951985
34706871      94583        85069          0.803968

[25659 rows x 3 columns]


In [54]:
# Persist the matched id pairs and their scores; the DataFrame index is not written
high_confidence_matches.to_csv(path_or_buf='Jellyfish_matches.csv', index=False)