# Dataset Import

In [1]:
import pandas as pd
# Load the datasets
left_dataset_path = './data/left_dataset.csv'
right_dataset_path = './data/right_dataset.csv'

left_dataset = pd.read_csv(left_dataset_path)
right_dataset = pd.read_csv(right_dataset_path)

In [2]:
left_dataset.head()

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories
0,1,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123.0,"Shipping Centers, Local Services, Notaries, Ma..."
1,2,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
2,3,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054.0,"Brewpubs, Breweries, Food"
3,4,Sonic Drive-In,615 S Main St,Ashland City,TN,37015.0,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
4,5,Famous Footwear,"8522 Eager Road, Dierbergs Brentwood Point",Brentwood,MO,63144.0,"Sporting Goods, Fashion, Shoe Stores, Shopping..."


In [3]:
right_dataset.head()

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,SOURINI PAINTING INC.,12800 44th St N,Clearwater,FL,33762-4726,11.0
1,2,WOLFF DOLLA BILL LLC,1905 E 19th Ave,Tampa,FL,33605-2700,8.0
2,3,"COMPREHENSIVE SURGERY CENTER, LLC","1988 GULF TO BAY BLVD, Ste 1",CLEARWATER,FL,33765-3550,8.0
3,4,FRANK & ADAM APPAREL LLC,13640 Wright Cir,Tampa,FL,33626-3030,12.0
4,5,MORENO PLUS TRANSPORT INC,8608 Huron Court unite 58,Tampa,FL,33614,8.0


## Results from different packages

### Rapidfuzz

In [4]:
# Import necessary functions from the module created by Leo Zhou
from src.rapidfuzz import clean_text, standardize_zip_code, clean_address, create_enhanced_block_keys, fuzzy_match_with_rapidfuzz

In [5]:
# Import dataset in different name to avoid duplication and conflicts
left_df = pd.read_csv(left_dataset_path)
right_df = pd.read_csv(right_dataset_path)

In [6]:
# Apply cleaning steps
left_df['name'] = left_df['name'].apply(clean_text)
left_df['state'] = left_df['state'].apply(clean_text)
left_df['city'] = left_df['city'].apply(clean_text)
right_df['name'] = right_df['name'].apply(clean_text)
right_df['state'] = right_df['state'].apply(clean_text)
right_df['city'] = right_df['city'].apply(clean_text)

# Apply zip code standardizing
right_df['zip_code'] = right_df['zip_code'].apply(standardize_zip_code)
left_df['postal_code'] = left_df['postal_code'].apply(standardize_zip_code)

# Apply the address cleaning function to the DataFrame columns
left_df['address'] = left_df['address'].apply(clean_address)
right_df['address'] = right_df['address'].apply(clean_address)

# Extra Step to drop unwanted columns and standardize column names
left_df.drop(columns=['categories'], inplace=True)
right_df.drop(columns=['size'], inplace=True)

left_df.rename(columns={'postal_code': 'zip_code'}, inplace=True)

In [7]:
# Apply enhanced blocking
left_df = create_enhanced_block_keys(left_df)
right_df = create_enhanced_block_keys(right_df)

In [8]:
# Creating column for rapidfuzz application
left_df['combined'] = left_df['name'].apply(clean_text) + " " + left_df['address'].apply(clean_address)
right_df['combined'] = right_df['name'].apply(clean_text) + " " + right_df['address'].apply(clean_address)

In [9]:
from rapidfuzz import process, fuzz

# Execute fuzzy matching
matched_results = fuzzy_match_with_rapidfuzz(left_df, right_df)

In [10]:
print(matched_results)

       left_dataset  right_dataset  confidence_score
0              8703          20197         85.500000
1             19631          20197         85.500000
2             73038          20197         85.500000
3             85862          20197         85.500000
4               903          63344         85.500000
...             ...            ...               ...
47616          1115          23915         95.000000
47617         17529           4609         85.500000
47618         59493          22503         95.238095
47619         92447          12152         96.721311
47620         61857          38198         85.500000

[47621 rows x 3 columns]


### recordlinkage

In [3]:
import pandas as pd
import recordlinkage
from recordlinkage.preprocessing import clean, phonetic
from src.recordlinkage import preprocess_data

In [4]:
left = pd.read_csv(left_dataset_path)
right = pd.read_csv(right_dataset_path)

In [8]:
left = preprocess_data(left)
right = preprocess_data(right)

In [9]:
left.rename(columns={'postal_code': 'zip_code'}, inplace=True)

In [10]:
string_columns = ['address', 'city', 'state', 'zip_code']
for column in string_columns:
    left[column] = left[column].astype(str)
    right[column] = right[column].astype(str)

In [11]:
left['address'] = left['address'].str.cat(left[['city', 'state', 'zip_code']], sep=' ')
right['address'] = right['address'].str.cat(right[['city', 'state', 'zip_code']], sep=' ')


In [12]:
columns_to_keep_left = ['entity_id', 'name', 'address']
columns_to_keep_right = ['business_id', 'name', 'address']
left_common = left[columns_to_keep_left]
right_common = right[columns_to_keep_right]

In [13]:
left_common['block_key'] = left_common['name'].str[:5]
right_common['block_key'] = right_common['name'].str[:5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  left_common['block_key'] = left_common['name'].str[:5]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  right_common['block_key'] = right_common['name'].str[:5]


In [14]:
indexer = recordlinkage.Index()
indexer.block('block_key')
candidate_links = indexer.index(left_common, right_common)

In [15]:
comparer = recordlinkage.Compare()
comparer.string('name', 'name', method='jarowinkler', label='name_similarity')
comparer.string('address', 'address', method='jarowinkler', label='address_similarity')
features = comparer.compute(candidate_links, left_common, right_common)

In [16]:
features['confidence_score'] = features.mean(axis=1)

In [17]:
threshold = 0.8
matches = features[features['confidence_score'] >= threshold]

In [19]:
matches['entity_id'] = matches.index.get_level_values(0)
matches['business_id'] = matches.index.get_level_values(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches['entity_id'] = matches.index.get_level_values(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches['business_id'] = matches.index.get_level_values(1)


In [20]:
columns=columns = ['entity_id', 'business_id', 'confidence_score']
matched = pd.DataFrame(matches[['entity_id', 'business_id', 'confidence_score']], columns=columns)
print(matched)

             entity_id  business_id  confidence_score
7     81911          7        81911          0.812385
      82926          7        82926          0.895760
      84020          7        84020          0.893032
3260  82926       3260        82926          0.809016
      83734       3260        83734          0.808502
...                ...          ...               ...
94033 79875      94033        79875          0.994118
94034 72786      94034        72786          0.967469
94036 53965      94036        53965          0.953301
94167 69598      94167        69598          0.960116
94538 79362      94538        79362          0.946438

[33298 rows x 3 columns]


### Jellyfish

In [1]:
import pandas as pd
import jellyfish
from src.jellyfish import clean_and_merge, calculate_similarity, find_high_confidence_matches

In [3]:
# Load the datasets
left_dataset = pd.read_csv(left_dataset_path)
right_dataset = pd.read_csv(right_dataset_path)

In [4]:
# Clean and merge dataset using the function
merged_df = clean_and_merge(left_dataset, right_dataset)

# Find high confidence matches with the set up of the threshold
high_confidence_matches = find_high_confidence_matches(merged_df, threshold=0.80)

In [5]:
print(high_confidence_matches)

          entity_id  business_id  similarity_score
424               7        36752          0.823996
2606           2887        40775          0.976190
2863           2887        49464          0.817110
3462           3241        41513          0.857944
3693           3241        49438          0.975000
...             ...          ...               ...
34706430      39137        48887          0.962963
34706661      45570        51316          0.975238
34706730      49626        48119          0.921447
34706746      49626        49649          0.988889
34707154      79058        49649          0.843804

[25659 rows x 3 columns]


### Difflib

In [4]:
import pandas as pd
import re
from difflib import SequenceMatcher
from src.difflib import clean_text, standardize_zip_code, clean_address, create_enhanced_block_keys, fuzzy_match_with_difflib

# Load the datasets
left_dataset_path = '/Users/martinng/Desktop/Columbia/2024_Spring/Python for Data Analysis/Project/Data/GroupProject/entity-resolution/left_dataset.csv'
right_dataset_path = '/Users/martinng/Desktop/Columbia/2024_Spring/Python for Data Analysis/Project/Data/GroupProject/entity-resolution/right_dataset.csv'

left_df = pd.read_csv(left_dataset_path)
right_df = pd.read_csv(right_dataset_path)

In [6]:
# Apply text cleaning
left_df['name'] = left_df['name'].apply(clean_text)
left_df['state'] = left_df['state'].apply(clean_text)
left_df['city'] = left_df['city'].apply(clean_text)
right_df['name'] = right_df['name'].apply(clean_text)
right_df['state'] = right_df['state'].apply(clean_text)
right_df['city'] = right_df['city'].apply(clean_text)

left_df['postal_code'] = left_df['postal_code'].apply(standardize_zip_code)
right_df['zip_code'] = right_df['zip_code'].apply(standardize_zip_code)

left_df['address'] = left_df['address'].apply(clean_address)
right_df['address'] = right_df['address'].apply(clean_address)

# Extra Step to drop unwanted columns and standardize column names
left_df.drop(columns=['categories'], inplace=True)
right_df.drop(columns=['size'], inplace=True)
left_df.rename(columns={'postal_code': 'zip_code'}, inplace=True)

left_df = create_enhanced_block_keys(left_df)
right_df = create_enhanced_block_keys(right_df)

left_df['combined'] = left_df['name'].apply(clean_text) + " " + left_df['address'].apply(clean_address)
right_df['combined'] = right_df['name'].apply(clean_text) + " " + right_df['address'].apply(clean_address)

In [7]:
matched_results = fuzzy_match_with_difflib(left_df, right_df,0.6)
print(matched_results)

       left_id  right_id  match_score
0         1941     77575     0.843750
1        44647     77575     0.620690
2        69578     65157     0.842105
3        85017     65157     0.629213
4         1551     51504     0.948718
...        ...       ...          ...
22951    13437      8755     0.633663
22952    38931     18080     0.666667
22953    51923      8755     0.622222
22954    93260      8755     0.972477
22955    94192     22264     0.754098

[22956 rows x 3 columns]
