# Dataset Import

In [1]:
import pandas as pd
# Locations of the two CSV inputs, relative to the notebook's working directory
left_dataset_path, right_dataset_path = (
    '../data/left_dataset.csv',
    '../data/right_dataset.csv',
)

# Read each file into its own DataFrame (left first, then right)
left_dataset, right_dataset = map(pd.read_csv, (left_dataset_path, right_dataset_path))

In [2]:
# Preview the first rows of the left dataset.
# NOTE(review): the 'Unnamed: 0' column in the output looks like a row index that
# was written into the CSV, and postal_code is displayed as a float (e.g. 63123.0)
# — confirm both against the raw file.
left_dataset.head()

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories
0,1,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123.0,"Shipping Centers, Local Services, Notaries, Ma..."
1,2,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
2,3,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054.0,"Brewpubs, Breweries, Food"
3,4,Sonic Drive-In,615 S Main St,Ashland City,TN,37015.0,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
4,5,Famous Footwear,"8522 Eager Road, Dierbergs Brentwood Point",Brentwood,MO,63144.0,"Sporting Goods, Fashion, Shoe Stores, Shopping..."


In [3]:
# Preview the first rows of the right dataset.
# NOTE(review): the 'Unnamed: 0' column in the output looks like a row index that
# was written into the CSV, and zip_code mixes 5-digit and ZIP+4 strings
# (e.g. 33614 vs 33762-4726) — confirm both against the raw file.
right_dataset.head()

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,SOURINI PAINTING INC.,12800 44th St N,Clearwater,FL,33762-4726,11.0
1,2,WOLFF DOLLA BILL LLC,1905 E 19th Ave,Tampa,FL,33605-2700,8.0
2,3,"COMPREHENSIVE SURGERY CENTER, LLC","1988 GULF TO BAY BLVD, Ste 1",CLEARWATER,FL,33765-3550,8.0
3,4,FRANK & ADAM APPAREL LLC,13640 Wright Cir,Tampa,FL,33626-3030,12.0
4,5,MORENO PLUS TRANSPORT INC,8608 Huron Court unite 58,Tampa,FL,33614,8.0


## Results from different packages

### Rapidfuzz

In [4]:
# Import necessary functions from the module created by Leo Zhou
from src.rapidfuzz import clean_text, standardize_zip_code, clean_address, create_enhanced_block_keys, fuzzy_match_with_rapidfuzz

In [5]:
# Working copies for the rapidfuzz pipeline, kept under different names so the
# originally loaded frames stay untouched. The frames loaded at the top of the
# notebook are copied instead of parsing the same CSV files a second time;
# DataFrame.copy() is deep by default, so later edits to left_df / right_df
# cannot leak back into left_dataset / right_dataset.
left_df = left_dataset.copy()
right_df = right_dataset.copy()

In [6]:
# Clean both working frames into a shared schema (name, address, city, state,
# zip_code). Nothing is mutated in place, and the rename/drop steps tolerate
# columns that are already renamed or removed, so running this cell a second
# time no longer raises KeyError on 'postal_code', 'categories' or 'size'.
# NOTE(review): a re-run still passes already-cleaned values through the
# cleaning helpers again; that is harmless only if they are idempotent — confirm.

# Cleaning function applied to each column the two frames have in common
column_cleaners = {
    'name': clean_text,
    'state': clean_text,
    'city': clean_text,
    'zip_code': standardize_zip_code,
    'address': clean_address,
}


def _apply_column_cleaners(df):
    """Return a new frame in which every column listed in column_cleaners is cleaned."""
    cleaned = {col: df[col].apply(cleaner) for col, cleaner in column_cleaners.items()}
    # assign() overwrites existing columns in place, so column order is unchanged
    return df.assign(**cleaned)


# Left frame: standardize the zip column name, drop the unused 'categories'
# column, then clean. rename() ignores a missing 'postal_code' by default.
left_df = _apply_column_cleaners(
    left_df
    .rename(columns={'postal_code': 'zip_code'})
    .drop(columns=['categories'], errors='ignore')
)

# Right frame: already uses 'zip_code'; drop the unused 'size' column, then clean
right_df = _apply_column_cleaners(
    right_df.drop(columns=['size'], errors='ignore')
)

In [7]:
# Run the enhanced blocking step on each frame (left first, then right) and
# rebind both names to the returned frames
left_df, right_df = (
    create_enhanced_block_keys(frame) for frame in (left_df, right_df)
)

In [8]:
# Add the 'combined' text column ("<name> <address>") used for the rapidfuzz step.
# NOTE(review): 'name' and 'address' are already passed through clean_text /
# clean_address in the cleaning cell above; applying them again here may be
# redundant — confirm the helpers are idempotent before removing either pass.
for frame in (left_df, right_df):
    name_part = frame['name'].apply(clean_text)
    address_part = frame['address'].apply(clean_address)
    frame['combined'] = name_part + " " + address_part

In [9]:
from rapidfuzz import process, fuzz

# Execute fuzzy matching between the prepared left and right frames.
# The result printed in the next cell is a table with three columns:
# left_dataset, right_dataset and confidence_score.
# NOTE(review): presumably the first two columns identify the matched row in
# each frame and the score is on a 0-100 scale — confirm in src.rapidfuzz.
matched_results = fuzzy_match_with_rapidfuzz(left_df, right_df)

In [10]:
# Leave the match table as the cell's last expression so Jupyter renders it
# with its rich DataFrame display (as the .head() previews earlier do) instead
# of the plain-text dump produced by print().
matched_results

       left_dataset  right_dataset  confidence_score
0              8703          20197         85.500000
1             19631          20197         85.500000
2             73038          20197         85.500000
3             85862          20197         85.500000
4               903          63344         85.500000
...             ...            ...               ...
47616          1115          23915         95.000000
47617         17529           4609         85.500000
47618         59493          22503         95.238095
47619         92447          12152         96.721311
47620         61857          38198         85.500000

[47621 rows x 3 columns]
