In [1]:
import pandas as pd

# Load the datasets
left_dataset_path = './data/left_dataset.csv'
right_dataset_path = './data/right_dataset.csv'

left_df = pd.read_csv(left_dataset_path)
right_df = pd.read_csv(right_dataset_path)

In [2]:
left_df.head()

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories
0,1,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123.0,"Shipping Centers, Local Services, Notaries, Ma..."
1,2,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
2,3,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054.0,"Brewpubs, Breweries, Food"
3,4,Sonic Drive-In,615 S Main St,Ashland City,TN,37015.0,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
4,5,Famous Footwear,"8522 Eager Road, Dierbergs Brentwood Point",Brentwood,MO,63144.0,"Sporting Goods, Fashion, Shoe Stores, Shopping..."


In [3]:
right_df.head()

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,SOURINI PAINTING INC.,12800 44th St N,Clearwater,FL,33762-4726,11.0
1,2,WOLFF DOLLA BILL LLC,1905 E 19th Ave,Tampa,FL,33605-2700,8.0
2,3,"COMPREHENSIVE SURGERY CENTER, LLC","1988 GULF TO BAY BLVD, Ste 1",CLEARWATER,FL,33765-3550,8.0
3,4,FRANK & ADAM APPAREL LLC,13640 Wright Cir,Tampa,FL,33626-3030,12.0
4,5,MORENO PLUS TRANSPORT INC,8608 Huron Court unite 58,Tampa,FL,33614,8.0


In [4]:
import re

def clean_text(text):
    """Lowercase, remove special characters, and strip whitespace."""
    text = text.lower()
    text = re.sub(r'[\W_]+', ' ', text)  # Replace all non-word characters with space
    return text.strip()

# Apply text cleaning
left_df['name'] = left_df['name'].apply(clean_text)
left_df['state'] = left_df['state'].apply(clean_text)
left_df['city'] = left_df['city'].apply(clean_text)
right_df['name'] = right_df['name'].apply(clean_text)
right_df['state'] = right_df['state'].apply(clean_text)
right_df['city'] = right_df['city'].apply(clean_text)

In [6]:
def standardize_zip_code(zip_code):
    """Ensure zip code is a string, truncate or pad to 5 digits."""
    zip_code = str(zip_code)
    return zip_code[:5].zfill(5)  # Pad or truncate to ensure 5 characters

right_df['zip_code'] = right_df['zip_code'].apply(standardize_zip_code)
left_df['postal_code'] = left_df['postal_code'].apply(standardize_zip_code)

In [7]:
import pandas as pd
import re

def clean_address(address):
    """Standardize and clean the address, handling NaN values gracefully."""
    if pd.isna(address):
        return ""  # Return an empty string or some other placeholder for NaN addresses
    address = address.lower()  # Convert to lowercase
    address = re.sub(r'\bstreet\b', 'st', address)
    address = re.sub(r'\broad\b', 'rd', address)
    address = re.sub(r'\bavenue\b', 'ave', address)
    address = re.sub(r'\bdrive\b', 'dr', address)
    address = re.sub(r'[^a-zA-Z0-9\s]', '', address)  # Remove non-alphanumeric characters except space
    address = re.sub(r'\s+', ' ', address).strip()  # Replace multiple spaces with a single space
    return address

# Now apply the cleaning function to the DataFrame columns
left_df['address'] = left_df['address'].apply(clean_address)
right_df['address'] = right_df['address'].apply(clean_address)

In [8]:
# Extra Step to drop unwanted columns and standardize column names
left_df.drop(columns=['categories'], inplace=True)
right_df.drop(columns=['size'], inplace=True)

left_df.rename(columns={'postal_code': 'zip_code'}, inplace=True)

In [18]:
def create_enhanced_block_keys(df):
    """Generate block keys by combining several fields."""
    df['block_key'] = df.apply(lambda x: 
                                (x['name'][0].lower() if pd.notna(x['name']) and x['name'] != "" else '0') + 
                                "_" + (x['state'].upper() if pd.notna(x['state']) else 'Unknown') +
                                "_" + (x['city'][0].lower() if pd.notna(x['city']) and x['city'] != "" else '0') +
                                "_" + (str(x['zip_code'])[:5] if pd.notna(x['zip_code']) else '00000'), axis=1)
    return df

In [22]:
# Apply enhanced blocking
left_df = create_enhanced_block_keys(left_df)
right_df = create_enhanced_block_keys(right_df)

In [25]:
left_df['combined'] = left_df['name'].apply(clean_text) + " " + left_df['address'].apply(clean_address)
right_df['combined'] = right_df['name'].apply(clean_text) + " " + right_df['address'].apply(clean_address)

In [26]:
left_df.head()

Unnamed: 0,entity_id,name,address,city,state,zip_code,block_key,combined
0,1,the ups store,87 grasso plaza shopping center,affton,mo,63123,t_MO_a_63123,the ups store 87 grasso plaza shopping center
1,2,st honore pastries,935 race st,philadelphia,pa,19107,s_PA_p_19107,st honore pastries 935 race st
2,3,perkiomen valley brewery,101 walnut st,green lane,pa,18054,p_PA_g_18054,perkiomen valley brewery 101 walnut st
3,4,sonic drive in,615 s main st,ashland city,tn,37015,s_TN_a_37015,sonic drive in 615 s main st
4,5,famous footwear,8522 eager rd dierbergs brentwood point,brentwood,mo,63144,f_MO_b_63144,famous footwear 8522 eager rd dierbergs brentw...


In [27]:
right_df.head()

Unnamed: 0,business_id,name,address,city,state,zip_code,block_key,combined
0,1,sourini painting inc,12800 44th st n,clearwater,fl,33762,s_FL_c_33762,sourini painting inc 12800 44th st n
1,2,wolff dolla bill llc,1905 e 19th ave,tampa,fl,33605,w_FL_t_33605,wolff dolla bill llc 1905 e 19th ave
2,3,comprehensive surgery center llc,1988 gulf to bay blvd ste 1,clearwater,fl,33765,c_FL_c_33765,comprehensive surgery center llc 1988 gulf to ...
3,4,frank adam apparel llc,13640 wright cir,tampa,fl,33626,f_FL_t_33626,frank adam apparel llc 13640 wright cir
4,5,moreno plus transport inc,8608 huron court unite 58,tampa,fl,33614,m_FL_t_33614,moreno plus transport inc 8608 huron court uni...


In [29]:
from rapidfuzz import process, fuzz

In [39]:
def fuzzy_match_with_rapidfuzz(left_df, right_df, threshold=85):
    results = []
    # Identify common blocks to minimize comparisons
    common_blocks = set(left_df['block_key']).intersection(set(right_df['block_key']))
    
    # Perform matching within each common block
    for block in common_blocks:
        left_block = left_df[left_df['block_key'] == block]
        right_block = right_df[right_df['block_key'] == block]
        for _, left_row in left_block.iterrows():
            # Using RapidFuzz to find the best match in the right block
            best_match = process.extractOne(
                left_row['combined'], 
                {idx: row['combined'] for idx, row in right_block.iterrows()}, 
                scorer=fuzz.WRatio,
                score_cutoff=threshold
            )
            if best_match:
                # Accessing details of the best match
                match_data = {
                    'left_id': left_row['entity_id'],
                    'right_id': right_block.loc[best_match[2]]['business_id'],
                    'match_score': best_match[1]
                }
                results.append(match_data)
                
    return pd.DataFrame(results)


In [40]:
# Execute fuzzy matching
matched_results = fuzzy_match_with_rapidfuzz(left_df, right_df)


   left_id  right_id  match_score
0     5129     37591         85.5
1     6841     36726         85.5
2     7029     37591         85.5
3    18117     38860         85.5
4    26555     37591         85.5


In [46]:
import pandas as pd
matched_results.rename(columns={'left_id':'left_dataset','right_id':'right_dataset','match_score':'confidence_score'})

Unnamed: 0,left_dataset,right_dataset,confidence_score
0,5129,37591,0.855000
1,6841,36726,0.855000
2,7029,37591,0.855000
3,18117,38860,0.855000
4,26555,37591,0.855000
...,...,...,...
47616,67082,30302,0.855000
47617,67087,30236,0.855000
47618,76460,33489,0.855000
47619,46223,41185,0.902439


In [45]:
matched_results['match_score'] = matched_results['match_score'] / 100.0

In [47]:
matched_results.to_csv('rapidfuzz_result.csv',index=False)