In [1]:
import pandas as pd
from src.utils import (preprocess_text, format_zip_code,
                       standardize_address, generate_block_keys, perform_fuzzy_matching)

In [2]:
# Read the left-hand source file into a DataFrame and preview it.
# NOTE(review): the preview shows an 'Unnamed: 0' column (a saved index) and
# postal codes parsed as floats (e.g. 63123.0) — consider `index_col=0` and an
# explicit string dtype once the downstream helpers are confirmed not to rely
# on the current shape.
left_dataset_path = './data/left_dataset.csv'
left_df = pd.read_csv(filepath_or_buffer=left_dataset_path)
left_df.head(5)  # first five rows, rendered as the cell output

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories
0,1,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123.0,"Shipping Centers, Local Services, Notaries, Ma..."
1,2,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
2,3,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054.0,"Brewpubs, Breweries, Food"
3,4,Sonic Drive-In,615 S Main St,Ashland City,TN,37015.0,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
4,5,Famous Footwear,"8522 Eager Road, Dierbergs Brentwood Point",Brentwood,MO,63144.0,"Sporting Goods, Fashion, Shoe Stores, Shopping..."


In [3]:
# Read the right-hand source file into a DataFrame and preview it.
# NOTE(review): the preview shows an 'Unnamed: 0' column (a saved index) and a
# mix of ZIP+4 and 5-digit values in 'zip_code' — both are dealt with, if at
# all, by later cells; confirm before changing the read options.
right_dataset_path = './data/right_dataset.csv'
right_df = pd.read_csv(filepath_or_buffer=right_dataset_path)
right_df.head(5)  # first five rows, rendered as the cell output

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,SOURINI PAINTING INC.,12800 44th St N,Clearwater,FL,33762-4726,11.0
1,2,WOLFF DOLLA BILL LLC,1905 E 19th Ave,Tampa,FL,33605-2700,8.0
2,3,"COMPREHENSIVE SURGERY CENTER, LLC","1988 GULF TO BAY BLVD, Ste 1",CLEARWATER,FL,33765-3550,8.0
3,4,FRANK & ADAM APPAREL LLC,13640 Wright Cir,Tampa,FL,33626-3030,12.0
4,5,MORENO PLUS TRANSPORT INC,8608 Huron Court unite 58,Tampa,FL,33614,8.0


In [4]:
# Text columns shared by both datasets that get passed through preprocess_text
columns_to_preprocess = ["name", "state", "city"]

# For every listed column, clean the left frame first and then the right one,
# overwriting the column with the processed values.
for column_name in columns_to_preprocess:
    for frame in (left_df, right_df):
        frame[column_name] = frame[column_name].apply(preprocess_text)


In [5]:
# Run format_zip_code over the ZIP column of each dataframe and make both
# frames expose it under the same name, 'zip_code'.

# Right dataframe: the column is already called 'zip_code'.
right_df['zip_code'] = right_df['zip_code'].apply(format_zip_code)

# Left dataframe: rename 'postal_code' -> 'zip_code' FIRST, then format.
# DataFrame.rename ignores labels that are not present (errors='ignore' is the
# default), so re-running this cell no longer raises KeyError: 'postal_code'
# the way the previous format-then-rename-in-place order did.
left_df = left_df.rename(columns={'postal_code': 'zip_code'})
left_df['zip_code'] = left_df['zip_code'].apply(format_zip_code)


In [6]:
# Pass the 'address' column of each dataframe through standardize_address so
# both sides use the same address representation before matching.
# The left frame is processed first, then the right one.
for frame in (left_df, right_df):
    frame['address'] = frame['address'].apply(standardize_address)


In [7]:
# Drop the columns that are not used further in this notebook.
# The frames are rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run: a second execution (when the columns are already
# gone) is a no-op instead of raising KeyError.

# 'categories' only exists on the left dataframe
left_df = left_df.drop(columns=['categories'], errors='ignore')

# 'size' only exists on the right dataframe
right_df = right_df.drop(columns=['size'], errors='ignore')


In [8]:
# Blocking step: run generate_block_keys on each dataframe and keep the
# returned frames under the same names (left first, then right).
# NOTE(review): per the original notes this adds a 'block_key' column used to
# shrink the comparison space during matching — confirm against src.utils.
left_df, right_df = generate_block_keys(left_df), generate_block_keys(right_df)


In [9]:
# Build a 'combined' text column on each dataframe: the preprocessed name and
# the preprocessed address joined by a single space.
# NOTE(review): 'name' already went through preprocess_text in the earlier
# text-cleaning cell; the second pass is kept here to preserve behaviour.
for frame in (left_df, right_df):
    cleaned_name = frame['name'].apply(preprocess_text)
    cleaned_address = frame['address'].apply(preprocess_text)
    frame['combined'] = cleaned_name + " " + cleaned_address


In [10]:
%%time
# Execute fuzzy matching between the left and right dataframes using the perform_fuzzy_matching function
# This process will identify potential matches based on the 'combined' column created previously,
# using a set threshold to determine match quality.

matched_results = perform_fuzzy_matching(left_df, right_df)


CPU times: total: 4min 57s
Wall time: 4min 57s


In [11]:
# Give the match output more descriptive column labels before it is used in
# later analysis or reports.
match_column_labels = {
    'left_id': 'left_dataset',          # identifier coming from the left source
    'right_id': 'right_dataset',        # identifier coming from the right source
    'match_score': 'confidence_score',  # score read as a measure of match quality
}

# rename returns a new frame; labels absent from the frame are left alone
matched_results = matched_results.rename(columns=match_column_labels)
