# Dataset Import

In [None]:
import pandas as pd
# Paths to the two CSV files used throughout this notebook
left_dataset_path = './data/left_dataset.csv'
right_dataset_path = './data/right_dataset.csv'

# Read each file into its own DataFrame (left first, then right)
left_dataset, right_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (left_dataset_path, right_dataset_path)
)

In [None]:
# Preview the first five rows of the left dataset
left_dataset.head(n=5)

In [None]:
# Preview the first five rows of the right dataset
right_dataset.head(n=5)

## Results from different packages

### Rapidfuzz

In [None]:
# Import necessary functions from the module created by Leo Zhou
from src.rapidfuzz import clean_text, standardize_zip_code, clean_address, create_enhanced_block_keys, fuzzy_match_with_rapidfuzz

In [None]:
# Re-read both CSV files into separately named DataFrames so that the
# cleaning steps applied to left_df / right_df leave left_dataset and
# right_dataset untouched
left_df, right_df = map(pd.read_csv, (left_dataset_path, right_dataset_path))

In [None]:
# Run clean_text over the name/state/city columns (left first, then right)
for frame in (left_df, right_df):
    for text_column in ('name', 'state', 'city'):
        frame[text_column] = frame[text_column].apply(clean_text)

# Standardize zip codes; at this point the column is called 'zip_code' on
# the right and 'postal_code' on the left
for frame, zip_column in ((right_df, 'zip_code'), (left_df, 'postal_code')):
    frame[zip_column] = frame[zip_column].apply(standardize_zip_code)

# Run clean_address over the address column of each DataFrame
for frame in (left_df, right_df):
    frame['address'] = frame['address'].apply(clean_address)

# Drop the unwanted column from each side, then rename the left zip column
# so both DataFrames use the name 'zip_code'
for frame, unwanted in ((left_df, 'categories'), (right_df, 'size')):
    frame.drop(columns=[unwanted], inplace=True)

left_df.rename(columns={'postal_code': 'zip_code'}, inplace=True)

In [None]:
# Run the enhanced blocking step on each DataFrame and keep the results
# under the same names
left_df, right_df = (
    create_enhanced_block_keys(left_df),
    create_enhanced_block_keys(right_df),
)

In [None]:
# Build the 'combined' column (cleaned name + single space + cleaned address)
# on both DataFrames; presumably this is the text the rapidfuzz matcher
# compares - confirm against fuzzy_match_with_rapidfuzz.
# NOTE(review): 'name' and 'address' were already cleaned in an earlier cell,
# so the cleaners run a second time here - confirm that is intentional.
for frame in (left_df, right_df):
    name_part = frame['name'].apply(clean_text)
    address_part = frame['address'].apply(clean_address)
    frame['combined'] = name_part + " " + address_part

In [None]:
from rapidfuzz import process, fuzz

# Run the rapidfuzz-based matcher on the prepared left and right DataFrames
# and keep whatever it returns for inspection
matched_results = fuzzy_match_with_rapidfuzz(left_df, right_df)

In [None]:
# Show the result returned by the rapidfuzz matcher
print(matched_results)

### Difflib

In [None]:
# Import necessary functions from the module created by Martin Ng
from src.difflib import clean_text, standardize_zip_code, clean_address, create_enhanced_block_keys, fuzzy_match_with_difflib

In [None]:
# Reload both CSV files so the difflib run starts again from the raw data
left_df, right_df = (
    pd.read_csv(source_path)
    for source_path in (left_dataset_path, right_dataset_path)
)

In [None]:
# Run clean_text over the name/state/city columns (left first, then right)
for frame in (left_df, right_df):
    for text_column in ('name', 'state', 'city'):
        frame[text_column] = frame[text_column].apply(clean_text)

# Standardize zip codes; the left column is still named 'postal_code' here
for frame, zip_column in ((left_df, 'postal_code'), (right_df, 'zip_code')):
    frame[zip_column] = frame[zip_column].apply(standardize_zip_code)

# Run clean_address over the address column of each DataFrame
for frame in (left_df, right_df):
    frame['address'] = frame['address'].apply(clean_address)

# Drop one column per side, then align the zip column name across both
for frame, unwanted in ((left_df, 'categories'), (right_df, 'size')):
    frame.drop(columns=[unwanted], inplace=True)

left_df.rename(columns={'postal_code': 'zip_code'}, inplace=True)

# Run the enhanced blocking step and keep the returned DataFrames
left_df, right_df = (
    create_enhanced_block_keys(left_df),
    create_enhanced_block_keys(right_df),
)

# Build the 'combined' column (cleaned name + single space + cleaned address).
# NOTE(review): 'name' and 'address' were already cleaned above, so the
# cleaners run a second time here - confirm that is intentional.
for frame in (left_df, right_df):
    name_part = frame['name'].apply(clean_text)
    address_part = frame['address'].apply(clean_address)
    frame['combined'] = name_part + " " + address_part

In [None]:
from difflib import SequenceMatcher

# Run the difflib-based matcher on the prepared left and right DataFrames
# and keep whatever it returns for inspection
matched_results = fuzzy_match_with_difflib(left_df, right_df)

In [None]:
# Show the result returned by the difflib matcher
print(matched_results)