In [None]:
!pip install usaddress
!pip install rapidfuzz

In [3]:
import pandas as pd
import usaddress
from rapidfuzz import process, fuzz

In [None]:
# Load the three pre-processed review sources into pandas DataFrames.
google_df = pd.read_csv('data/Google_Processed.csv')
tripadvisor_df = pd.read_csv('data/Trip_Advisor_Processed.csv')
yelp_df = pd.read_csv('data/Yelp_Processed.csv')

# Preview the first rows of each source. `display` renders every frame with its
# rich table representation; a bare tuple of frames would fall back to a single
# plain-text repr that is hard to read.
display(google_df.head(), tripadvisor_df.head(), yelp_df.head())

In [None]:
# Sanity-check the three sources before merging.
source_frames = (google_df, tripadvisor_df, yelp_df)

# Distinct neighborhoods per source. Missing values are dropped before sorting
# because a NaN (float) mixed with string labels makes `sorted` raise TypeError.
for frame in source_frames:
    print(sorted(frame['Neighborhood'].dropna().unique()))

# Shape of each source, followed by the total number of rows across all three.
for frame in source_frames:
    print(frame.shape)
print(sum(frame.shape[0] for frame in source_frames))

In [6]:
# Function to extract street names
def extract_street_name(address):
    """
    Extract the street-name portion of a US address string.

    The address is tokenised and labelled with ``usaddress.parse``; every token
    whose label contains ``'StreetName'`` is kept and the tokens are joined with
    single spaces, in their original order.

    Parameters:
    - address: The raw address text. Non-text values (e.g. the NaN/None that
      pandas uses for missing cells) are treated as "no address".

    Returns:
    The joined street-name tokens (possibly an empty string when no token
    carries a street-name label), or None when the address is missing or
    cannot be parsed.
    """
    # Missing cells reach us as NaN (a float) or None when this function is
    # used with Series.apply; they cannot be parsed, so report "no street
    # name" instead of letting the parser fail on a non-string.
    if not isinstance(address, (str, bytes)):
        return None

    try:
        parsed_address = usaddress.parse(address)
    except usaddress.RepeatedLabelError:
        print(f"Error parsing address: {address}")
        return None

    # Each parsed item is a (token, label) pair; keep the tokens whose label
    # mentions 'StreetName'.
    return ' '.join(token for token, label in parsed_address if 'StreetName' in label)

# Derive a 'Street Name' column from 'Address' in every source frame so that it
# can later serve as one of the merge keys.
for frame in (google_df, tripadvisor_df, yelp_df):
    frame['Street Name'] = frame['Address'].apply(extract_street_name)

In [7]:
def get_best_match(row, target_df, column_name, scorer=fuzz.WRatio, score_cutoff=70):
    """
    Replace a row's value with its closest fuzzy match from another DataFrame.

    Intended for use with ``DataFrame.apply(..., axis=1)``.

    Parameters:
    - row: A row of the source DataFrame (anything indexable by column_name).
    - target_df: DataFrame whose column supplies the candidate values.
    - column_name: Column read from both the row (the query) and target_df
      (the candidates).
    - scorer: The RapidFuzz scoring function used to compare values.
    - score_cutoff: Score threshold forwarded to ``process.extractOne``.

    Returns:
    The matched candidate value when ``process.extractOne`` yields a result;
    otherwise the row's own, unchanged value for column_name.
    """
    query = row[column_name]
    candidates = target_df[column_name]

    result = process.extractOne(
        query,
        candidates,
        scorer=scorer,
        score_cutoff=score_cutoff,
    )

    # No result: keep the original value so the caller can assign the return
    # straight back to the column.
    if not result:
        return query

    # The first element of the result is the matched candidate itself.
    return result[0]

In [8]:
# Align Google restaurant names with their closest TripAdvisor spelling so the
# exact-key merge on 'Name' can pair them. Names without a good enough match
# are left as they are.
google_df['Name'] = google_df.apply(
    lambda row: get_best_match(row, target_df=tripadvisor_df, column_name='Name'),
    axis=1,
)


In [None]:
# Merge strategy: join the sources on Name (fuzzy-aligned beforehand),
# Zip Code and Street Name.
merge_keys = ['Name', 'Zip Code', 'Street Name']

# Step 1: Google + TripAdvisor. Overlapping non-key columns are disambiguated
# with a per-source suffix; the outer join keeps restaurants found in only one.
merged_df = google_df.merge(
    tripadvisor_df,
    on=merge_keys,
    how='outer',
    suffixes=('_google', '_tripadvisor'),
)

# Step 2: align the combined names with Yelp's spelling before the final join.
merged_df['Name'] = merged_df.apply(
    lambda row: get_best_match(row, target_df=yelp_df, column_name='Name'),
    axis=1,
)

# Step 3: bring in Yelp, again keeping unmatched rows from either side.
merged_df = merged_df.merge(yelp_df, on=merge_keys, how='outer')
merged_df

In [None]:
# Remove fully identical rows, keeping the last occurrence of each duplicate.
is_repeated = merged_df.duplicated(keep='last')
merged_df = merged_df[~is_repeated]
merged_df

In [13]:
# Persist the merged dataset to the working directory. The DataFrame's row
# index is written as well (pandas' default), as the first, unnamed column.
output_path = "CompleteRestaurantData.csv"
merged_df.to_csv(output_path)