In [1]:
#Alex Netzley
#2/18/2024

import os
import numpy as np
import pandas as pd
import openpyxl
from openpyxl import load_workbook


Loading in the Datasets

In [42]:
# Load the raw scraped datasets for all three review platforms.
# The data live in the `data` directory under the parent of the current
# working directory. The trailing '' component keeps a trailing path
# separator on data_folder, built with the platform's own separator.
data_folder = os.path.join(os.path.dirname(os.getcwd()), 'data', '')
#Read in the raw Yelp, Trip Advisor and Google data
yelp_data_raw = pd.read_csv(os.path.join(data_folder, 'yelp_final.csv'))
trip_advisor_data_raw = pd.read_csv(os.path.join(data_folder, 'trip_advisor_full.csv'))
google_data_raw = pd.read_csv(os.path.join(data_folder, 'google_full.csv'))

Helper Functions

In [37]:
def get_neighborhood(df):
    """Label each row of ``df`` with a Seattle neighborhood based on its zip code.

    A 'Neighborhood' column is added to (or overwritten on) ``df`` in place,
    looked up from the 'Zip Code' column. Rows whose zip code is missing or
    is not in the lookup table are labelled 'Other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Zip Code' column. The lookup keys are strings, so
        only string zip codes will match.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the 'Neighborhood' column set.
    """
    zip_to_neighborhood = {
        '98101': 'Downtown',
        '98102': 'Capitol Hill',
        '98103': 'Fremont/Wallingford',
        '98104': 'Chinatown',
        '98105': 'University District',
        '98107': 'Ballard',
        '98109': 'Queen Anne/South Lake Union',
        '98112': 'Capitol Hill',
        '98121': 'Belltown',
        '98122': 'Capitol Hill',
        '98125': 'Northgate',
        '98133': 'Bitter Lake',
    }

    # Start every row at the fallback label, then overwrite it wherever
    # the zip code lookup finds a match.
    df['Neighborhood'] = 'Other'
    looked_up = df['Zip Code'].map(zip_to_neighborhood)
    df['Neighborhood'] = looked_up.fillna(df['Neighborhood'])

    return df

Cleaning the Yelp Dataset

In [None]:
#Remove all rows without a rating or without a review count.
#Both columns are checked in one dropna call so that neither filter
#overwrites the other.
yelp_data = yelp_data_raw.dropna(subset=['Yelp Rating', 'Yelp Num Reviews'])

#Drop irrelevant Columns
yelp_data = yelp_data.drop(columns=['Yelp URL', 'Yelp Name'])

#Rename columns to fit naming convention
yelp_data = yelp_data.rename(columns={'Restaurant':'Name', 
                                              'Yelp Rating':'Rating',
                                              'Yelp Num Reviews':'Num Reviews',
                                              'Yelp Cost':'Cost',
                                              'Yelp Tags':'Tags',
                                              'Yelp Distribution':'Distribution'  })

#Drop (near) Duplicate entries: rows whose stripped Name and Address share
#the same first 5 characters are treated as duplicates; the first one is kept.
#NOTE(review): this runs before the Seattle filter below, so a non-Seattle row
#can be the one that is kept from a near-duplicate pair - confirm this is intended.
yelp_data["add_beg"] = yelp_data['Address'].str.strip().str[:5]
yelp_data["name_beg"] = yelp_data['Name'].str.strip().str[:5]
yelp_data = yelp_data.drop_duplicates(subset = ['name_beg', 'add_beg'])
yelp_data = yelp_data.drop(columns=['add_beg', 'name_beg'])

#Drop entries that are not from Seattle (rows with a missing address are dropped too).
#.copy() makes an independent frame so the column assignments below do not
#write into a slice of the previous frame (avoids SettingWithCopyWarning).
yelp_data = yelp_data[yelp_data['Address'].str.contains("Seattle", na=False)].copy()
#Pull the 981xx zip code out of the address; expand=False returns a Series,
#and addresses without a match get NaN.
yelp_data["Zip Code"] = yelp_data['Address'].str.strip().str.extract(r'(\b981\d{2}\b)', expand=False)

yelp_data = get_neighborhood(yelp_data)

#Export to csv
yelp_data.to_csv(os.path.join(data_folder, 'Yelp_Processed.csv'))

Cleaning the Trip Advisor Dataset

In [39]:
#Remove all rows without a rating or without a review count.
#Both columns are checked in one dropna call so that neither filter
#overwrites the other.
trip_advisor_data = trip_advisor_data_raw.dropna(subset=['Rating', 'Number_of_reviews'])

#Drop irrelevant Columns
trip_advisor_data = trip_advisor_data.drop(columns=['URL'])

#Rename columns to fit naming convention
trip_advisor_data = trip_advisor_data.rename(columns={'Restaurant':'Name', 
                                              'Number_of_reviews':'Num Reviews',
                                              'DollarSigns':'Cost',
                                              'RestaurantType':'Tags' })

#Drop (near) Duplicate entries: rows whose stripped Name and Address share
#the same first 4 characters are treated as duplicates; the first one is kept.
#NOTE(review): this runs before the Seattle filter below, so a non-Seattle row
#can be the one that is kept from a near-duplicate pair - confirm this is intended.
trip_advisor_data["add_beg"] = trip_advisor_data['Address'].str.strip().str[:4]
trip_advisor_data["name_beg"] = trip_advisor_data['Name'].str.strip().str[:4]
trip_advisor_data = trip_advisor_data.drop_duplicates(subset = ['name_beg', 'add_beg'])
trip_advisor_data = trip_advisor_data.drop(columns=['add_beg', 'name_beg'])

#Drop entries that are not from Seattle (rows with a missing address are dropped too).
#.copy() makes an independent frame so the column assignments below do not
#write into a slice of the previous frame (avoids SettingWithCopyWarning).
trip_advisor_data = trip_advisor_data[trip_advisor_data['Address'].str.contains("Seattle", na=False)].copy()
#Pull the 981xx zip code out of the address; expand=False returns a Series,
#and addresses without a match get NaN.
trip_advisor_data["Zip Code"] = trip_advisor_data['Address'].str.strip().str.extract(r'(\b981\d{2}\b)', expand=False)

trip_advisor_data = get_neighborhood(trip_advisor_data)

#Export to csv
trip_advisor_data.to_csv(os.path.join(data_folder, 'Trip_Advisor_Processed.csv'))

Cleaning the Google Dataset

In [50]:
#Remove all rows without a rating or without a review count.
#Both columns are checked in one dropna call so that neither filter
#overwrites the other.
google_data = google_data_raw.dropna(subset=['rating', 'reviews'])

#Drop irrelevant Columns
google_data = google_data.drop(columns=['link', 'place_id'])

#Rename columns to fit naming convention
google_data = google_data.rename(columns={'name':'Name', 
                                              'reviews':'Num Reviews',
                                              'categories':'Tags',
                                               'main_category':'Main Tag',
                                                'rating':'Rating',
                                                'address':'Address'})

#Drop (near) Duplicate entries: rows whose stripped Name and Address share
#the same first 15 characters are treated as duplicates; the first one is kept.
#NOTE(review): this runs before the Seattle filter below, so a non-Seattle row
#can be the one that is kept from a near-duplicate pair - confirm this is intended.
google_data["add_beg"] = google_data['Address'].str.strip().str[:15]
google_data["name_beg"] = google_data['Name'].str.strip().str[:15]
google_data = google_data.drop_duplicates(subset = ['name_beg', 'add_beg'])
google_data = google_data.drop(columns=['add_beg', 'name_beg'])

#Drop entries that are not from Seattle (rows with a missing address are dropped too).
#.copy() makes an independent frame so the column assignments below do not
#write into a slice of the previous frame (avoids SettingWithCopyWarning).
google_data = google_data[google_data['Address'].str.contains("Seattle", na=False)].copy()
#Pull the 981xx zip code out of the address; expand=False returns a Series,
#and addresses without a match get NaN.
google_data["Zip Code"] = google_data['Address'].str.strip().str.extract(r'(\b981\d{2}\b)', expand=False)

google_data = get_neighborhood(google_data)

#Export to csv
google_data.to_csv(os.path.join(data_folder, 'Google_Processed.csv'))