In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import gmaps
import json
from pprint import pprint

# Load the combined restaurant listing that the rest of the notebook cleans and ranks.
five_cities = pd.read_csv(
    filepath_or_buffer="output_data/restaurants_category_10_combined_cities.csv"
)

In [6]:
 #Selecting and reordering the columns needed later (creates a new frame) - drops the irrelevant columns (unnamed and zipcode)
combined = five_cities[['Restaurant_id', "Name", "Locality", "Address", "City", "Latitude", "Longitude", "Price Range",
                        "Average Cost for two", "User Rating", "Rating Text", "Votes", "all_reviews_count", "Cuisines"]]

# Build the cleaned frame in one chain:
#   1. drop repeated listings - a row is a duplicate when both Restaurant_id and
#      Address repeat; the first occurrence is kept
#   2. drop rows with a missing value in any critical field
# dropna() returns a NEW frame, so its result has to be assigned; calling it as a
# bare statement leaves the incomplete rows in place.
final_clean = (
    combined
    .drop_duplicates(subset=['Restaurant_id', 'Address'], keep="first")
    .dropna(subset=['Price Range', 'Average Cost for two', 'User Rating', 'Rating Text', 'Votes',
                    'all_reviews_count'],
            how='any')
)

# Persist the cleaned listing, then preview it as the cell output.
final_clean.to_csv('output_data/Top_500.csv')
final_clean.head()



Unnamed: 0,Restaurant_id,Name,Locality,Address,City,Latitude,Longitude,Price Range,Average Cost for two,User Rating,Rating Text,Votes,all_reviews_count,Cuisines
0,15547004,Restaurant Hubert,CBD,"15 Bligh Street, CBD, Sydney",Sydney,-33.865348,151.210624,4,150,4.9,Excellent,590,224,"French, European"
1,16558798,Quay,Circular Quay,"Upper Level, Overseas Passenger Terminal 5 Hic...",Sydney,-33.858029,151.20997,4,500,4.9,Excellent,1366,454,Modern Australian
2,16559171,Tetsuya's,CBD,"529 Kent Street, CBD, Sydney",Sydney,-33.875143,151.204932,4,440,4.9,Excellent,1234,329,Japanese
3,16569454,LuMi Bar & Dining,Pyrmont,"56 Pirrama Road, \tPyrmont, Pyrmont, Sydney",Sydney,-33.867137,151.197517,4,190,4.9,Excellent,452,196,"Italian, Japanese"
4,15545439,Manpuku,Chatswood,"226 Victoria Avenue, Chatswood, Sydney",Sydney,-33.794417,151.189542,2,40,4.9,Excellent,486,190,"Japanese, Ramen"


In [3]:
# Exclude the "Average Cost for two" values this notebook treats as invalid:
# a cost of 0 and the single value 25000017.
cost_for_two = final_clean["Average Cost for two"]
has_invalid_cost = (cost_for_two == 0) | (cost_for_two == 25000017)
cleaned_df = final_clean.loc[~has_invalid_cost]

# Save the filtered listing and preview it as the cell output.
cleaned_df.to_csv('output_data/Top_497.csv')
cleaned_df.head()

Unnamed: 0,Restaurant_id,Name,Locality,Address,City,Latitude,Longitude,Price Range,Average Cost for two,User Rating,Rating Text,Votes,all_reviews_count,Cuisines
0,15547004,Restaurant Hubert,CBD,"15 Bligh Street, CBD, Sydney",Sydney,-33.865348,151.210624,4,150,4.9,Excellent,590,224,"French, European"
1,16558798,Quay,Circular Quay,"Upper Level, Overseas Passenger Terminal 5 Hic...",Sydney,-33.858029,151.20997,4,500,4.9,Excellent,1366,454,Modern Australian
2,16559171,Tetsuya's,CBD,"529 Kent Street, CBD, Sydney",Sydney,-33.875143,151.204932,4,440,4.9,Excellent,1234,329,Japanese
3,16569454,LuMi Bar & Dining,Pyrmont,"56 Pirrama Road, \tPyrmont, Pyrmont, Sydney",Sydney,-33.867137,151.197517,4,190,4.9,Excellent,452,196,"Italian, Japanese"
4,15545439,Manpuku,Chatswood,"226 Victoria Avenue, Chatswood, Sydney",Sydney,-33.794417,151.189542,2,40,4.9,Excellent,486,190,"Japanese, Ramen"


In [4]:
 #Printing how many rows were lost during the process
    #Finding total amount of restaurants included 
total_restaurants = len(combined)

# Count duplicates with the SAME key the cleaning step uses (Restaurant_id + Address,
# first occurrence kept). A bare drop_duplicates() only catches rows identical in every
# column, which would make key-duplicates show up as "invalid values" instead.
dup_result = len(combined.drop_duplicates(subset=['Restaurant_id', 'Address'], keep="first"))
dup_amount = total_restaurants - dup_result
new_rest_count1 = dup_result

# Rows lost to missing critical fields: de-duplicated rows that are absent from
# final_clean. Measuring against final_clean itself reports what the cleaning step
# actually removed rather than what a fresh dropna() would hypothetically remove.
dropna_count = len(final_clean)
items_dropped = new_rest_count1 - dropna_count
new_rest_count2 = dropna_count

# Rows removed by the "Average Cost for two" filter that produced cleaned_df.
drop_invalid = new_rest_count2 - len(cleaned_df)

# Tally of all invalid rows (missing fields + invalid cost).
invalid_count = drop_invalid + items_dropped

# Total rows removed; by construction this equals len(combined) - len(cleaned_df).
rows_removed = dup_amount + invalid_count

# Showing how many rows needed to be removed during the cleaning process
print(f'Following the cleaning process, {rows_removed} rows were removed as there was {dup_amount} duplicates and {invalid_count} invalid values')


Following the cleaning process, 3 rows were removed as there was 0 duplicates and 3 invalid values


In [5]:
# Build the Top 100 table across all cities.
# Restaurants are ordered by "User Rating", with "Votes" breaking rating ties,
# both from highest to lowest.
ranked_df = cleaned_df.sort_values(by=["User Rating", "Votes"], ascending=False)

# Renumber rows 0..n-1 so the index reflects the rank order.
new_index_ranked = ranked_df.reset_index(drop=True)

# Keep the first 100 ranked rows, save them, and show them as the cell output.
top_100 = new_index_ranked.head(100)
top_100.to_csv('output_data/Top_100.csv')
top_100

Unnamed: 0,Restaurant_id,Name,Locality,Address,City,Latitude,Longitude,Price Range,Average Cost for two,User Rating,Rating Text,Votes,all_reviews_count,Cuisines
0,16572612,Vue de monde,CBD,"Level 55, Rialto, 525 Collins Street, CBD, Mel...",Melbourne,-37.818954,144.957934,4,600,4.9,Excellent,3225,986,"Australian, Contemporary"
1,16585905,Tipo 00,CBD,"361 Little Bourke Street, CBD, Melbourne",Melbourne,-37.813528,144.961973,4,150,4.9,Excellent,1927,717,Italian
2,17881527,Dexter,Preston,"456 High Street, Preston, Melbourne",Melbourne,-37.736196,145.004456,4,110,4.9,Excellent,1473,684,"American, BBQ"
3,16558798,Quay,Circular Quay,"Upper Level, Overseas Passenger Terminal 5 Hic...",Sydney,-33.858029,151.209970,4,500,4.9,Excellent,1366,454,Modern Australian
4,16559171,Tetsuya's,CBD,"529 Kent Street, CBD, Sydney",Sydney,-33.875143,151.204932,4,440,4.9,Excellent,1234,329,Japanese
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,16590137,Zum Kaiser,Woolloongabba,"416 Vulture Street, Woolloongabba, Brisbane",Brisbane,-27.484477,153.036863,3,60,4.7,Excellent,296,83,German
96,16561357,Ormeggio At The Spit,"D'Albora Marinas, Mosman","D'Albora Marinas The Spit, Spit Road, Mosman, ...",Sydney,-33.804225,151.245839,4,300,4.7,Excellent,286,131,Italian
97,16574463,Katik Take Away Food,Campbellfield,"349 Barry Road, Campbellfield, Melbourne, VIC",Melbourne,-37.666874,144.948233,2,40,4.7,Excellent,284,109,"Middle Eastern, Turkish"
98,16589254,127 Days,Croydon Park,"127 Days Road, Croydon Park, Adelaide",Adelaide,-34.875846,138.566375,2,50,4.7,Excellent,267,123,"American, Burger, Sandwich"
