In [325]:
import pandas as pd
import numpy as np
import requests
from sklearn.cluster import KMeans
import folium
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
import altair as alt

**Function to get the correct name of the suburbs from the suburb list**

In [326]:
def get_suburbs(suburbs_list):
    keywords = ['Point', 'Bay', 'Ridge', 'Hill', 'Hills', 'Heights', 'Beach','South',
            'Hacking', 'Pilli','Waters','Valley','Park','Farms', 'Sydney',
           'Old', 'North']
    suburbs = []
    new_val=''
    for idx, val in enumerate(suburbs_list):    
        if val in keywords:              
            if val == 'Old' or val == 'North' or val == 'Sydney':
                new_val=val+' '
            else:
                new_val=''
                name = suburbs.pop() 
                if name == 'Siverwater':
                    new_suburb = 'Silverwater'
                    new_val=val+' '
                else:
                    new_suburb = name+' '+val            
        else:
            new_suburb = new_val+val
            new_val=''
        suburbs.append(new_suburb)
    return suburbs

**Function to add NSW to the suburb in order to get the proper address for coordinate search**

In [327]:
def get_suburb_address(suburbs):
    address = []
    for suburb in suburbs:    
        address.append(suburb+',NSW')
    return address

**Function to get the coordinates from the address**

In [328]:
def get_cordinates(address):
    geolocator = Nominatim(user_agent="sydney_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return [latitude, longitude]    

**Function to get the list of suburbs from the wiki url**

In [329]:
def get_suburb_list(url):
    dfs = pd.read_html(url)
    df = dfs[0]
    suburbs_string = df.iloc[0,1]
    suburbs_list = suburbs_string.split()
    return suburbs_list

**Function to get population density, median household income, status related to hospitability industry from ABS website**

In [330]:
def getLGAHospitalityStats(LGA,url):
    df = pd.read_csv(url)
    df.insert(0,'LGA', LGA)
    df = df.loc[df['Description'].str.contains('persons/km2|Accommodation and food services|Median equivalised total household income|Unemployment rate', case = False)]
    return df

**Use the ABS links for Parramatta and Sutherland LGAs to retrive the stats required to choose which suburb is best to start the Indian restuarant business**

In [334]:
parraLGA = 'https://itt.abs.gov.au/itt/query.jsp?method=GetGenericData&datasetid=ABS_REGIONAL_LGA2019&or=MEASURE&and=LGA_2019.16260,FREQUENCY.A&TIME_FORMAT=P1Y&periods=2014,2015,2016,2017,2018,2019&format=csv&order=chunked&filename=Parramatta%20(C)'
sutherlandLGA = 'https://itt.abs.gov.au/itt/query.jsp?method=GetGenericData&datasetid=ABS_REGIONAL_LGA2019&or=MEASURE&and=LGA_2019.17150,FREQUENCY.A&TIME_FORMAT=P1Y&periods=2014,2015,2016,2017,2018,2019&format=csv&order=chunked&filename=Sutherland%20Shire%20(A)'
parraLGAdf = getLGAHospitalityStats('Parramatta', parraLGA)
sutherlandLGAdf = getLGAHospitalityStats('Sutherland',sutherlandLGA)
LGAdf = pd.concat([parraLGAdf,sutherlandLGAdf], ignore_index = True)
LGAdf.head(20)
#LGAdf.groupby('Parent Description')['Description'].head()

Unnamed: 0,LGA,Parent Description,Description,2011,2014,2015,2016,2017,2018,2019
0,Parramatta,Population Density - As at 30 June,Persons (persons/km2),,2637.6,2715.6,2796.7,2905.4,2995.0,3068.1
1,Parramatta,Number of Businesses by Industry - As at 30 June,Accommodation and food services (no.),,,956.0,1002.0,1021.0,1068.0,1079.0
2,Parramatta,Industry of Employment - Proportion of Employe...,Accommodation and food services (%),5.7,,,5.9,,,
3,Parramatta,Equivalised Total Household Income - Census,Median equivalised total household income (wee...,871.0,,,1012.0,,,
4,Parramatta,Jobs In Australia - Year ended 30 June,Number of Employee Jobs - Accommodation and fo...,,10148.0,10871.0,10861.0,12354.0,,
5,Parramatta,Labour Force Status - Persons aged 15 years an...,Unemployment rate (%),5.9,,,7.0,,,
6,Parramatta,Labour Force Status of Persons Born Overseas -...,Unemployment Rate (%),7.1,,,8.1,,,
7,Sutherland,Population Density - As at 30 June,Persons (persons/km2),,671.2,675.1,678.9,682.6,686.4,691.3
8,Sutherland,Number of Businesses by Industry - As at 30 June,Accommodation and food services (no.),,,709.0,721.0,747.0,740.0,722.0
9,Sutherland,Industry of Employment - Proportion of Employe...,Accommodation and food services (%),5.2,,,5.4,,,


**Remove the unwanted data**

In [335]:
LGAdf = LGAdf.loc[LGAdf['Description'] != 'Unemployment Rate (%)']
LGAdf = LGAdf.loc[LGAdf['Description'] != 'Accommodation and food services (%)']
LGAdf

Unnamed: 0,LGA,Parent Description,Description,2011,2014,2015,2016,2017,2018,2019
0,Parramatta,Population Density - As at 30 June,Persons (persons/km2),,2637.6,2715.6,2796.7,2905.4,2995.0,3068.1
1,Parramatta,Number of Businesses by Industry - As at 30 June,Accommodation and food services (no.),,,956.0,1002.0,1021.0,1068.0,1079.0
3,Parramatta,Equivalised Total Household Income - Census,Median equivalised total household income (wee...,871.0,,,1012.0,,,
4,Parramatta,Jobs In Australia - Year ended 30 June,Number of Employee Jobs - Accommodation and fo...,,10148.0,10871.0,10861.0,12354.0,,
5,Parramatta,Labour Force Status - Persons aged 15 years an...,Unemployment rate (%),5.9,,,7.0,,,
7,Sutherland,Population Density - As at 30 June,Persons (persons/km2),,671.2,675.1,678.9,682.6,686.4,691.3
8,Sutherland,Number of Businesses by Industry - As at 30 June,Accommodation and food services (no.),,,709.0,721.0,747.0,740.0,722.0
10,Sutherland,Equivalised Total Household Income - Census,Median equivalised total household income (wee...,977.0,,,1136.0,,,
11,Sutherland,Jobs In Australia - Year ended 30 June,Number of Employee Jobs - Accommodation and fo...,,9920.0,10200.0,9640.0,9932.0,,
12,Sutherland,Labour Force Status - Persons aged 15 years an...,Unemployment rate (%),3.5,,,3.5,,,


**Stack the LGA dataframe to convert the Year into columns**

In [183]:


LGAdf = LGAdf.pivot_table(index='LGA', columns='Description', values=['2011','2014','2015','2016','2017','2018','2019'])

LGAdf = LGAdf.stack().stack()

LGAdf = LGAdf.to_frame().reset_index()

LGAdf.rename(columns={'level_2':'Year', 0:'Count'}, inplace=True)


**Retrive the population density details for performing an exploratory data analysis**

In [None]:
population = LGAdf.loc[LGAdf['Description'].str.contains('Person')]

**Retrieve the business stats for hospitality industry for performing EDA**

In [None]:
business = LGAdf.loc[LGAdf['Description'].str.contains('services \(no.\)')]

**Retrieve the median household income for performing EDA**

In [None]:
household_income = LGAdf.loc[LGAdf['Description'].str.contains('income')]

**Retrieve the jobs stats for performing EDA**

In [None]:
jobs =  LGAdf.loc[LGAdf['Description'].str.contains('Jobs')]

**Display the EDA grouped bar charts**

In [None]:
def displayChart(y_title, df):
    return alt.Chart(df).mark_bar(size=30).encode(
        x=alt.X('LGA',axis=alt.Axis( labelAngle=300)),
        y=alt.Y('Count:Q', axis=alt.Axis( title=y_title)),
        color='LGA',
        column='Year'
    ).properties(width=alt.Step(50))

**Display the grouped bar chart for hospitality business in Parramatta and Sutherland**

In [195]:
displayChart('Food and accommodation businesses',business)

**Display the grouped bar chart for job status in hospitality industry for Parramatta and Sutherland**

In [196]:
displayChart('Food and accommodation jobs',jobs)

**Display the grouped bar chart for median household incomes in Parramatta and Sutherland**

In [197]:
displayChart('Median weekly household income',household_income)

**Display the grouped bar chart for population density in Parramatta and Sutherland**

In [198]:
displayChart('Population density per sqkm',population)

**Retrieve the suburb names from Parramatta and Sutherland LGAs and append the 'NSW' to make it an address so that we can get the coordinates for the suburbs**

In [336]:
## Webpage url                                                                                                               
sutherland_url = 'https://en.wikipedia.org/wiki/Template:Sydney_Sutherland_suburbs'
parra_url = 'https://en.wikipedia.org/wiki/Template:Sydney_Parramatta_suburbs'
suburbs_list = get_suburb_list(sutherland_url)
sutherland_suburbs = get_suburb_address(get_suburbs(suburbs_list))
suburbs_list = get_suburb_list(parra_url)
parra_suburbs = get_suburb_address(get_suburbs(suburbs_list))

**Get the coordinates for Sutherland LGA suburbs**

In [337]:
sutherland_df = pd.DataFrame(columns={'Suburb','Latitude','Longitude'})
for item in sutherland_suburbs:
    cordinates = get_cordinates(item+',NSW')
    cordinates.insert(0, item)
    sutherland_df = sutherland_df.append({'Suburb':cordinates[0],'Latitude':cordinates[1], 'Longitude':cordinates[2]},ignore_index=True)

In [338]:
sutherland_df.head()

Unnamed: 0,Suburb,Longitude,Latitude
0,"Alfords Point,NSW",151.024161,-33.983909
1,"Bangor,NSW",151.033487,-34.015503
2,"Barden Ridge,NSW",151.010527,-34.031173
3,"Bundeena,NSW",151.151235,-34.084544
4,"Bonnet Bay,NSW",151.053572,-34.010109


**Get the cordinates for Parramatta LGA suburbs**

In [339]:
parra_df = pd.DataFrame(columns={'Suburb','Latitude','Longitude'})
for item in parra_suburbs:
    cordinates = get_cordinates(item+',NSW')
    cordinates.insert(0, item)
    parra_df = parra_df.append({'Suburb':cordinates[0],'Latitude':cordinates[1], 'Longitude':cordinates[2]},ignore_index=True)

In [340]:
parra_df.head()

Unnamed: 0,Suburb,Longitude,Latitude
0,"Camellia,NSW",151.034649,-33.81978
1,"Carlingford,NSW",151.047521,-33.774495
2,"Clyde,NSW",151.017066,-33.835975
3,"Constitution Hill,NSW",151.24632,-31.908768
4,"Dundas,NSW",151.044059,-33.802949


In [341]:
CLIENT_ID = 'LVHMSRL5JLW0SSV1KNE4C5PZKPDQIKVYIZ4QYWXBK5LTGBAB' # your Foursquare ID
CLIENT_SECRET = 'VOAYGFK4HGQXVFBFMIKJWTOHG2DXG3YLYFTMACVWQITHFVBR' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: LVHMSRL5JLW0SSV1KNE4C5PZKPDQIKVYIZ4QYWXBK5LTGBAB
CLIENT_SECRET:VOAYGFK4HGQXVFBFMIKJWTOHG2DXG3YLYFTMACVWQITHFVBR


**Retrieve the venues within 1km for the given suburb**

In [342]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Suburb', 
                  'Suburb Latitude', 
                  'Suburb Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


**Retrieve the venue details for suburbs in both Sutherland and Parramatta LGAs**

In [343]:
sutherland_venues = getNearbyVenues(names=sutherland_df['Suburb'],
                                   latitudes=sutherland_df['Latitude'],
                                   longitudes=sutherland_df['Longitude'],
                                    radius=1000
                                  )
parra_venues = getNearbyVenues(names=parra_df['Suburb'],
                                   latitudes=parra_df['Latitude'],
                                   longitudes=parra_df['Longitude'],
                                   radius=1000
                                  )

Alfords Point,NSW
Bangor,NSW
Barden Ridge,NSW
Bundeena,NSW
Bonnet Bay,NSW
Burraneer,NSW
Caringbah,NSW
Caringbah South,NSW
Como,NSW
Cronulla,NSW
Dolans Bay,NSW
Engadine,NSW
Grays Point,NSW
Greenhills Beach,NSW
Gymea,NSW
Gymea Bay,NSW
Heathcote,NSW
Illawong,NSW
Jannali,NSW
Kangaroo Point,NSW
Kareela,NSW
Kirrawee,NSW
Kurnell,NSW
Lilli Pilli,NSW
Loftus,NSW
Lucas Heights,NSW
Maianbar,NSW
Menai,NSW
Miranda,NSW
Oyster Bay,NSW
Port Hacking,NSW
Sandy Point,NSW
Sutherland,NSW
Sylvania,NSW
Sylvania Waters,NSW
Taren Point,NSW
Waterfall,NSW
Woolooware,NSW
Woronora,NSW
Woronora Heights,NSW
Yarrawarrah,NSW
Yowie Bay,NSW
Camellia,NSW
Carlingford,NSW
Clyde,NSW
Constitution Hill,NSW
Dundas,NSW
Dundas Valley,NSW
Eastwood,NSW
Epping,NSW
Ermington,NSW
Granville,NSW
Harris Park,NSW
Model Farms,NSW
Newington,NSW
Northmead,NSW
Oatlands,NSW
Oatlands,NSW
Old Toongabbie,NSW
Parramatta,NSW
Parramatta,NSW
North Parramatta,NSW
Pendle Hill,NSW
Rosehill,NSW
Rydalmere,NSW
Silverwater,NSW
South Granville,NSW
South Gran

**Use one hot encoding to get the count of the venues**

In [344]:
# one hot encoding
parra_onehot = pd.get_dummies(parra_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
parra_onehot['Suburb'] = parra_venues['Suburb'] 

# move neighborhood column to the first column
fixed_columns = [parra_onehot.columns[-1]] + list(parra_onehot.columns[:-1])
parra_onehot = parra_onehot[fixed_columns]

parra_onehot.head()

Unnamed: 0,Suburb,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,BBQ Joint,Badminton Court,Bakery,Bar,Baseball Field,...,Taiwanese Restaurant,Tennis Court,Thai Restaurant,Track,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Waterfront
0,"Camellia,NSW",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Camellia,NSW",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Camellia,NSW",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Camellia,NSW",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Camellia,NSW",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [345]:
parra_grouped = parra_onehot.groupby('Suburb').mean().reset_index()
parra_grouped

Unnamed: 0,Suburb,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,BBQ Joint,Badminton Court,Bakery,Bar,Baseball Field,...,Taiwanese Restaurant,Tennis Court,Thai Restaurant,Track,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Waterfront
0,"Camellia,NSW",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Carlingford,NSW",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.027778,0.0,0.0,0.0,0.027778,0.027778,0.0,0.027778,0.0
2,"Clyde,NSW",0.0,0.030303,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,...,0.0,0.0,0.030303,0.0,0.0,0.060606,0.0,0.0,0.0,0.0
3,"Dundas Valley,NSW",0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Dundas,NSW",0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.153846,0.0,0.0,0.0,0.0
5,"Eastwood,NSW",0.0,0.042553,0.0,0.0,0.021277,0.0,0.021277,0.0,0.0,...,0.021277,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.06383,0.0
6,"Epping,NSW",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.1,0.0,0.0,0.033333,0.0,0.0,0.066667,0.0
7,"Ermington,NSW",0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Granville,NSW",0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0
9,"Harris Park,NSW",0.0,0.045455,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,...,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.015152,0.030303,0.0


In [346]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

**Retrieve the 5 most common venues in the suburbs**

In [358]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Suburb']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Suburb'] = parra_grouped['Suburb']

for ind in np.arange(parra_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(parra_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Camellia,NSW",Café,Electronics Store,Dentist's Office,Pier,Shipping Store
1,"Carlingford,NSW",Chinese Restaurant,Fast Food Restaurant,Pizza Place,Café,Sandwich Place
2,"Clyde,NSW",Dessert Shop,Lebanese Restaurant,Furniture / Home Store,Fast Food Restaurant,Convenience Store
3,"Dundas Valley,NSW",Burger Joint,Athletics & Sports,Pub,Park,Café
4,"Dundas,NSW",Train Station,Sports Club,Home Service,Australian Restaurant,Café


**Use k means clustering to identify 5 clusters of suburbs in similarity** 

In [359]:
# set number of clusters
kclusters = 5

parra_grouped_clustering = parra_grouped.drop('Suburb', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(parra_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5] 

array([1, 1, 0, 3, 0])

In [360]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Camellia,NSW",Café,Electronics Store,Dentist's Office,Pier,Shipping Store
1,"Carlingford,NSW",Chinese Restaurant,Fast Food Restaurant,Pizza Place,Café,Sandwich Place
2,"Clyde,NSW",Dessert Shop,Lebanese Restaurant,Furniture / Home Store,Fast Food Restaurant,Convenience Store
3,"Dundas Valley,NSW",Burger Joint,Athletics & Sports,Pub,Park,Café
4,"Dundas,NSW",Train Station,Sports Club,Home Service,Australian Restaurant,Café


**Now merge the suburbs coordinates, cluster labels and the five common veneus together into one dataframe**

In [361]:

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

parra_merged = parra_df

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
parra_merged = parra_merged.join(neighborhoods_venues_sorted.set_index('Suburb'), on='Suburb')

#parra_merged['Cluster Labels'].astype('int32')
parra_merged.head()

Unnamed: 0,Suburb,Longitude,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Camellia,NSW",151.034649,-33.81978,1.0,Café,Electronics Store,Dentist's Office,Pier,Shipping Store
1,"Carlingford,NSW",151.047521,-33.774495,1.0,Chinese Restaurant,Fast Food Restaurant,Pizza Place,Café,Sandwich Place
2,"Clyde,NSW",151.017066,-33.835975,0.0,Dessert Shop,Lebanese Restaurant,Furniture / Home Store,Fast Food Restaurant,Convenience Store
3,"Constitution Hill,NSW",151.24632,-31.908768,,,,,,
4,"Dundas,NSW",151.044059,-33.802949,0.0,Train Station,Sports Club,Home Service,Australian Restaurant,Café


In [362]:
address = 'Paramatta,NSW'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Parramatta are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Parramatta are -33.8107936, 151.0007299.


In [363]:
parra_merged.dropna(inplace=True)

**Generate the folium map for Parramatta LGA with 5 clusters denoting similar suburbs**

In [364]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(parra_merged['Latitude'], parra_merged['Longitude'], parra_merged['Suburb'], parra_merged['Cluster Labels']):
    #print(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

**Find the cluster that has restaurants as the most common venue**

In [365]:
parra_merged.loc[parra_merged['Cluster Labels'] == 0.0].head(20)

Unnamed: 0,Suburb,Longitude,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,"Clyde,NSW",151.017066,-33.835975,0.0,Dessert Shop,Lebanese Restaurant,Furniture / Home Store,Fast Food Restaurant,Convenience Store
4,"Dundas,NSW",151.044059,-33.802949,0.0,Train Station,Sports Club,Home Service,Australian Restaurant,Café
21,"Rosehill,NSW",151.030556,-33.826389,0.0,Hotel,Platform,Train Station,Stadium,Racetrack
27,"Telopea,NSW",151.040944,-33.793922,0.0,Gas Station,Train Station,Convenience Store,Grocery Store,Soccer Field
30,"Wentworthville,NSW",150.967778,-33.806667,0.0,Thai Restaurant,Imported Food Shop,Grocery Store,Pizza Place,Platform


In [366]:
parra_merged.loc[parra_merged['Cluster Labels'] == 1.0].head(20)

Unnamed: 0,Suburb,Longitude,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Camellia,NSW",151.034649,-33.81978,1.0,Café,Electronics Store,Dentist's Office,Pier,Shipping Store
1,"Carlingford,NSW",151.047521,-33.774495,1.0,Chinese Restaurant,Fast Food Restaurant,Pizza Place,Café,Sandwich Place
6,"Eastwood,NSW",151.084444,-33.79,1.0,Chinese Restaurant,Korean Restaurant,Café,Vietnamese Restaurant,Korean BBQ Restaurant
7,"Epping,NSW",151.074537,-33.771855,1.0,Thai Restaurant,Platform,Indian Restaurant,Vietnamese Restaurant,Pizza Place
8,"Ermington,NSW",151.060056,-33.810116,1.0,Café,Park,Fast Food Restaurant,Liquor Store,Italian Restaurant
9,"Granville,NSW",151.006011,-33.83451,1.0,Lebanese Restaurant,Fast Food Restaurant,Dessert Shop,Convenience Store,Platform
10,"Harris Park,NSW",151.007654,-33.823338,1.0,Indian Restaurant,Chinese Restaurant,Café,Sandwich Place,Asian Restaurant
11,"Model Farms,NSW",150.995659,-33.775558,1.0,Pizza Place,Coffee Shop,Bus Stop,Bowling Green,Café
13,"Northmead,NSW",150.998329,-33.784442,1.0,Gym,Bowling Green,Shopping Mall,Farmers Market,Park
16,"Old Toongabbie,NSW",150.969953,-33.785855,1.0,Park,Fast Food Restaurant,Chinese Restaurant,Gym,Grocery Store


In [367]:
parra_merged.loc[parra_merged['Cluster Labels'] == 3.0].head(20)

Unnamed: 0,Suburb,Longitude,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
5,"Dundas Valley,NSW",151.055907,-33.793674,3.0,Burger Joint,Athletics & Sports,Pub,Park,Café
12,"Newington,NSW",151.055872,-33.83423,3.0,Café,Sandwich Place,Japanese Restaurant,Seafood Restaurant,Shopping Mall
14,"Oatlands,NSW",151.025653,-33.797339,3.0,Café,Grocery Store,Golf Course,Bus Stop,Waterfront
15,"Oatlands,NSW",151.025653,-33.797339,3.0,Café,Grocery Store,Golf Course,Bus Stop,Waterfront
17,"Parramatta,NSW",151.021363,-33.80697,3.0,Café,Pizza Place,Arts & Crafts Store,Bakery,Sandwich Place
18,"Parramatta,NSW",151.021363,-33.80697,3.0,Café,Pizza Place,Arts & Crafts Store,Bakery,Sandwich Place
19,"North Parramatta,NSW",151.011665,-33.795275,3.0,Café,Lake,Gym,Bus Stop,Pet Store
26,"Sydney Olympic Park,NSW",151.069092,-33.83874,3.0,Café,Italian Restaurant,Stadium,Scenic Lookout,Athletics & Sports
29,"Wentworth Point,NSW",151.077435,-33.826896,3.0,Café,Park,Waterfront,Japanese Restaurant,Shopping Mall
32,"Westmead,NSW",150.987727,-33.80765,3.0,Café,Bus Station,Steakhouse,Platform,Australian Restaurant


**Now we have to filter just the restuarants and the suburbs whose top two venues include resturants to find out which suburbs has most of the resturants.**


In [369]:
parra_count = parra_onehot.groupby('Suburb').sum().reset_index()

filtered_suburbs = parra_count.loc[parra_count['Suburb'].str.contains('Harris Park|Epping|Carlingford|Eastwood|Wentworthville|Granville|Old Toongabbie')]

Restaurants = [col for col in filtered_suburbs.columns if 'Restaurant' in col]

Restaurants.insert(0,'Suburb')

filtered_suburbs = filtered_suburbs.sort_values(Restaurants, ascending=True)[Restaurants]

filtered_suburbs.set_index('Suburb', inplace=True)

filtered_suburbs = pd.DataFrame(filtered_suburbs.stack())

filtered_suburbs.reset_index(inplace=True)

filtered_suburbs.rename(columns={'level_1':'Restaurants', 0:'Count'}, inplace=True)

filtered_suburbs = filtered_suburbs.loc[filtered_suburbs['Count'] !=0]

filtered_suburbs

Unnamed: 0,Suburb,Restaurants,Count
2,"Carlingford,NSW",Chinese Restaurant,6
4,"Carlingford,NSW",Fast Food Restaurant,3
6,"Carlingford,NSW",Italian Restaurant,1
7,"Carlingford,NSW",Japanese Restaurant,1
8,"Carlingford,NSW",Korean BBQ Restaurant,1
18,"Carlingford,NSW",Sushi Restaurant,1
22,"Carlingford,NSW",Vegetarian / Vegan Restaurant,1
23,"Carlingford,NSW",Vietnamese Restaurant,1
24,"Eastwood,NSW",Asian Restaurant,2
26,"Eastwood,NSW",Chinese Restaurant,8


**Function to display the stacked bar chart to identify the suburbs with most restaurants and cusine types**

In [323]:
def displayStackedBar(filtered_suburbs):
    range_ = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
            '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',        
            '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',
            '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5',]

    bars = alt.Chart(filtered_suburbs).mark_bar().encode(
        x=alt.X('Count', stack='zero'),
        y=alt.Y('Suburb'),
        color=alt.Color('Restaurants', scale=alt.Scale(range=range_))
    )

    text = alt.Chart(filtered_suburbs).mark_text(dx=-15, dy=3, color='white').encode(
        x=alt.X('Count', stack='zero'),
        y=alt.Y('Suburb'),
        detail='Restaurants',
        text=alt.Text('Count', format='1d')
    )

    return (bars + text)

In [370]:
displayStackedBar(filtered_suburbs)