# Capstone Project - The Battle of Neighborhoods (Week 2)

### Opening a new Fruit & Vegetable store in Auckland, New Zealand

#### STEPS INVOLVED ARE:
 - Build a dataframe of neighborhoods in Auckland, New Zealand by scraping the data from a Wikipedia page
 - Get the geographical coordinates of the neighborhoods
 - Obtain the venue data for the neighborhoods from Foursquare API
 - Explore and cluster the neighborhoods
 - Select the best cluster to open a new Fruit & Vegetable store

## 1. Import Libraries

In [68]:
import requests # to handle requests
import pandas as pd # for data analsysis
import numpy as np # to handle data in a vectorized manner


from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
#tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

from bs4 import BeautifulSoup # library to parse HTML and XML documents
import folium # plotting library

# import k-means from clustering stage
from sklearn.cluster import KMeans
import geocoder # to get coordinates

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [6]:
# Geocode the city itself; latitude/longitude are used later to centre the maps.
address = 'Auckland'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
# geocode() yields None when the query has no match; fail with a clear message
# here instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Auckland city are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Auckland city are -36.852095, 174.7631803.


## 2. Scraping data from the Wikipedia page into a DataFrame

In [7]:
# send the GET request
# A timeout stops the notebook hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page being parsed as the suburb list.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_suburbs_of_Auckland",
    timeout=30,
)
response.raise_for_status()
data = response.text

In [10]:
# build the BeautifulSoup tree from the downloaded HTML with the 'html.parser' backend
soup = BeautifulSoup(markup=data, features='html.parser')

In [11]:
# empty accumulator that the scraping step fills with neighborhood names
neighborhoodList = list()

In [15]:
# append the data into the list
# The names are taken from the <li> items of the first matching multi-column <div>.
suburb_columns = soup.find_all("div", class_="div-col columns column-width")
if not suburb_columns:
    # Report a missing container clearly instead of failing with a bare
    # IndexError on [0] (e.g. if the page markup differs from what is expected).
    raise ValueError("Suburb list container not found in the downloaded page")
neighborhoodList.extend(row.text for row in suburb_columns[0].find_all("li"))

In [16]:
# wrap the scraped names in a one-column DataFrame and preview the first rows
ac_df = pd.DataFrame(data=dict(Neighborhood=neighborhoodList))

ac_df.head()

Unnamed: 0,Neighborhood
0,Arch Hill
1,Auckland CBD
2,Avondale
3,Balmoral
4,Blackpool


In [17]:
# print the number of rows of the dataframe
# (.shape is a (rows, columns) tuple, so the column count is shown as well)
ac_df.shape

(64, 1)

## 3. Obtain the geographical coordinates

In [18]:
# define a function to get coordinates
def get_latlng(neighborhood, max_attempts=5, geocode=None):
    """Return the coordinates of a neighborhood in Auckland, New Zealand.

    The query ``"<neighborhood>, Auckland, New Zealand"`` is sent to the
    geocoder, and the lookup is repeated while the result's ``latlng``
    attribute is ``None``.

    Args:
        neighborhood: Name of the neighborhood to look up.
        max_attempts: Maximum number of lookups before giving up. Bounding
            the retries prevents an endless loop when the service never
            returns coordinates for a name.
        geocode: Callable taking the query string and returning an object
            with a ``latlng`` attribute. Defaults to ``geocoder.arcgis``.

    Returns:
        The ``latlng`` value reported by the geocoder for the query.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        RuntimeError: If no coordinates were obtained in ``max_attempts``
            lookups.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")
    if geocode is None:
        geocode = geocoder.arcgis

    query = '{}, Auckland, New Zealand'.format(neighborhood)
    for _ in range(max_attempts):
        lat_lng_coords = geocode(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    raise RuntimeError(
        "No coordinates found for {!r} after {} attempts".format(query, max_attempts)
    )

In [21]:
# geocode every neighborhood in ac_df order, giving one coordinate pair per row
coordinates_ac = list(map(get_latlng, ac_df["Neighborhood"]))
coordinates_ac

[[-36.86301999999995, 174.74858000000006],
 [-36.84839904099994, 174.76438785300002],
 [-36.89044821899995, 174.6870174820001],
 [-36.88819999999998, 174.7401900000001],
 [-37.05156439523053, 174.88439705033372],
 [-36.92819999999995, 174.70019000000002],
 [-36.86413508299995, 174.75743271800002],
 [-36.86956533042306, 174.77598845331576],
 [-36.896377420999954, 174.81373710200012],
 [-36.88845407399998, 174.77293816600002],
 [-36.85287929799995, 174.75035326400007],
 [-36.86036095299994, 174.87265930000012],
 [-36.87394120099998, 174.86095625600012],
 [-36.85329999999993, 174.77975000000004],
 [-36.89491322499998, 174.79262520600003],
 [-36.92865648363817, 174.64980099283827],
 [-36.861568621999936, 174.73955514300008],
 [-36.83819999999997, 174.72019000000012],
 [-36.92389779599995, 174.75536291900005],
 [-36.87019698199998, 174.7452094570001],
 [-36.85725985999994, 174.84097833500005],
 [-36.93035918099997, 174.71985815800008],
 [-36.86950058399998, 174.82957281000006],
 [-36.838199

In [22]:
# temporary two-column frame: one row per geocoded pair, split into Latitude / Longitude
coordinate_columns = ['Latitude', 'Longitude']
df_coordinates = pd.DataFrame(data=coordinates_ac, columns=coordinate_columns)

In [23]:
# copy each coordinate column onto the original dataframe (rows are matched on the index)
for coord_column in ('Latitude', 'Longitude'):
    ac_df[coord_column] = df_coordinates[coord_column]

In [24]:
# check the neighborhoods and the coordinates
# (prints the (rows, columns) shape, then displays the table itself)
print(ac_df.shape)
ac_df

(64, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Arch Hill,-36.863020,174.748580
1,Auckland CBD,-36.848399,174.764388
2,Avondale,-36.890448,174.687017
3,Balmoral,-36.888200,174.740190
4,Blackpool,-37.051564,174.884397
...,...,...,...
59,Wai o Taiki Bay,-36.868200,174.870190
60,Waterview,-36.879466,174.699364
61,Western Springs,-36.863106,174.720365
62,Westfield,-36.950000,174.850000


In [25]:
# write the neighborhood/coordinate table to ac_df.csv, leaving out the index column
ac_df.to_csv(path_or_buf="ac_df.csv", index=False)

## 4. Create map of neighbourhoods of Auckland

In [28]:
# base map centred on the city coordinates obtained earlier
map_ac = folium.Map(location=[latitude, longitude], zoom_start=11)

# one blue circle per neighborhood; its popup shows the neighborhood name
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
)
for neighborhood, lat, lng in zip(ac_df['Neighborhood'], ac_df['Latitude'], ac_df['Longitude']):
    popup = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_ac)

map_ac

In [29]:
# save the map as HTML file
# (relative path, so the file is written to the current working directory)
map_ac.save('map_ac.html')

## 5.  Foursquare API to explore the neighborhoods

In [30]:
# Search radius around each neighborhood's coordinates and the maximum number
# of venues requested per call.
radius = 2000
LIMIT = 100

# NOTE(review): CLIENT_ID, CLIENT_SECRET and VERSION are not defined in any
# cell visible here; presumably a credentials cell was removed before
# publishing. They must be defined before this cell is run.

venues = []

for lat, long, neighborhood in zip(ac_df['Latitude'], ac_df['Longitude'], ac_df['Neighborhood']):

    # Pass the query as params so requests handles the URL encoding.
    params = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "v": VERSION,
        "ll": "{},{}".format(lat, long),
        "radius": radius,
        "limit": LIMIT,
    }

    # make the GET request; time out rather than hang, and surface HTTP errors
    # (bad credentials, quota exceeded) instead of a KeyError further down
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params=params,
        timeout=30,
    )
    response.raise_for_status()
    groups = response.json()["response"].get("groups") or []
    results = groups[0]["items"] if groups else []

    # return only relevant information for each nearby venue
    for item in results:
        venue = item['venue']
        # A venue without any category gets None instead of raising IndexError.
        categories = venue.get('categories') or []
        category = categories[0]['name'] if categories else None
        venues.append((
            neighborhood,
            lat,
            long,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            category))

In [31]:

# convert the venues list into a new DataFrame
# The column names go to the constructor: assigning them afterwards fails with
# a length-mismatch error when the venues list is empty, because the frame
# then has no columns to rename.
venue_columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)

print(venues_df.shape)
venues_df.head()

(3365, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Arch Hill,-36.86302,174.74858,Grey Lynn Park,-36.861524,174.743148,Park
1,Arch Hill,-36.86302,174.74858,Countdown,-36.858375,174.748862,Market
2,Arch Hill,-36.86302,174.74858,Ponsonby Central,-36.856276,174.746169,Shopping Mall
3,Arch Hill,-36.86302,174.74858,El Sizzling Chorizo,-36.85629,174.746131,Argentinian Restaurant
4,Arch Hill,-36.86302,174.74858,Viva Latino! Dance Studios,-36.860666,174.753579,Dance Studio


In [32]:
# number of distinct venue categories (dropna=False counts a missing value as its own entry)
category_count = venues_df['VenueCategory'].nunique(dropna=False)
print('There are {} uniques categories.'.format(category_count))

There are 200 uniques categories.


In [33]:
# print out the list of categories
# (only the first 50 distinct values are shown, not the full list)
venues_df['VenueCategory'].unique()[:50]

array(['Park', 'Market', 'Shopping Mall', 'Argentinian Restaurant',
       'Dance Studio', 'Gym / Fitness Center', 'Café', 'Pizza Place',
       'Italian Restaurant', 'Bistro', 'Burger Joint',
       'Fruit & Vegetable Store', 'Food Court', 'South Indian Restaurant',
       'Japanese Restaurant', 'French Restaurant', 'Toy / Game Store',
       'Sushi Restaurant', 'Vietnamese Restaurant',
       'Vegetarian / Vegan Restaurant', 'Bakery', 'American Restaurant',
       'General Entertainment', 'Hostel', 'Restaurant',
       'Mexican Restaurant', 'Noodle House', 'Malay Restaurant',
       'Food Truck', 'Tattoo Parlor', 'Middle Eastern Restaurant',
       'Stadium', 'Gourmet Shop', 'Food & Drink Shop', 'Record Shop',
       'Chinese Restaurant', 'Coffee Shop', 'Bar', 'Bagel Shop',
       'Indian Restaurant', 'Yakitori Restaurant', 'Hotel', 'Gym', 'Pub',
       'Cupcake Shop', 'Gay Bar', 'Thai Restaurant', 'City Hall',
       'Dessert Shop', 'Theater'], dtype=object)

## 6. Analyze Each Neighborhood

In [34]:
# one indicator column per venue category, one row per venue
ac_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# attach each venue's neighborhood as an extra column
ac_onehot['Neighborhoods'] = venues_df['Neighborhood']

# rotate the column order so the last column comes first
ordered_columns = list(ac_onehot.columns)
ordered_columns.insert(0, ordered_columns.pop())
ac_onehot = ac_onehot[ordered_columns]

print(ac_onehot.shape)
ac_onehot.head()

(3365, 201)


Unnamed: 0,Neighborhoods,American Restaurant,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Automotive Shop,...,Vietnamese Restaurant,Vineyard,Waterfront,Wine Bar,Wine Shop,Winery,Women's Store,Yakitori Restaurant,Yoga Studio,Zoo
0,Arch Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Arch Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Arch Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Arch Hill,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Arch Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# average the indicator columns per neighborhood, i.e. the share of that
# neighborhood's venues that falls in each category
ac_grouped = ac_onehot.groupby("Neighborhoods", as_index=False).mean()

print(ac_grouped.shape)
ac_grouped

(64, 201)


Unnamed: 0,Neighborhoods,American Restaurant,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Automotive Shop,...,Vietnamese Restaurant,Vineyard,Waterfront,Wine Bar,Wine Shop,Winery,Women's Store,Yakitori Restaurant,Yoga Studio,Zoo
0,Arch Hill,0.02,0.0,0.0,0.01,0.00,0.0,0.00,0.0,0.0,...,0.010000,0.0,0.0,0.0,0.000000,0.0,0.0,0.01,0.0,0.000000
1,Auckland CBD,0.00,0.0,0.0,0.01,0.01,0.0,0.00,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.000000
2,Avondale,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,...,0.027778,0.0,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.000000
3,Balmoral,0.00,0.0,0.0,0.00,0.00,0.0,0.04,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.000000
4,Blackpool,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Waikowhai,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.000000
60,Waterview,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.000000
61,Western Springs,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,...,0.014493,0.0,0.0,0.0,0.014493,0.0,0.0,0.00,0.0,0.014493
62,Westfield,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,...,0.200000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.000000


#### New DataFrame for Fruit & Vegetable Store data only

In [37]:
# number of neighborhoods whose Fruit & Vegetable Store share is above zero
has_store = ac_grouped["Fruit & Vegetable Store"] > 0
int(has_store.sum())

18

In [39]:
# keep only the neighborhood name and its Fruit & Vegetable Store share
ac_vege = ac_grouped.loc[:, ["Neighborhoods", "Fruit & Vegetable Store"]]

In [40]:
# preview the first rows of the Fruit & Vegetable Store table
ac_vege.head()

Unnamed: 0,Neighborhoods,Fruit & Vegetable Store
0,Arch Hill,0.01
1,Auckland CBD,0.01
2,Avondale,0.0
3,Balmoral,0.0
4,Blackpool,0.0


## 7. Cluster Neighborhoods

In [57]:
# set number of clusters
kclusters = 3

# Cluster on the store share alone. The axis is named explicitly because the
# positional form drop([...], 1) was deprecated in pandas 1.3 and is rejected
# from pandas 2.0 onwards.
ac_clustering = ac_vege.drop(columns=["Neighborhoods"])

# run k-means clustering
# n_init is pinned to 10 (the long-standing default) so the labels do not
# change on scikit-learn versions where the default became "auto".
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(ac_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 2, 0, 0, 0, 1, 2, 0, 0, 2])

In [58]:
# copy of the Fruit & Vegetable Store table with each row's k-means label
# attached in a "Cluster Labels" column
ac_merged = ac_vege.assign(**{"Cluster Labels": kmeans.labels_})

In [59]:

# switch to the singular column name, which is what ac_df uses
ac_merged = ac_merged.rename(columns={"Neighborhoods": "Neighborhood"})
ac_merged.head()

Unnamed: 0,Neighborhood,Fruit & Vegetable Store,Cluster Labels
0,Arch Hill,0.01,2
1,Auckland CBD,0.01,2
2,Avondale,0.0,0
3,Balmoral,0.0,0
4,Blackpool,0.0,0


In [60]:
# look up each neighborhood's latitude/longitude in ac_df by name and append them
coordinates_by_name = ac_df.set_index("Neighborhood")
ac_merged = ac_merged.join(coordinates_by_name, on="Neighborhood")

print(ac_merged.shape)
ac_merged.head() # check the last columns!

(64, 5)


Unnamed: 0,Neighborhood,Fruit & Vegetable Store,Cluster Labels,Latitude,Longitude
0,Arch Hill,0.01,2,-36.86302,174.74858
1,Auckland CBD,0.01,2,-36.848399,174.764388
2,Avondale,0.0,0,-36.890448,174.687017
3,Balmoral,0.0,0,-36.8882,174.74019
4,Blackpool,0.0,0,-37.051564,174.884397


In [61]:

# show the shape, then order the rows by cluster label and display them
print(ac_merged.shape)
ac_merged = ac_merged.sort_values(by="Cluster Labels")
ac_merged

(64, 5)


Unnamed: 0,Neighborhood,Fruit & Vegetable Store,Cluster Labels,Latitude,Longitude
63,Westmere,0.000000,0,-37.568210,175.140200
26,Mount Eden,0.000000,0,-36.883602,174.754237
27,Mount Roskill,0.000000,0,-36.916066,174.736536
28,Mount Wellington,0.000000,0,-36.912733,174.839904
29,New Windsor,0.000000,0,-36.905310,174.712787
...,...,...,...,...,...
9,Epsom,0.013889,2,-36.888454,174.772938
6,Eden Terrace,0.010000,2,-36.864135,174.757433
1,Auckland CBD,0.010000,2,-36.848399,174.764388
49,Royal Oak,0.016393,2,-36.910653,174.772330


### Visualizing the resulting clusters

In [62]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: kclusters hex colours sampled evenly
# across matplotlib's rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(ac_merged['Latitude'], ac_merged['Longitude'], ac_merged['Neighborhood'], ac_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # Cluster n uses palette entry n - 1, so cluster 0 takes the last colour.
    # The modulo spells out that wrap-around instead of relying on a negative
    # list index.
    marker_color = rainbow[(cluster - 1) % kclusters]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

- <font color='red'>Red</font> indicates Cluster 0 
- <font color='green'>Green</font> indicates Cluster 2
- <font color='violet'>Violet</font> indicates Cluster 1 

In [None]:
# save the map as HTML file
# (relative path, so the file is written to the current working directory)
map_clusters.save('map_clusters.html')

## 8. Examining the clusters

#### Cluster 0

In [64]:
# rows whose k-means label is 0
ac_merged[ac_merged['Cluster Labels'].eq(0)]

Unnamed: 0,Neighborhood,Fruit & Vegetable Store,Cluster Labels,Latitude,Longitude
63,Westmere,0.0,0,-37.56821,175.1402
26,Mount Eden,0.0,0,-36.883602,174.754237
27,Mount Roskill,0.0,0,-36.916066,174.736536
28,Mount Wellington,0.0,0,-36.912733,174.839904
29,New Windsor,0.0,0,-36.90531,174.712787
30,Newmarket,0.0,0,-36.86741,174.776385
62,Westfield,0.0,0,-36.95,174.85
52,Saint Marys Bay,0.0,0,-36.8382,174.74019
33,Onehunga,0.0,0,-36.920599,174.790655
51,Saint Johns,0.0,0,-36.8682,174.84019


#### Cluster 1

In [65]:
# rows whose k-means label is 1
ac_merged[ac_merged['Cluster Labels'].eq(1)]

Unnamed: 0,Neighborhood,Fruit & Vegetable Store,Cluster Labels,Latitude,Longitude
5,Blockhouse Bay,0.071429,1,-36.9282,174.70019


#### Cluster 2

In [66]:
# rows whose k-means label is 2
ac_merged[ac_merged['Cluster Labels'].eq(2)]

Unnamed: 0,Neighborhood,Fruit & Vegetable Store,Cluster Labels,Latitude,Longitude
60,Waterview,0.02439,2,-36.879466,174.699364
61,Western Springs,0.014493,2,-36.863106,174.720365
0,Arch Hill,0.01,2,-36.86302,174.74858
47,Ponsonby,0.01,2,-36.850733,174.739223
34,Oneroa,0.033333,2,-36.77819,175.01019
32,One Tree Hill,0.022727,2,-36.904613,174.789187
24,Morningside,0.01,2,-36.8682,174.73019
20,Kohimarama,0.017241,2,-36.85726,174.840978
19,Kingsland,0.01,2,-36.870197,174.745209
16,Grey Lynn,0.01,2,-36.861569,174.739555
