# web scraping to get the data and convert to dataframe

In [1]:
import pandas as pd
import requests
! pip install BeautifulSoup4
import bs4 as bs
import urllib.request
from bs4 import BeautifulSoup
!pip install lxml
import lxml
import html5lib
import numpy as np

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 5.1MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.1 soupsieve-2.0.1
Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/55/6f/c87dffdd88a54dd26a3a9fef1d14b6384a9933c455c54ce3ca7d64a84c88/lxml-4.5.1-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 8.6MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.1


### Get the geospatial information of the recommended parks online

In [2]:
from io import StringIO

# Download the latlong.net page that lists the recommended parks and parse
# its first HTML table into pandas.
res = requests.get("https://www.latlong.net/category/parks-199-53.html", timeout=30)
res.raise_for_status()  # stop on an HTTP error instead of parsing an error page
soup = BeautifulSoup(res.content, 'html.parser')
tables = soup.find_all('table')
if not tables:
    raise ValueError('No <table> element was found on the parks page.')
table = tables[0]
# read_html returns a list of DataFrames. The markup is wrapped in StringIO
# because newer pandas releases deprecate passing literal HTML strings.
df = pd.read_html(StringIO(str(table)))


In [3]:
# df is the list produced by pd.read_html; the parks table is its first entry.
data = df[0]
data

Unnamed: 0,Place Name,Latitude,Longitude
0,"Fort Canning Hill, Singapore",1.294444,103.846947
1,"Admiralty Park, Singapore",1.446392,103.780655
2,"Mount Faber Park, Singapore",1.273806,103.817497
3,"Gardens by the Bay, Singapore",1.282375,103.864273


# check the neighborhoods of the four parks 
##### download and import

In [4]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0



Downloading and Extracting Packages
geopy-1.22.0         | 63 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ###############################

##### get the latitude and longitude value for Singapore  

In [5]:
address = 'Singapore'

# Resolve the address to coordinates with the Nominatim geocoder.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Singapore are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Singapore are 1.357107, 103.8194992.


##### plot Singapore with the parks on it 

In [6]:
# create a map centred on the geocoded Singapore coordinates
map_Sing = folium.Map(location=[latitude, longitude], zoom_start=11)

# draw one blue circle per park; clicking it shows the park name
park_rows = zip(data['Latitude'], data['Longitude'], data['Place Name'])
for park_lat, park_lng, park_title in park_rows:
    popup = folium.Popup(park_title, parse_html=True)
    marker = folium.CircleMarker(
        [park_lat, park_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_Sing)
map_Sing

#### Define Foursquare Credentials and Version

In [7]:
import os

# Foursquare credentials are read from the environment so they are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# published with the notebook and should be regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'the Foursquare requests below will be rejected.')

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Only report whether the secret is present; never echo it into saved output.
print('CLIENT_SECRET:' + ('<set>' if CLIENT_SECRET else '<missing>'))

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


### Explore the venues near the first park

In [8]:
# Use the first park in the table (row label 0) as a worked example.
# The original cell opened with a bare data.loc[...] expression whose value
# was discarded; it has been removed.
park_name = data.loc[0, 'Place Name'] # park name
park_latitude = data.loc[0, 'Latitude'] # park latitude value
park_longitude = data.loc[0, 'Longitude'] # park longitude value

print('Latitude and longitude values of {} are {}, {}.'.format(park_name,
                                                               park_latitude,
                                                               park_longitude))

Latitude and longitude values of Fort Canning Hill, Singapore are 1.2944440000000002, 103.846947.


### Now, let's get the top 100 venues within a radius of 500 meters of Fort Canning Hill

In [13]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# Build the venues/explore request for the example park's coordinates.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    park_latitude, 
    park_longitude, 
    radius, 
    LIMIT)
# Fail here on an HTTP error (bad credentials, quota, ...) rather than later
# with a KeyError while digging through an error payload; the timeout keeps
# the cell from hanging indefinitely.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
#results


In [10]:

def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must expose the venue's category list under ``'categories'`` or,
        failing that, under ``'venue.categories'``.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the
    category list is empty or ``None``.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a
        # real problem and must propagate instead of being swallowed.
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']


In [14]:
venues = results['response']['groups'][0]['items']

# flatten JSON; pd.json_normalize is the supported replacement for the
# deprecated pandas.io.json.json_normalize
nearby_venues = pd.json_normalize(venues)

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the part after the last dot ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

16 venues were returned by Foursquare.


  This is separate from the ipykernel package so we can avoid doing imports until


#### Explore the neighborhoods of all 4 parks in Singapore

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query Foursquare venues/explore around each place and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Name and coordinates of each place; they are consumed in parallel.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Place Name',
        'Place Name Latitude', 'Place Name Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but keeps these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each place name as it is processed.
    """
    column_names = ['Place Name',
                    'Place Name Latitude',
                    'Place Name Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; surface HTTP errors here instead of as a
        # KeyError while unpacking an error payload
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor keeps the schema even with zero rows
    return pd.DataFrame(rows, columns=column_names)

In [16]:
# Collect the nearby venues for every park in the table, then preview them.
parks_venues = getNearbyVenues(
    names=data['Place Name'],
    latitudes=data['Latitude'],
    longitudes=data['Longitude'],
)
print(parks_venues.shape)
parks_venues.head()

Fort Canning Hill, Singapore
Admiralty Park, Singapore
Mount Faber Park, Singapore
Gardens by the Bay, Singapore
(66, 7)


Unnamed: 0,Place Name,Place Name Latitude,Place Name Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Fort Canning Hill, Singapore",1.294444,103.846947,Fort Canning Park,1.295075,103.846421,Park
1,"Fort Canning Hill, Singapore",1.294444,103.846947,National Museum of Singapore,1.296498,103.848462,Museum
2,"Fort Canning Hill, Singapore",1.294444,103.846947,Le Bistrot Du Sommelier,1.294645,103.849577,French Restaurant
3,"Fort Canning Hill, Singapore",1.294444,103.846947,The Substation,1.294367,103.849443,Art Gallery
4,"Fort Canning Hill, Singapore",1.294444,103.846947,Hotel Fort Canning,1.295854,103.845453,Hotel


In [18]:
# Report how many distinct venue categories were found, then count venues per park.
unique_categories = parks_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(unique_categories)))
parks_venues.groupby('Place Name').count()

There are 42 uniques categories.


Unnamed: 0_level_0,Place Name Latitude,Place Name Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Place Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Admiralty Park, Singapore",1,1,1,1,1,1
"Fort Canning Hill, Singapore",16,16,16,16,16,16
"Gardens by the Bay, Singapore",31,31,31,31,31,31
"Mount Faber Park, Singapore",18,18,18,18,18,18


In [19]:
# one hot encoding: one indicator column per venue category
parks_onehot = pd.get_dummies(parks_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the park name in front as 'Neighborhood'. insert() raises ValueError if
# a venue category is itself called 'Neighborhood', whereas assigning the
# column and reordering by position would silently overwrite that category.
parks_onehot.insert(0, 'Neighborhood', parks_venues['Place Name'])

# mean of the indicators = share of each category among a park's venues
parks_grouped = parks_onehot.groupby('Neighborhood').mean().reset_index()
parks_grouped



Unnamed: 0,Neighborhood,Art Gallery,BBQ Joint,Bar,Botanical Garden,Bowling Alley,Bridge,Business Service,Cable Car,Café,...,Playground,Pool,Roof Deck,Satay Restaurant,Scenic Lookout,Sculpture Garden,Seafood Restaurant,Shopping Mall,Spa,Waterfront
0,"Admiralty Park, Singapore",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Fort Canning Hill, Singapore",0.0625,0.0,0.0625,0.0,0.0,0.0,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0625
2,"Gardens by the Bay, Singapore",0.0,0.032258,0.064516,0.064516,0.0,0.064516,0.0,0.0,0.0,...,0.032258,0.0,0.032258,0.032258,0.096774,0.032258,0.064516,0.0,0.0,0.0
3,"Mount Faber Park, Singapore",0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.055556,0.055556,...,0.0,0.055556,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0


In [20]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted
    in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]



##### find the top 10 venues for each neighborhood 

In [21]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return ``rank`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3
    if rank % 100 in (11, 12, 13) or rank % 10 not in (1, 2, 3):
        return '{}th'.format(rank)
    return '{}{}'.format(rank, indicators[rank % 10 - 1])


# create columns according to number of top venues
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# create a new dataframe with one row per park
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = parks_grouped['Neighborhood']

# fill every row with that park's most frequent venue categories
for ind in range(parks_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(parks_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Admiralty Park, Singapore",Park,Waterfront,Gastropub,Garden,Fried Chicken Joint,French Restaurant,Food Court,Dim Sum Restaurant,Coffee Shop,Club House
1,"Fort Canning Hill, Singapore",Hotel,French Restaurant,Waterfront,Park,Bar,Business Service,Coffee Shop,Spa,Lighthouse,Movie Theater
2,"Gardens by the Bay, Singapore",Garden,Scenic Lookout,Hotel,Bridge,Seafood Restaurant,Bar,Botanical Garden,Lounge,Food Court,Gastropub
3,"Mount Faber Park, Singapore",Scenic Lookout,Chinese Restaurant,Hong Kong Restaurant,Park,Grocery Store,Gym,Food Court,Karaoke Bar,Club House,Mountain


In [22]:
# set number of clusters
kclusters = 3

parks_grouped_clustering = parks_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(parks_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 1, 1], dtype=int32)

In [24]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

parks_merged = data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
parks_merged = parks_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Place Name')

parks_merged.head() # check the last columns!

Unnamed: 0,Place Name,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Fort Canning Hill, Singapore",1.294444,103.846947,2,Hotel,French Restaurant,Waterfront,Park,Bar,Business Service,Coffee Shop,Spa,Lighthouse,Movie Theater
1,"Admiralty Park, Singapore",1.446392,103.780655,0,Park,Waterfront,Gastropub,Garden,Fried Chicken Joint,French Restaurant,Food Court,Dim Sum Restaurant,Coffee Shop,Club House
2,"Mount Faber Park, Singapore",1.273806,103.817497,1,Scenic Lookout,Chinese Restaurant,Hong Kong Restaurant,Park,Grocery Store,Gym,Food Court,Karaoke Bar,Club House,Mountain
3,"Gardens by the Bay, Singapore",1.282375,103.864273,1,Garden,Scenic Lookout,Hotel,Bridge,Seafood Restaurant,Bar,Botanical Garden,Lounge,Food Court,Gastropub


In [25]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(parks_merged['Latitude'], parks_merged['Longitude'], parks_merged['Place Name'], parks_merged['Cluster Labels']):
    if pd.isna(cluster):
        # the join found no matching 'Neighborhood' row, so there is no label to color by
        continue
    cluster = int(cluster)  # labels are floats whenever the join introduced NaN
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # index by the label itself; `cluster-1` only worked through
        # negative-index wraparound for cluster 0
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### Examine each cluster

In [38]:
# Parks assigned to cluster 0.
parks_merged[parks_merged['Cluster Labels'] == 0]


Unnamed: 0,Place Name,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,"Admiralty Park, Singapore",1.446392,103.780655,0,Park,Waterfront,Gastropub,Garden,Fried Chicken Joint,French Restaurant,Food Court,Dim Sum Restaurant,Coffee Shop,Club House


In [39]:
# Parks assigned to cluster 1.
parks_merged[parks_merged['Cluster Labels'] == 1]


Unnamed: 0,Place Name,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,"Mount Faber Park, Singapore",1.273806,103.817497,1,Scenic Lookout,Chinese Restaurant,Hong Kong Restaurant,Park,Grocery Store,Gym,Food Court,Karaoke Bar,Club House,Mountain
3,"Gardens by the Bay, Singapore",1.282375,103.864273,1,Garden,Scenic Lookout,Hotel,Bridge,Seafood Restaurant,Bar,Botanical Garden,Lounge,Food Court,Gastropub


In [40]:
# Parks assigned to cluster 2.
parks_merged[parks_merged['Cluster Labels'] == 2]


Unnamed: 0,Place Name,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Fort Canning Hill, Singapore",1.294444,103.846947,2,Hotel,French Restaurant,Waterfront,Park,Bar,Business Service,Coffee Shop,Spa,Lighthouse,Movie Theater
