In [2]:
# import libraries 

import pandas as pd
!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup
import requests
import numpy as np
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize  

!conda install -c conda-forge geocoder --yes
import geocoder # to get coordinates

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    brotlipy-0.7.0             |py36h8c4c3a4_1000         346 KB  conda-forge
    chardet-3.0.4              |py36h9f0ad1d_1006         188 KB  conda-forge
    click-7.1.2                |     pyh9f0ad1d_0          64 KB  conda-forge
    cryptography-2.9.2         |   py36h45558ae_0         613 KB  conda-forge
    future-0.18.2              |   py36h9f0ad1d_1         714 KB  conda-forge

# IBM Applied Data Science Capstone Course 

## Week 5 Final Report 

### Opening a hotel in Taipei, Taiwan 
1. Build a dataframe of neighborhoods in Taipei, Taiwan by web scraping the data from Wikipedia page
2. Get the geographical coordinates of the neighborhoods
3. Obtain the venue data for the neighborhoods from Foursquare API
4. Explore and cluster the neighborhoods
5. Select the best cluster to open a new hotel

### 1. Import Wikipedia data into Dataframe

In [4]:
#retrieve data from Wikipedia and convert the data into created list then Dataframe

data = requests.get("https://en.wikipedia.org/wiki/Category:Districts_of_Taipei").text
soup = BeautifulSoup(data, 'html.parser')
neighborhoodList = []
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)
tpe_df = pd.DataFrame({"Neighborhood": neighborhoodList})

tpe_df.head()

Unnamed: 0,Neighborhood
0,Beitou District
1,"Daan District, Taipei City"
2,"Datong District, Taipei"
3,Eastern District of Taipei
4,Guting District


In [5]:
#confirming number of rows are correct
tpe_df.shape

(16, 1)

### 2. Get geo coorindates 

In [8]:
#retrieve geographical coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Taipei, Taiwan'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

coords = [ get_latlng(neighborhood) for neighborhood in tpe_df["Neighborhood"].tolist() ]

coords

[[25.13289000000003, 121.50253000000009],
 [25.02138000779031, 121.5443399888916],
 [25.06588996335633, 121.51669991892228],
 [25.26553000000007, 121.5227000000001],
 [25.02147006939235, 121.55398993782116],
 [24.993110025535255, 121.54135001500374],
 [24.988789956142977, 121.56372994892956],
 [25.054380047752282, 121.60673007660569],
 [25.069090000000074, 121.58847000000003],
 [25.09313000000003, 121.51976000000002],
 [25.0516499633563, 121.54774002553529],
 [25.03535000000005, 121.49948000000006],
 [24.98974000000004, 121.56963000000007],
 [25.03360992671262, 121.57002000000011],
 [25.052290007790315, 121.52269000389524],
 [25.032470000000046, 121.5185600000001]]

In [9]:
#new dataframe for the coordinates and merge with the district dataframe
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
tpe_df['Latitude'] = df_coords['Latitude']
tpe_df['Longitude'] = df_coords['Longitude']
print(tpe_df.shape)
tpe_df

(16, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Beitou District,25.13289,121.50253
1,"Daan District, Taipei City",25.02138,121.54434
2,"Datong District, Taipei",25.06589,121.5167
3,Eastern District of Taipei,25.26553,121.5227
4,Guting District,25.02147,121.55399
5,Jingmei District,24.99311,121.54135
6,Muzha District,24.98879,121.56373
7,"Nangang District, Taipei",25.05438,121.60673
8,Neihu District,25.06909,121.58847
9,Shilin District,25.09313,121.51976


In [11]:
#save the new dataframe as CSV
tpe_df.to_csv("tpe_df.csv", index=False)

### 3. Create map of Taipei with neighborhoods displayed

In [12]:
# get the coordinates of Taipei
address = 'Taipei, Taiwan'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate is {}, {}.'.format(latitude, longitude))

The geograpical coordinate is 25.0375198, 121.5636796.


In [14]:
# Map of Taipei using coordinate 
map_tpe = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, neighborhood in zip(tpe_df['Latitude'], tpe_df['Longitude'], tpe_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_tpe)  
    
map_tpe

In [15]:
# saved as HTML
map_tpe.save('map_tpe.html')

### 4. Use Foursquare to explore neighborhoods

In [16]:
#exploring neighborhoods

CLIENT_ID = 'PNNTTIIPRW4E5HOYFVP2JU20KUJUX0YF3A15WYIRHRHZHIF3' 
CLIENT_SECRET = '3H5OSLL0KIWIXIOUVXQX1XJHYIDXZKLJKELUXXJ5PSQQSF4N'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PNNTTIIPRW4E5HOYFVP2JU20KUJUX0YF3A15WYIRHRHZHIF3
CLIENT_SECRET:3H5OSLL0KIWIXIOUVXQX1XJHYIDXZKLJKELUXXJ5PSQQSF4N


<b> Finding the top 100 venues within 2000 meters radius </b>

In [17]:
radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(tpe_df['Latitude'], tpe_df['Longitude'], tpe_df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [18]:
# new dataframe for the venue lists

venues_df = pd.DataFrame(venues)
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(1375, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Beitou District,25.13289,121.50253,蔡元益紅茶（總店）,25.131896,121.502012,Tea Room
1,Beitou District,25.13289,121.50253,Beitou Park (北投公園),25.136605,121.504432,Park
2,Beitou District,25.13289,121.50253,Beitou Market (北投市場 Beitou Market),25.132509,121.50218,Farmers Market
3,Beitou District,25.13289,121.50253,日勝生加賀屋 Kagaya Taipei,25.136448,121.506889,Hotel
4,Beitou District,25.13289,121.50253,拾米屋 SheMe House,25.136224,121.499005,Café


<b> Checking how many venues for each neighborhood and how many unique categories among the venues </b>

In [19]:
# explore numbers of venues 
venues_df.groupby(["Neighborhood"]).count()


Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Beitou District,67,67,67,67,67,67
"Daan District, Taipei City",100,100,100,100,100,100
"Datong District, Taipei",100,100,100,100,100,100
Eastern District of Taipei,4,4,4,4,4,4
Guting District,100,100,100,100,100,100
Jingmei District,87,87,87,87,87,87
Muzha District,58,58,58,58,58,58
"Nangang District, Taipei",82,82,82,82,82,82
Neihu District,100,100,100,100,100,100
Shilin District,100,100,100,100,100,100


In [21]:
# explore unique categories of venues 
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))
venues_df['VenueCategory'].unique()[:50]

There are 175 uniques categories.


array(['Tea Room', 'Park', 'Farmers Market', 'Hotel', 'Café', 'Resort',
       'Hot Spring', 'Asian Restaurant', 'Vegetarian / Vegan Restaurant',
       'Noodle House', 'Italian Restaurant', 'Dessert Shop',
       'Historic Site', 'Pool', 'Coffee Shop', 'Trail', 'History Museum',
       'Athletics & Sports', 'Convenience Store', 'Sushi Restaurant',
       'Gym / Fitness Center', 'Chinese Restaurant', 'Gym',
       'Fast Food Restaurant', 'Japanese Restaurant', 'Bus Station',
       'Metro Station', 'Arepa Restaurant', 'Taiwanese Restaurant',
       'Mountain', 'Train', 'Szechuan Restaurant', 'Train Station',
       'Malay Restaurant', 'Art Museum', 'Bistro', 'Pub',
       'Paper / Office Supplies Store', 'Bakery', 'Breakfast Spot',
       'Hotpot Restaurant', 'Massage Studio', 'Brewery', 'Restaurant',
       'Ice Cream Shop', 'Chinese Breakfast Place', 'Lounge', 'Diner',
       'Pizza Place', 'Cocktail Bar'], dtype=object)

### 5. Analyze each neighborhood 

In [23]:
# analyze the neighborhood on the different categories 

# one hot encoding
tpe_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tpe_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [tpe_onehot.columns[-1]] + list(tpe_onehot.columns[:-1])
tpe_onehot = tpe_onehot[fixed_columns]

print(tpe_onehot.shape)
tpe_onehot.head()

(1375, 176)


Unnamed: 0,Neighborhoods,American Restaurant,Arcade,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Trail,Train,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Yoga Studio,Zoo,Zoo Exhibit
0,Beitou District,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Beitou District,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Beitou District,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Beitou District,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Beitou District,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<b> Grouping neighborhoods by row by the mean of category frequency </b>

In [24]:
tpe_grouped = tpe_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(tpe_grouped.shape)
tpe_grouped

(16, 176)


Unnamed: 0,Neighborhoods,American Restaurant,Arcade,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Trail,Train,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Yoga Studio,Zoo,Zoo Exhibit
0,Beitou District,0.0,0.0,0.014925,0.0,0.0,0.0,0.029851,0.014925,0.0,...,0.014925,0.014925,0.014925,0.014925,0.0,0.0,0.0,0.0,0.0,0.0
1,"Daan District, Taipei City",0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0
2,"Datong District, Taipei",0.0,0.0,0.0,0.0,0.01,0.01,0.04,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Eastern District of Taipei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Guting District,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,...,0.02,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0
5,Jingmei District,0.011494,0.0,0.0,0.0,0.0,0.0,0.045977,0.0,0.0,...,0.0,0.0,0.0,0.022989,0.0,0.0,0.0,0.0,0.0,0.0
6,Muzha District,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Nangang District, Taipei",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012195,...,0.012195,0.0,0.02439,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Neihu District,0.02,0.0,0.0,0.0,0.0,0.0,0.03,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
9,Shilin District,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,...,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0


<b> A new dataframe with Hotel data </b>

In [25]:
# dataframe with just hotel data

tpe_hotel = tpe_grouped[["Neighborhoods", "Hotel"]]
tpe_hotel.head()

Unnamed: 0,Neighborhoods,Hotel
0,Beitou District,0.149254
1,"Daan District, Taipei City",0.04
2,"Datong District, Taipei",0.1
3,Eastern District of Taipei,0.0
4,Guting District,0.04


### 6. Cluster Neighborhoods 

In [30]:
# create neighborhoods clusters

kclusters = 3
tpe_clustering = tpe_hotel.drop(["Neighborhoods"],1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tpe_clustering)
kmeans.labels_[0:10]

tpe_merged = tpe_hotel.copy()
tpe_merged["Cluster Labels"] = kmeans.labels_

tpe_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
tpe_merged.head()

tpe_merged = tpe_merged.join(tpe_df.set_index("Neighborhood"), on="Neighborhood")
print(tpe_merged.shape)
tpe_merged.head() 

(16, 5)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Beitou District,0.149254,1,25.13289,121.50253
1,"Daan District, Taipei City",0.04,2,25.02138,121.54434
2,"Datong District, Taipei",0.1,1,25.06589,121.5167
3,Eastern District of Taipei,0.0,0,25.26553,121.5227
4,Guting District,0.04,2,25.02147,121.55399


In [31]:
print(tpe_merged.shape)
tpe_merged.sort_values(["Cluster Labels"], inplace = True)
tpe_merged 

(16, 5)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
3,Eastern District of Taipei,0.0,0,25.26553,121.5227
5,Jingmei District,0.0,0,24.99311,121.54135
6,Muzha District,0.0,0,24.98879,121.56373
8,Neihu District,0.02,0,25.06909,121.58847
12,Wenshan District,0.0,0,24.98974,121.56963
15,Zhongzheng District,0.01,0,25.03247,121.51856
0,Beitou District,0.149254,1,25.13289,121.50253
2,"Datong District, Taipei",0.1,1,25.06589,121.5167
14,"Zhongshan District, Taipei",0.11,1,25.05229,121.52269
1,"Daan District, Taipei City",0.04,2,25.02138,121.54434


### 7. Visualizing Clusters

In [41]:
# visualizing the clusters 

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(tpe_merged['Latitude'], tpe_merged['Longitude'], tpe_merged['Neighborhood'], tpe_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [36]:
map_clusters.save('map_clusters.html')

### 8. Examine Clusters

In [38]:
# Cluster 0
tpe_merged.loc[tpe_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
3,Eastern District of Taipei,0.0,0,25.26553,121.5227
5,Jingmei District,0.0,0,24.99311,121.54135
6,Muzha District,0.0,0,24.98879,121.56373
8,Neihu District,0.02,0,25.06909,121.58847
12,Wenshan District,0.0,0,24.98974,121.56963
15,Zhongzheng District,0.01,0,25.03247,121.51856


In [39]:
# Cluster 1
tpe_merged.loc[tpe_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Beitou District,0.149254,1,25.13289,121.50253
2,"Datong District, Taipei",0.1,1,25.06589,121.5167
14,"Zhongshan District, Taipei",0.11,1,25.05229,121.52269


In [40]:
# Cluster 2 
tpe_merged.loc[tpe_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
1,"Daan District, Taipei City",0.04,2,25.02138,121.54434
4,Guting District,0.04,2,25.02147,121.55399
7,"Nangang District, Taipei",0.036585,2,25.05438,121.60673
9,Shilin District,0.03,2,25.09313,121.51976
10,"Songshan District, Taipei",0.05,2,25.05165,121.54774
11,Wanhua District,0.06,2,25.03535,121.49948
13,"Xinyi District, Taipei",0.06,2,25.03361,121.57002


### 9. Observations

Upon the data analyzed, it is clear that Cluster 0 is the best locations to open up new hotel. These locations are concentrated not necessarily in the city center and are even generally on the side lines of Taipei and the neighboring new Taipei city. 