# CAPSTONE - Identifying ideal Restaurant Location

In [97]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import requests

In [61]:
data = {'Business/Institution': ['M.D. Anderson Cancer Center', 'Texas Childrens Hospital', 'Dow Chemical', 'BP Energy Company'],
        'Address': ['1515 Holcombe Blvd., Houston, TX 77030','6621 Fannin St, Houston, TX 77030', '1254 Enclave Pkwy, Houston, TX 77077', '201 Helios Way, Houston, TX 77079'],
        'Latitude': [29.707, 29.782, 29.763, 29.782],
        'Longitude': [-95.397, -95.636, -95.621, -95.636]
        }

houston_df = DataFrame(data,columns= ['Business/Institution', 'Address', 'Latitude', 'Longitude'])

In [62]:
print(houston_df)

          Business/Institution                                 Address  \
0  M.D. Anderson Cancer Center  1515 Holcombe Blvd., Houston, TX 77030   
1     Texas Childrens Hospital       6621 Fannin St, Houston, TX 77030   
2                 Dow Chemical    1254 Enclave Pkwy, Houston, TX 77077   
3            BP Energy Company       201 Helios Way, Houston, TX 77079   

   Latitude  Longitude  
0    29.707    -95.397  
1    29.782    -95.636  
2    29.763    -95.621  
3    29.782    -95.636  


### Get nearby venues using Foursquare

In [63]:
#@hidden_cell
CLIENT_ID = '5QVZLMOJBASMFQ0IPSI0SCGUW0UN3RQXJ4GV03THBH0TYJFQ'
CLIENT_SECRET = 'YFUFDXRKLXPVQSTPUWM0LHBJIOXKVVBN1E4NTF5MVJP0KSGC'
VERSION = '20180605'

In [71]:
def getNearbyVenues(names, latitudes, longitudes):
    radius=500 
    LIMIT=100 # retrieve data for only 100 venues per business/institution
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
    # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
    # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Business/Institution', 
                  'Business/Institution Latitude', 
                  'Business/Institution Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [72]:
houston_venues = getNearbyVenues(names=houston_df['Business/Institution'],
                                   latitudes=houston_df['Latitude'],
                                   longitudes=houston_df['Longitude']
                                  )

M.D. Anderson Cancer Center
Texas Childrens Hospital
Dow Chemical
BP Energy Company


In [73]:
houston_venues.head()

Unnamed: 0,Business/Institution,Business/Institution Latitude,Business/Institution Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M.D. Anderson Cancer Center,29.707,-95.397,Jesse H. Jones Rotary House International,29.705968,-95.396263,Hotel
1,M.D. Anderson Cancer Center,29.707,-95.397,Chick-fil-A,29.707939,-95.397616,Fast Food Restaurant
2,M.D. Anderson Cancer Center,29.707,-95.397,Cafe Anderson,29.706995,-95.397711,Restaurant
3,M.D. Anderson Cancer Center,29.707,-95.397,Chick-fil-A,29.70938,-95.397482,Fast Food Restaurant
4,M.D. Anderson Cancer Center,29.707,-95.397,Lantern Cafe,29.704928,-95.397462,Restaurant


In [74]:
print(houston_venues.shape)
houston_venues.head(100)

(44, 7)


Unnamed: 0,Business/Institution,Business/Institution Latitude,Business/Institution Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M.D. Anderson Cancer Center,29.707,-95.397,Jesse H. Jones Rotary House International,29.705968,-95.396263,Hotel
1,M.D. Anderson Cancer Center,29.707,-95.397,Chick-fil-A,29.707939,-95.397616,Fast Food Restaurant
2,M.D. Anderson Cancer Center,29.707,-95.397,Cafe Anderson,29.706995,-95.397711,Restaurant
3,M.D. Anderson Cancer Center,29.707,-95.397,Chick-fil-A,29.70938,-95.397482,Fast Food Restaurant
4,M.D. Anderson Cancer Center,29.707,-95.397,Lantern Cafe,29.704928,-95.397462,Restaurant
5,M.D. Anderson Cancer Center,29.707,-95.397,Third Coast Restaurant,29.709996,-95.397619,Breakfast Spot
6,M.D. Anderson Cancer Center,29.707,-95.397,Smoothie King,29.706943,-95.397153,Smoothie Shop
7,M.D. Anderson Cancer Center,29.707,-95.397,The MarketPlace,29.709587,-95.398621,Food Court
8,M.D. Anderson Cancer Center,29.707,-95.397,M.D. Anderson Fitness Center,29.70459,-95.39695,Gym / Fitness Center
9,M.D. Anderson Cancer Center,29.707,-95.397,Starbucks,29.706948,-95.397712,Coffee Shop


In [75]:
print('There are {} unique categories.'.format(len(houston_venues['Venue Category'].unique()))) # unique Venue Categories

There are 21 unique categories.


## Number of Venues per Business/Institution

In [76]:
houston_venues.groupby('Business/Institution').count()

Unnamed: 0_level_0,Business/Institution Latitude,Business/Institution Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Business/Institution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BP Energy Company,7,7,7,7,7,7
Dow Chemical,9,9,9,9,9,9
M.D. Anderson Cancer Center,21,21,21,21,21,21
Texas Childrens Hospital,7,7,7,7,7,7


## Frequency of each Venue Category

In [77]:
houston_venues.groupby('Venue Category').count()

Unnamed: 0_level_0,Business/Institution,Business/Institution Latitude,Business/Institution Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATM,1,1,1,1,1,1
American Restaurant,4,4,4,4,4,4
Bagel Shop,1,1,1,1,1,1
Breakfast Spot,1,1,1,1,1,1
Bus Station,2,2,2,2,2,2
Café,3,3,3,3,3,3
Cajun / Creole Restaurant,1,1,1,1,1,1
Coffee Shop,8,8,8,8,8,8
Donut Shop,1,1,1,1,1,1
Event Space,2,2,2,2,2,2


In [84]:
# one hot encoding
one_hot = pd.get_dummies(houston_venues[['Venue Category']], prefix="", prefix_sep="")
one_hot.drop(one_hot.columns[1], axis = 'columns')
one_hot.insert(loc=0, column='Business/Institution', value=houston_venues['Business/Institution'].to_list() )
one_hot.shape

(44, 22)

In [85]:
houston_grouped = one_hot.groupby('Business/Institution').mean().reset_index()
houston_grouped.head()

Unnamed: 0,Business/Institution,ATM,American Restaurant,Bagel Shop,Breakfast Spot,Bus Station,Café,Cajun / Creole Restaurant,Coffee Shop,Donut Shop,...,Food Court,Gym,Gym / Fitness Center,Hotel,Park,Restaurant,Sandwich Place,Smoothie Shop,Sushi Restaurant,Vietnamese Restaurant
0,BP Energy Company,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.285714,0.0,...,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
1,Dow Chemical,0.0,0.0,0.111111,0.0,0.0,0.111111,0.111111,0.111111,0.111111,...,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.111111
2,M.D. Anderson Cancer Center,0.047619,0.190476,0.0,0.047619,0.0,0.0,0.0,0.142857,0.0,...,0.047619,0.0,0.047619,0.047619,0.0,0.095238,0.142857,0.047619,0.0,0.0
3,Texas Childrens Hospital,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.285714,0.0,...,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0


In [86]:
houston_grouped.shape

(4, 22)

In [87]:
#Sort venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Top 10 Venues per neighborhood

In [89]:
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Business/Institution']
for ind in np.arange(10):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
business_inst_venues_sorted = pd.DataFrame(columns=columns)
business_inst_venues_sorted['Business/Institution'] =houston_grouped['Business/Institution']

for ind in np.arange(houston_grouped.shape[0]):
    business_inst_venues_sorted.iloc[ind, 1:] = return_most_common_venues(houston_grouped.iloc[ind, :], 10)

business_inst_venues_sorted.head()

Unnamed: 0,Business/Institution,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,BP Energy Company,Coffee Shop,Park,Bus Station,Café,Gym,Event Space,Vietnamese Restaurant,Donut Shop,American Restaurant,Bagel Shop
1,Dow Chemical,Vietnamese Restaurant,Donut Shop,Sandwich Place,Bagel Shop,Café,Cajun / Creole Restaurant,Food Court,Sushi Restaurant,Coffee Shop,American Restaurant
2,M.D. Anderson Cancer Center,American Restaurant,Fast Food Restaurant,Sandwich Place,Coffee Shop,Restaurant,Food Court,Breakfast Spot,ATM,Gym / Fitness Center,Hotel
3,Texas Childrens Hospital,Coffee Shop,Park,Bus Station,Café,Gym,Event Space,Vietnamese Restaurant,Donut Shop,American Restaurant,Bagel Shop


### Clustering the businesses/institutions

In [93]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 4

houston_grouped_clustering = houston_grouped.drop('Business/Institution', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(houston_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  return_n_iter=True)


array([0, 2, 1, 0], dtype=int32)

### New dataframe that include cluster, neighborhoods and top 10 venues

In [94]:
# add clustering labels
business_inst_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

business_inst_venues_sorted

Unnamed: 0,Cluster Labels,Business/Institution,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,BP Energy Company,Coffee Shop,Park,Bus Station,Café,Gym,Event Space,Vietnamese Restaurant,Donut Shop,American Restaurant,Bagel Shop
1,2,Dow Chemical,Vietnamese Restaurant,Donut Shop,Sandwich Place,Bagel Shop,Café,Cajun / Creole Restaurant,Food Court,Sushi Restaurant,Coffee Shop,American Restaurant
2,1,M.D. Anderson Cancer Center,American Restaurant,Fast Food Restaurant,Sandwich Place,Coffee Shop,Restaurant,Food Court,Breakfast Spot,ATM,Gym / Fitness Center,Hotel
3,0,Texas Childrens Hospital,Coffee Shop,Park,Bus Station,Café,Gym,Event Space,Vietnamese Restaurant,Donut Shop,American Restaurant,Bagel Shop


In [95]:
houston_merged = houston_df

# merge houston_grouped with houston_data to add latitude/longitude for each business/institution

houston_merged = pd.merge(houston_df,business_inst_venues_sorted, on='Business/Institution')
houston_merged

Unnamed: 0,Business/Institution,Address,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M.D. Anderson Cancer Center,"1515 Holcombe Blvd., Houston, TX 77030",29.707,-95.397,1,American Restaurant,Fast Food Restaurant,Sandwich Place,Coffee Shop,Restaurant,Food Court,Breakfast Spot,ATM,Gym / Fitness Center,Hotel
1,Texas Childrens Hospital,"6621 Fannin St, Houston, TX 77030",29.782,-95.636,0,Coffee Shop,Park,Bus Station,Café,Gym,Event Space,Vietnamese Restaurant,Donut Shop,American Restaurant,Bagel Shop
2,Dow Chemical,"1254 Enclave Pkwy, Houston, TX 77077",29.763,-95.621,2,Vietnamese Restaurant,Donut Shop,Sandwich Place,Bagel Shop,Café,Cajun / Creole Restaurant,Food Court,Sushi Restaurant,Coffee Shop,American Restaurant
3,BP Energy Company,"201 Helios Way, Houston, TX 77079",29.782,-95.636,0,Coffee Shop,Park,Bus Station,Café,Gym,Event Space,Vietnamese Restaurant,Donut Shop,American Restaurant,Bagel Shop


In [100]:
!conda install -c conda-forge geopy --yes

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          91 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ##

In [101]:
from geopy.geocoders import Nominatim

In [102]:
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode('Houston, TX')
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Houston are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Houston are 29.7589382, -95.3676974.


In [106]:
import folium # map rendering library

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import os # import os module

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [107]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(houston_merged['Latitude'], houston_merged['Longitude'], houston_merged['Business/Institution'], houston_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters