# Coursera Capstone Project

### Problem: Determine the best location for an Indian restaurant in Philadelphia

##### Import Packages

In [1]:
# Pandas dataframe
import pandas as pd

# Geographic locations - latitiude and longitude
from geopy.geocoders import Nominatim 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
import requests # library to handle requests

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be 

##### Prepare data for analysis

List of Philadelphia neighborhoods and zip codes

Assumption: *Since the list of neighborhoods and zipcodes is not readily available online, they were manually identified and loaded into a dataframe.
            Source: https://www.visitphilly.com/areas/philadelphia-neighborhoods/*

In [2]:
data = {'Neighborhood':['Bella Vista',
'Callow Hill',
'Chinatown',
'East Passyunk',
'Fairmount',
'Fishtown',
'Graduate Hospital',
'Logan Square',
'Market East',
'Midtown Village',
'Northern Liberties',
'Old City',
'Pennsport',
'Powelton Village',
'Queen Village',
'Rittenhouse Square',
'Society Hill',
'Spring Garden',
'Spruce Hill and Cedar Park',
'University City',
'Washington Square West'], 

'Zipcode':['19147',
'19107',
'19107',
'19148',
'19103',
'19125',
'19146',
'19102',
'19107',
'19107',
'19123',
'19106',
'19147',
'19104',
'19147',
'19103',
'19106',
'19130',
'19104',
'19104',
'19147',
]} 

In [3]:
df = pd.DataFrame(data) 
df

Unnamed: 0,Neighborhood,Zipcode
0,Bella Vista,19147
1,Callow Hill,19107
2,Chinatown,19107
3,East Passyunk,19148
4,Fairmount,19103
5,Fishtown,19125
6,Graduate Hospital,19146
7,Logan Square,19102
8,Market East,19107
9,Midtown Village,19107


In [4]:
## We can observe from the above data that certain postal codes associate to multiple neighborhoods
df.groupby("Zipcode").Neighborhood.nunique()

Zipcode
19102    1
19103    2
19104    3
19106    2
19107    4
19123    1
19125    1
19130    1
19146    1
19147    4
19148    1
Name: Neighborhood, dtype: int64

In [5]:
# Therefore, we can group the neighborhoods with the same zipcode separated by a comma
df_neighborhood = df.groupby(['Zipcode'])['Neighborhood'].apply(', '.join).reset_index()
df_neighborhood

Unnamed: 0,Zipcode,Neighborhood
0,19102,Logan Square
1,19103,"Fairmount, Rittenhouse Square"
2,19104,"Powelton Village, Spruce Hill and Cedar Park, ..."
3,19106,"Old City, Society Hill"
4,19107,"Callow Hill, Chinatown, Market East, Midtown V..."
5,19123,Northern Liberties
6,19125,Fishtown
7,19130,Spring Garden
8,19146,Graduate Hospital
9,19147,"Bella Vista, Pennsport, Queen Village, Washing..."


##### Concatenate the latitude and longitude corresponding to each zipcode

In [6]:
df_latlon = pd.DataFrame(columns = ['Zipcode', 'Latitude', 'Longitude'])

for zipcode in df_neighborhood.Zipcode:
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(zipcode)
    latitude = location.latitude
    longitude = location.longitude
    df_latlon = df_latlon.append({'Zipcode': zipcode, 'Latitude': latitude, 'Longitude': longitude}, ignore_index=True)
    
    print('The geograpical coordinate of Zipcode {} is {}, {}.'.format(zipcode, latitude, longitude))

The geograpical coordinate of Zipcode 19102 is 39.9462124, -75.1650177.
The geograpical coordinate of Zipcode 19103 is 39.9558698, -75.17183038606788.
The geograpical coordinate of Zipcode 19104 is 39.9492522, -75.2093769.
The geograpical coordinate of Zipcode 19106 is 39.9475001, -75.1462128.
The geograpical coordinate of Zipcode 19107 is 39.9470719, -75.1546484.
The geograpical coordinate of Zipcode 19123 is 39.948016, -75.2234057.
The geograpical coordinate of Zipcode 19125 is 44.12166069492773, 9.836874853240918.
The geograpical coordinate of Zipcode 19130 is 39.9643103, -75.1660569.
The geograpical coordinate of Zipcode 19146 is 39.940421549678355, -75.177791448773.
The geograpical coordinate of Zipcode 19147 is 39.93748556550609, -75.15540403916046.
The geograpical coordinate of Zipcode 19148 is 39.9269224, -75.1670587.


In [7]:
df_latlon

Unnamed: 0,Zipcode,Latitude,Longitude
0,19102,39.946212,-75.165018
1,19103,39.95587,-75.17183
2,19104,39.949252,-75.209377
3,19106,39.9475,-75.146213
4,19107,39.947072,-75.154648
5,19123,39.948016,-75.223406
6,19125,44.121661,9.836875
7,19130,39.96431,-75.166057
8,19146,39.940422,-75.177791
9,19147,39.937486,-75.155404


In [8]:
## Next, we join the latitude and longitude with the original table containing the neighborhood information

df_merged = pd.merge(df_neighborhood, df_latlon, how = 'left', left_on = 'Zipcode', right_on = 'Zipcode')
df_merged

Unnamed: 0,Zipcode,Neighborhood,Latitude,Longitude
0,19102,Logan Square,39.946212,-75.165018
1,19103,"Fairmount, Rittenhouse Square",39.95587,-75.17183
2,19104,"Powelton Village, Spruce Hill and Cedar Park, ...",39.949252,-75.209377
3,19106,"Old City, Society Hill",39.9475,-75.146213
4,19107,"Callow Hill, Chinatown, Market East, Midtown V...",39.947072,-75.154648
5,19123,Northern Liberties,39.948016,-75.223406
6,19125,Fishtown,44.121661,9.836875
7,19130,Spring Garden,39.96431,-75.166057
8,19146,Graduate Hospital,39.940422,-75.177791
9,19147,"Bella Vista, Pennsport, Queen Village, Washing...",39.937486,-75.155404


In [9]:
## Viewing the above dataframe, it appears the neighborhood Fishtown may have an incorrect latitude, longitude. Let us exclude this neighborhood from our analysis

df_merged_fin = df_merged[df_merged.Neighborhood != 'Fishtown'].reset_index(drop=True)
df_merged_fin

Unnamed: 0,Zipcode,Neighborhood,Latitude,Longitude
0,19102,Logan Square,39.946212,-75.165018
1,19103,"Fairmount, Rittenhouse Square",39.95587,-75.17183
2,19104,"Powelton Village, Spruce Hill and Cedar Park, ...",39.949252,-75.209377
3,19106,"Old City, Society Hill",39.9475,-75.146213
4,19107,"Callow Hill, Chinatown, Market East, Midtown V...",39.947072,-75.154648
5,19123,Northern Liberties,39.948016,-75.223406
6,19130,Spring Garden,39.96431,-75.166057
7,19146,Graduate Hospital,39.940422,-75.177791
8,19147,"Bella Vista, Pennsport, Queen Village, Washing...",39.937486,-75.155404
9,19148,East Passyunk,39.926922,-75.167059


##### Exploratory analysis: Using geopy library to create a map of Philadelphia

In [10]:
# Identifying the co-ordinates for Philadelphia
address = 'Philadelphia, PA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Philadelphia are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Philadelphia are 39.9527237, -75.1635262.


#### Create a map of Philadelphia with neighborhoods superimposed on top.

In [11]:
# create map of New York using latitude and longitude values
map_philly = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, neighborhood in zip(df_merged_fin['Latitude'], df_merged_fin['Longitude'], df_merged_fin['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_philly)  
    
map_philly

## Using Foursquare data to obtain Venues information

##### Define Foursquare Credentials and Version

In [12]:
CLIENT_ID = 'SF4XMZOXZH4QTLHEDLBAUT0JZM1MUQSRKGRZNNR00KEQOKLU' # your Foursquare ID
CLIENT_SECRET = 'GYWTUYF0U2XGOS50O14NZW2FQRBIWWJCVAE1EU14RF5TVKRB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: SF4XMZOXZH4QTLHEDLBAUT0JZM1MUQSRKGRZNNR00KEQOKLU
CLIENT_SECRET:GYWTUYF0U2XGOS50O14NZW2FQRBIWWJCVAE1EU14RF5TVKRB


In [13]:
## Initialize data 

# Obtain top 500 venues for any given neighborhood
LIMIT = 500

##### Explore Neighborhoods in Philadelphia

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
philly_venues = getNearbyVenues(names=df_merged_fin['Neighborhood'],
                                   latitudes=df_merged_fin['Latitude'],
                                   longitudes=df_merged_fin['Longitude']
                                  )

Logan Square
Fairmount, Rittenhouse Square
Powelton Village, Spruce Hill and Cedar Park, University City
Old City, Society Hill
Callow Hill, Chinatown, Market East, Midtown Village
Northern Liberties
Spring Garden
Graduate Hospital
Bella Vista, Pennsport, Queen Village, Washington Square West
East Passyunk


In [16]:
print(philly_venues.shape)
philly_venues.head()

(663, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Logan Square,39.946212,-75.165018,Kimmel Center for the Performing Arts,39.946785,-75.165234,Concert Hall
1,Logan Square,39.946212,-75.165018,Writer's Block Rehab,39.94645,-75.163602,Cocktail Bar
2,Logan Square,39.946212,-75.165018,Verizon Hall,39.946598,-75.165976,Concert Hall
3,Logan Square,39.946212,-75.165018,Vetri,39.946761,-75.163183,Italian Restaurant
4,Logan Square,39.946212,-75.165018,Sweet Box Cupcakes & Bake Shop,39.945457,-75.162746,Cupcake Shop


In [17]:
## Exploratory analysis of each neighborhood
philly_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bella Vista, Pennsport, Queen Village, Washington Square West",89,89,89,89,89,89
"Callow Hill, Chinatown, Market East, Midtown Village",93,93,93,93,93,93
East Passyunk,73,73,73,73,73,73
"Fairmount, Rittenhouse Square",78,78,78,78,78,78
Graduate Hospital,33,33,33,33,33,33
Logan Square,100,100,100,100,100,100
Northern Liberties,23,23,23,23,23,23
"Old City, Society Hill",100,100,100,100,100,100
"Powelton Village, Spruce Hill and Cedar Park, University City",26,26,26,26,26,26
Spring Garden,48,48,48,48,48,48


In [18]:
print('There are {} uniques categories.'.format(len(philly_venues['Venue Category'].unique())))

There are 177 uniques categories.


In [19]:
## Obtaining a listing of unique venue categories
philly_venues['Venue Category'].unique()

array(['Concert Hall', 'Cocktail Bar', 'Italian Restaurant',
       'Cupcake Shop', 'Sushi Restaurant', 'Theater',
       'Mexican Restaurant', 'Opera House', 'Gym', 'Coffee Shop',
       'Pizza Place', 'Greek Restaurant', 'Bakery',
       'Comfort Food Restaurant', 'Gastropub', 'Cycle Studio',
       'Gift Shop', 'Ice Cream Shop', 'Breakfast Spot', 'Bar',
       'Southern / Soul Food Restaurant', 'Beer Garden', 'Wine Bar',
       'Gay Bar', 'Vegetarian / Vegan Restaurant', 'Hotel',
       'Indian Restaurant', 'Bookstore', 'Steakhouse', 'Dive Bar',
       'Dessert Shop', 'Hookah Bar', 'Tapas Restaurant', 'Jewelry Store',
       'Restaurant', 'American Restaurant', 'New American Restaurant',
       "Men's Store", 'Falafel Restaurant', 'Sports Club', 'Optical Shop',
       'Accessories Store', 'Smoke Shop', 'Deli / Bodega',
       'Clothing Store', 'Cuban Restaurant', 'Asian Restaurant',
       'General Entertainment', 'Shoe Store', 'Convenience Store',
       'Church', 'Thai Restaurant'

We notice from the above unique list of venue categories that there are a diverse set of bars, restaurants and stores in the neighborhoods. However, there isn't any data available
on universities. Therefore, let us create a dataframe with a count of the number of the feature variables that will help predict the result. 
The feature variables we will be using from the venue category list are Indian restaurants, Yoga studios, Bus/Train/Metro stop

In [20]:
## Filtering the dataset by venue categories in order to obtain the desired feature variables
philly_venues_filtered = philly_venues[philly_venues['Venue Category'].str.contains('Indian|Yoga|Station|Bus|Train|Metro',case=False)].reset_index(drop=True)
philly_venues_filtered

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Logan Square,39.946212,-75.165018,Indeblue,39.948479,-75.162219,Indian Restaurant
1,Logan Square,39.946212,-75.165018,Masala Kitchen : Kati Rolls & Platters,39.94899,-75.16102,Indian Restaurant
2,Logan Square,39.946212,-75.165018,Dhyana Yoga,39.949882,-75.16799,Yoga Studio
3,"Fairmount, Rittenhouse Square",39.95587,-75.17183,Veda - Modern Indian Bistro,39.951967,-75.172862,Indian Restaurant
4,"Powelton Village, Spruce Hill and Cedar Park, ...",39.949252,-75.209377,Studio 34,39.948916,-75.213546,Yoga Studio
5,"Powelton Village, Spruce Hill and Cedar Park, ...",39.949252,-75.209377,Desi Village Indian Restaurant,39.948981,-75.214205,Indian Restaurant
6,"Old City, Society Hill",39.9475,-75.146213,Karma Restaurant & Bar,39.94998,-75.145614,Indian Restaurant
7,"Callow Hill, Chinatown, Market East, Midtown V...",39.947072,-75.154648,The Yoga Garden,39.946714,-75.154714,Yoga Studio
8,Northern Liberties,39.948016,-75.223406,SEPTA Bus Stop # 24630 / South 48th Street & W...,39.948432,-75.219183,Bus Station
9,Spring Garden,39.96431,-75.166057,17th & Spring Garden 2 Bus Stop,39.962984,-75.166375,Bus Station


In [21]:
## Group the venue categories as Indian Restaurant, Yoga Studio and Public Transport. Therefore, combine all the public transport options

philly_venues_filtered.loc[philly_venues_filtered['Venue Category'].str.contains('Bus|Train|Metro',case=False), 'Venue Category'] = 'Public Transport'

In [22]:
philly_venues_filtered

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Logan Square,39.946212,-75.165018,Indeblue,39.948479,-75.162219,Indian Restaurant
1,Logan Square,39.946212,-75.165018,Masala Kitchen : Kati Rolls & Platters,39.94899,-75.16102,Indian Restaurant
2,Logan Square,39.946212,-75.165018,Dhyana Yoga,39.949882,-75.16799,Yoga Studio
3,"Fairmount, Rittenhouse Square",39.95587,-75.17183,Veda - Modern Indian Bistro,39.951967,-75.172862,Indian Restaurant
4,"Powelton Village, Spruce Hill and Cedar Park, ...",39.949252,-75.209377,Studio 34,39.948916,-75.213546,Yoga Studio
5,"Powelton Village, Spruce Hill and Cedar Park, ...",39.949252,-75.209377,Desi Village Indian Restaurant,39.948981,-75.214205,Indian Restaurant
6,"Old City, Society Hill",39.9475,-75.146213,Karma Restaurant & Bar,39.94998,-75.145614,Indian Restaurant
7,"Callow Hill, Chinatown, Market East, Midtown V...",39.947072,-75.154648,The Yoga Garden,39.946714,-75.154714,Yoga Studio
8,Northern Liberties,39.948016,-75.223406,SEPTA Bus Stop # 24630 / South 48th Street & W...,39.948432,-75.219183,Public Transport
9,Spring Garden,39.96431,-75.166057,17th & Spring Garden 2 Bus Stop,39.962984,-75.166375,Public Transport


##### Preparing the data set based on the selected Venue categories

In [23]:
# one hot encoding
Philly_onehot = pd.get_dummies(philly_venues_filtered[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Philly_onehot['Neighborhood'] = philly_venues_filtered['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Philly_onehot.columns[-1]] + list(Philly_onehot.columns[:-1])
Philly_onehot = Philly_onehot[fixed_columns].groupby(['Neighborhood']).sum().reset_index()

Philly_Foursquare = Philly_onehot.groupby(['Neighborhood']).sum().reset_index()
Philly_Foursquare

Unnamed: 0,Neighborhood,Indian Restaurant,Public Transport,Yoga Studio
0,"Callow Hill, Chinatown, Market East, Midtown V...",0,0,1
1,East Passyunk,0,0,1
2,"Fairmount, Rittenhouse Square",1,0,0
3,Logan Square,2,0,1
4,Northern Liberties,0,1,0
5,"Old City, Society Hill",1,0,0
6,"Powelton Village, Spruce Hill and Cedar Park, ...",1,0,1
7,Spring Garden,0,1,0


## Adding other feature data such as Population, Crime rate and Number of universities for the above neighborhoods
Source: Based on manual search online

In [40]:
data = {'Neighborhood':['Callow Hill, Chinatown, Market East, Midtown Village',
'East Passyunk',
'Fairmount, Rittenhouse Square',
'Graduate Hospital',
'Logan Square',
'Northern Liberties',
'Old City, Society Hill',
'Powelton Village, Spruce Hill and Cedar Park, University City',
'Spring Garden'
], 

'Population':[24248,
46532,
11140,
13163,
12232,
34112,
19092,
20421,
16203
], 
'Number of universities':[4,
12,
5,
2,
6,
3,
7,
15,
3
],
'Crime rate':[
0.41,
0.92,
1.26,
0.61,
0.92,
0.65,
0.23,
1.56,
0.60
] } 

In [41]:
df_other_features = pd.DataFrame(data) 
df_other_features

Unnamed: 0,Neighborhood,Population,Number of universities,Crime rate
0,"Callow Hill, Chinatown, Market East, Midtown V...",24248,4,0.41
1,East Passyunk,46532,12,0.92
2,"Fairmount, Rittenhouse Square",11140,5,1.26
3,Graduate Hospital,13163,2,0.61
4,Logan Square,12232,6,0.92
5,Northern Liberties,34112,3,0.65
6,"Old City, Society Hill",19092,7,0.23
7,"Powelton Village, Spruce Hill and Cedar Park, ...",20421,15,1.56
8,Spring Garden,16203,3,0.6


In [43]:
Philly_neighborhood_data = pd.merge(Philly_Foursquare, df_other_features, how = 'left', left_on = 'Neighborhood', right_on = 'Neighborhood')
Philly_neighborhood_data

Unnamed: 0,Neighborhood,Indian Restaurant,Public Transport,Yoga Studio,Population,Number of universities,Crime rate
0,"Callow Hill, Chinatown, Market East, Midtown V...",0,0,1,24248,4,0.41
1,East Passyunk,0,0,1,46532,12,0.92
2,"Fairmount, Rittenhouse Square",1,0,0,11140,5,1.26
3,Logan Square,2,0,1,12232,6,0.92
4,Northern Liberties,0,1,0,34112,3,0.65
5,"Old City, Society Hill",1,0,0,19092,7,0.23
6,"Powelton Village, Spruce Hill and Cedar Park, ...",1,0,1,20421,15,1.56
7,Spring Garden,0,1,0,16203,3,0.6


## Now, we are ready to use Machine Learning techniques to identify the best neighborhood for an Indian restaurant

##### Normalizing the data

In [44]:
from sklearn.preprocessing import StandardScaler
import numpy as np

X = Philly_neighborhood_data.values[:,1:]
X = np.nan_to_num(X)
Clus_dataSet = StandardScaler().fit_transform(X)
Clus_dataSet



array([[-0.89802651, -0.57735027,  1.        ,  0.11165171, -0.70019195,
        -0.99330632],
       [-0.89802651, -0.57735027,  1.        ,  2.10129322,  1.24816826,
         0.24604835],
       [ 0.53881591, -0.57735027, -1.        , -1.05870464, -0.45664692,
         1.0722848 ],
       [ 1.97565832, -0.57735027,  1.        , -0.96120471, -0.2131019 ,
         0.24604835],
       [-0.89802651,  1.73205081, -1.        ,  0.9923654 , -0.94373698,
        -0.41008059],
       [ 0.53881591, -0.57735027, -1.        , -0.34870512,  0.03044313,
        -1.43072561],
       [ 0.53881591, -0.57735027,  1.        , -0.23004449,  1.97880334,
         1.80131696],
       [-0.89802651,  1.73205081, -1.        , -0.60665137, -0.94373698,
        -0.53158595]])

<h2 id="modeling">Modeling</h2>

Lets apply k-means on our dataset, and take look at cluster labels.

In [45]:
## Performing the modeling on the data
clusterNum = 3 # initialized
k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12)
k_means.fit(X)
labels = k_means.labels_
print(labels)

[2 1 0 0 1 2 2 0]


In [47]:
## Create a new column with cluster labels in the dataframe
Philly_neighborhood_data["Clus"] = labels

In [48]:
Philly_neighborhood_data

Unnamed: 0,Neighborhood,Indian Restaurant,Public Transport,Yoga Studio,Population,Number of universities,Crime rate,Clus
0,"Callow Hill, Chinatown, Market East, Midtown V...",0,0,1,24248,4,0.41,2
1,East Passyunk,0,0,1,46532,12,0.92,1
2,"Fairmount, Rittenhouse Square",1,0,0,11140,5,1.26,0
3,Logan Square,2,0,1,12232,6,0.92,0
4,Northern Liberties,0,1,0,34112,3,0.65,1
5,"Old City, Society Hill",1,0,0,19092,7,0.23,2
6,"Powelton Village, Spruce Hill and Cedar Park, ...",1,0,1,20421,15,1.56,2
7,Spring Garden,0,1,0,16203,3,0.6,0


In [62]:
## Centroid values for each cluster
Philly_neighborhood_data.groupby('Clus').mean()

Unnamed: 0_level_0,Indian Restaurant,Public Transport,Yoga Studio,Population,Number of universities,Crime rate
Clus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.0,0.333333,0.0,12178.333333,4.333333,0.93
1,0.0,1.0,0.5,40322.0,7.5,0.785
2,0.666667,0.0,0.666667,21253.666667,8.666667,0.733333


## Insight

Based on the above data, we can conclude the following about each cluster:

Cluster 0: The least number of universities, highest crime rate and the most number of indian restaurants. This easily makes it the weakest choice for an Indian restaurant.

Cluster 1: Best public transport options, highest population, and a high number of universities. However, the most population in the cluster and also no other Indian restaurants           i.e. no competition, makes it the best choice for an Indian restaurant

Cluster 2: No public transport, but the highest number of universities and the least crime rate makes this an alternative option.

In [None]:
## Let us append the latitude and longitude data to visualize the clusters on a map

In [49]:
Philly_clustered_data = pd.merge(Philly_neighborhood_data, df_merged_fin, how = 'left', left_on = 'Neighborhood', right_on = 'Neighborhood')
Philly_clustered_data

Unnamed: 0,Neighborhood,Indian Restaurant,Public Transport,Yoga Studio,Population,Number of universities,Crime rate,Clus,Zipcode,Latitude,Longitude
0,"Callow Hill, Chinatown, Market East, Midtown V...",0,0,1,24248,4,0.41,2,19107,39.947072,-75.154648
1,East Passyunk,0,0,1,46532,12,0.92,1,19148,39.926922,-75.167059
2,"Fairmount, Rittenhouse Square",1,0,0,11140,5,1.26,0,19103,39.95587,-75.17183
3,Logan Square,2,0,1,12232,6,0.92,0,19102,39.946212,-75.165018
4,Northern Liberties,0,1,0,34112,3,0.65,1,19123,39.948016,-75.223406
5,"Old City, Society Hill",1,0,0,19092,7,0.23,2,19106,39.9475,-75.146213
6,"Powelton Village, Spruce Hill and Cedar Park, ...",1,0,1,20421,15,1.56,2,19104,39.949252,-75.209377
7,Spring Garden,0,1,0,16203,3,0.6,0,19130,39.96431,-75.166057


In [50]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)
kclusters = 3

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Philly_clustered_data['Latitude'], Philly_clustered_data['Longitude'], Philly_clustered_data['Neighborhood'], Philly_clustered_data['Clus']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters