# Project Title: Find a place to open a restaurant

## Data:
* Latitude, longitude, and estimated population of 2015 data - Download csv data from United States Zip Codes.org
* Location and surrounding data - From Foursquare

## Criteria:
* Only Maryland, US data
* Sort by highest estimated population in 2015 from IRS


In [161]:
# Import the libraries we need in the project
# Import requests #Library to handle requests

import pandas as pd #Library for data analysis
import numpy as np #Library to handle data in a vectorized manner
import json
import random #Library for random number generation

import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0
import folium as folium
#from folium.plugins import MarkerCluster

# Libraries for displaying images
from IPython.display import Image
from IPython.core.display import HTML

print('Libraries imported.')


Libraries imported.


In [162]:
# Use pandas read_cvs to read the postal data
postalData = pd.read_csv('zip_code_database.csv')
postalData.head()

Unnamed: 0,zip,type,decommissioned,primary_city,acceptable_cities,unacceptable_cities,state,county,timezone,area_codes,world_region,country,latitude,longitude,irs_estimated_population_2015
0,501,UNIQUE,0,Holtsville,,I R S Service Center,NY,Suffolk County,America/New_York,631,,US,40.81,-73.04,562
1,544,UNIQUE,0,Holtsville,,Irs Service Center,NY,Suffolk County,America/New_York,631,,US,40.81,-73.04,0
2,601,STANDARD,0,Adjuntas,,"Colinas Del Gigante, Jard De Adjuntas, Urb San...",PR,Adjuntas Municipio,America/Puerto_Rico,787939,,US,18.16,-66.72,0
3,602,STANDARD,0,Aguada,,"Alts De Aguada, Bo Guaniquilla, Comunidad Las ...",PR,Aguada Municipio,America/Puerto_Rico,787939,,US,18.38,-67.18,0
4,603,STANDARD,0,Aguadilla,Ramey,"Bda Caban, Bda Esteves, Bo Borinquen, Bo Ceiba...",PR,Aguadilla Municipio,America/Puerto_Rico,787,,US,18.43,-67.15,0


In [163]:
# Now, clean up the data and keep only the column and data we need. The target area is Baltimore County, Maryland, USA.
MDData = postalData.drop(columns=['decommissioned', 'unacceptable_cities', 'timezone', 'world_region'])
MDData.sort_values(by='irs_estimated_population_2015', ascending=False)

Unnamed: 0,zip,type,primary_city,acceptable_cities,state,county,area_codes,country,latitude,longitude,irs_estimated_population_2015
26692,60629,STANDARD,Chicago,Bedford Park,IL,Cook County,312773872,US,41.78,-87.71,114420
4118,11220,STANDARD,Brooklyn,,NY,Kings County,718,US,40.64,-74.02,111430
34023,77449,STANDARD,Katy,Park Row,TX,Harris County,281346832,US,29.84,-95.73,109280
3135,8701,STANDARD,Lakewood,,NJ,Ocean County,732848908,US,40.09,-74.21,105330
34065,77494,STANDARD,Katy,Park Row,TX,Fort Bend County,281832,US,29.74,-95.83,104450
...,...,...,...,...,...,...,...,...,...,...,...
10029,24042,UNIQUE,Roanoke,,VA,Roanoke City,540,US,37.27,-79.95,0
10030,24043,UNIQUE,Roanoke,,VA,Roanoke City,540,US,37.27,-79.95,0
10031,24044,UNIQUE,Roanoke,,VA,Roanoke City,540,US,37.27,-79.94,0
10032,24045,UNIQUE,Roanoke,,VA,Roanoke City,540,US,37.27,-79.94,0


In [164]:
# Clean up the data and filter to Baltimore County, Maryland, USA. And the population is above 20,000

MDData = MDData[((MDData['type'] == 'UNIQUE') | (MDData['type'] == 'STANDARD')) & (MDData['state'] == 'MD')]  
#MDData = MDData[(MDData['state'] == 'MD') & (MDData['irs_estimated_population_2015'] > 20000)]
MDData.sort_values(by='irs_estimated_population_2015', ascending=False, inplace=True)
MDData.groupby(['county', 'primary_city'])
MDData

Unnamed: 0,zip,type,primary_city,acceptable_cities,state,county,area_codes,country,latitude,longitude,irs_estimated_population_2015
8791,20906,STANDARD,Silver Spring,Aspen Hill,MD,Montgomery County,301240,US,39.09,-77.06,68290
8769,20878,STANDARD,Gaithersburg,"Darnestown, N Potomac, No Potomac, North Potomac",MD,Montgomery County,240301,US,39.12,-77.25,62930
8929,21234,STANDARD,Parkville,Baltimore,MD,Baltimore County,410443,US,39.38,-76.55,62620
8765,20874,STANDARD,Germantown,Darnestown,MD,Montgomery County,240,US,39.17,-77.26,59300
8873,21122,STANDARD,Pasadena,"Lake Shore, Millersville, Riviera Beach",MD,Anne Arundel County,410443,US,39.11,-76.55,57620
...,...,...,...,...,...,...,...,...,...,...,...
9079,21709,UNIQUE,Frederick,,MD,Frederick County,240,US,39.42,-77.41,0
8945,21268,UNIQUE,Baltimore,,MD,Baltimore City,410,US,39.21,-76.72,0
8947,21273,UNIQUE,Baltimore,,MD,Baltimore City,410,US,39.30,-76.61,0
9073,21690,UNIQUE,Chestertown,,MD,Kent County,410,US,39.21,-76.07,0


In [165]:
MDData.shape

(481, 11)

In [166]:
# Catagorize the color based on the population. If population is equal to or greater than 60000, then pop color is red.
#  If population is equal to or greater than 40000, then pop color is yellow. Otherwise, pop color is blue.

def popColor(MDData):
    if MDData['irs_estimated_population_2015'] >= 60000:
        return 'red'
    elif MDData['irs_estimated_population_2015'] >= 40000:
        return 'yellow'
    else:
        return 'blue'

MDData['PopColor'] = MDData.apply(popColor, axis=1)

In [167]:
# Use MD data with pop color equal to red or yellow for the rest of the analysis.
MDDataPop = MDData[(MDData['PopColor'] == 'red') | (MDData['PopColor'] == 'yellow')].reset_index(drop=True)

MDDataPop.head()


Unnamed: 0,zip,type,primary_city,acceptable_cities,state,county,area_codes,country,latitude,longitude,irs_estimated_population_2015,PopColor
0,20906,STANDARD,Silver Spring,Aspen Hill,MD,Montgomery County,301240,US,39.09,-77.06,68290,red
1,20878,STANDARD,Gaithersburg,"Darnestown, N Potomac, No Potomac, North Potomac",MD,Montgomery County,240301,US,39.12,-77.25,62930,red
2,21234,STANDARD,Parkville,Baltimore,MD,Baltimore County,410443,US,39.38,-76.55,62620,red
3,20874,STANDARD,Germantown,Darnestown,MD,Montgomery County,240,US,39.17,-77.26,59300,yellow
4,21122,STANDARD,Pasadena,"Lake Shore, Millersville, Riviera Beach",MD,Anne Arundel County,410443,US,39.11,-76.55,57620,yellow


In [168]:
# Create map of Maryland State with zip code, city, County, and 2015 estimated population
map_MD = folium.Map(location=[39.09, -77.06], zoom_start=10)

for lat, lng, city, county, population, popColor in zip(MDDataPop['latitude'], MDDataPop['longitude'], MDDataPop['primary_city'], MDDataPop['county'], MDDataPop['irs_estimated_population_2015'], MDDataPop['PopColor']):
        label = 'City: {}, County: {}, Population: {}, PopColor: {}'.format(city, county, population, popColor)
        label = folium.Popup(label, parse_html=True)
 
        folium.CircleMarker(
            location=[lat, lng],
            radius=5,
            popup=label,
            #icon=folium.Icon(color='yellow',icon_color='green',icon='cloud')
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False
        ).add_to(map_MD)

map_MD

In [169]:
#Define Foursquare Credentials and Version

CLIENT_ID = '2DNKE3IIDD0KWZVLRXDAFBYBCAXFOIL00NVVFUCZPD1DCC2I' # your Foursquare ID
CLIENT_SECRET = '004DNJPNQTBGZOJCJJN4GEERXGFQXFN5HHRAI2WN0JV50AJW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

#print('Your credentails:')
#print('CLIENT_ID: ' + CLIENT_ID)
#print('CLIENT_SECRET:' + CLIENT_SECRET)


In [170]:
# Sort the data set by population in decending order.
MDDataPop.sort_values(by='irs_estimated_population_2015', ascending=False, inplace=True)
#print('MDDataPop: ', MDDataPop)

# For some reason the first location has few information, so we use the second record instead.
neig_latitude = MDDataPop.loc[1, 'latitude']
neig_longitude = MDDataPop.loc[1, 'longitude']
neig_name = MDDataPop.loc[1, 'primary_city']

print('Latitude and longitude values of {} are {}, {}.'.format(neig_name, neig_latitude, neig_longitude))

LIMIT = 400
radius = 500
# This is the section in Foursquare for food category
section = 'food'  

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&section={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION, neig_latitude, neig_longitude, radius, LIMIT, section)

url

Latitude and longitude values of Gaithersburg are 39.12, -77.25.


'https://api.foursquare.com/v2/venues/explore?client_id=2DNKE3IIDD0KWZVLRXDAFBYBCAXFOIL00NVVFUCZPD1DCC2I&client_secret=004DNJPNQTBGZOJCJJN4GEERXGFQXFN5HHRAI2WN0JV50AJW&v=20180605&ll=39.12,-77.25&radius=500&limit=400&section=food'

In [171]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f52a8a87008ac04a8d5581d'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': '$-$$$$', 'key': 'price'},
    {'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Gaithersburg',
  'headerFullLocation': 'Gaithersburg',
  'headerLocationGranularity': 'city',
  'query': 'food',
  'totalResults': 14,
  'suggestedBounds': {'ne': {'lat': 39.1245000045, 'lng': -77.24421055591462},
   'sw': {'lat': 39.115499995499995, 'lng': -77.25578944408538}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '561eb25e498e2b15b3ba680f',
       'name': 'Inferno Pizzeria Napoletana',
       'location': {'address': '12207 Darnestown Rd',
        'lat': 39.118798999999996,
        'lng': -77.25384,
        'labeledLatLn

In [172]:
#Create function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [173]:
#Ready to clean the json and structure it into a pandas dataframe.
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) #flatten JSON

#Filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng', 'venue.location.postalCode', 'venue.location.city']
nearby_venues = nearby_venues.loc[:, filtered_columns]

#Filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

#Clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng,postalCode,city
0,Inferno Pizzeria Napoletana,Pizza Place,39.118799,-77.25384,20878,Gaithersburg
1,Neal's Bagels,Bagel Shop,39.118376,-77.251199,20878,Gaithersburg
2,California Tortilla,Fast Food Restaurant,39.118467,-77.253787,20878,Gaithersburg
3,Pho City,Vietnamese Restaurant,39.118137,-77.250913,20878,Gaithersburg
4,Papa John's Pizza,Pizza Place,39.116583,-77.252072,20878,North Potomac


In [174]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

14 venues were returned by Foursquare.


In [175]:
#Create a function to repeat the above same process to all the neighbourhoods in Maryland

def getNearbyVenues(names, latitudes, longitudes, radius=500):

    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        #Create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&section={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            neig_latitude, 
            neig_longitude, 
            radius, 
            LIMIT,
            section)
        
        #Make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        #Return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['location']['postalCode'],
            v['venue']['location']['city'],           
            v['venue']['categories'][0]['name']) for v in results])
        
        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue ZipCode',               
                  'Venue City',                   
                  'Venue Category']
    
    return(nearby_venues)

In [176]:
MD_venues = getNearbyVenues(names=MDDataPop['primary_city'], 
                            latitudes=MDDataPop['latitude'], 
                            longitudes=MDDataPop['longitude']
                           )

Silver Spring
Gaithersburg
Parkville
Germantown
Pasadena
Silver Spring
Silver Spring
Hagerstown
Owings Mills
Glen Burnie
Fort Washington
Dundalk
Hyattsville
Potomac
Baltimore
Baltimore
Upper Marlboro
Ellicott City
Catonsville
Baltimore
Rockville
Lanham
Upper Marlboro
Gwynn Oak
Frederick
Ellicott City
Columbia


In [182]:
#Check the size of ther resulting dataframe and the top 20 records of MD_venues
print(MD_venues.shape)
MD_venues.head(20)

(378, 9)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue ZipCode,Venue City,Venue Category
0,Silver Spring,39.09,-77.06,Inferno Pizzeria Napoletana,39.118799,-77.25384,20878,Gaithersburg,Pizza Place
1,Silver Spring,39.09,-77.06,Neal's Bagels,39.118376,-77.251199,20878,Gaithersburg,Bagel Shop
2,Silver Spring,39.09,-77.06,California Tortilla,39.118467,-77.253787,20878,Gaithersburg,Fast Food Restaurant
3,Silver Spring,39.09,-77.06,Pho City,39.118137,-77.250913,20878,Gaithersburg,Vietnamese Restaurant
4,Silver Spring,39.09,-77.06,Papa John's Pizza,39.116583,-77.252072,20878,North Potomac,Pizza Place
5,Silver Spring,39.09,-77.06,Subway - Potomac Valley,39.118271,-77.251659,20878,Gaithersburg,Sandwich Place
6,Silver Spring,39.09,-77.06,Wendy’s,39.11672,-77.251243,20878,Gaithersburg,Fast Food Restaurant
7,Silver Spring,39.09,-77.06,Dunkin',39.116769,-77.252564,20878,Gaithersburg,Donut Shop
8,Silver Spring,39.09,-77.06,Siriwan Thai,39.118804,-77.251216,20878,Gaithersburg,Comfort Food Restaurant
9,Silver Spring,39.09,-77.06,Spring Mill Bread Co.,39.120392,-77.250106,20878,Gaithersburg,Bakery


In [183]:
# Show how many venues were returned for each neighbourhood
MD_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue ZipCode,Venue City,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Baltimore,42,42,42,42,42,42,42,42
Catonsville,14,14,14,14,14,14,14,14
Columbia,14,14,14,14,14,14,14,14
Dundalk,14,14,14,14,14,14,14,14
Ellicott City,28,28,28,28,28,28,28,28
Fort Washington,14,14,14,14,14,14,14,14
Frederick,14,14,14,14,14,14,14,14
Gaithersburg,14,14,14,14,14,14,14,14
Germantown,14,14,14,14,14,14,14,14
Glen Burnie,14,14,14,14,14,14,14,14


In [184]:
#Show how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(MD_venues['Venue Category'].unique())))

There are 11 uniques categories.


In [185]:
# One hot encoding
MD_onehot = pd.get_dummies(MD_venues[['Venue Category']], prefix="", prefix_sep="")
MD_onehot  #378 rows X 11 columns

Unnamed: 0,Bagel Shop,Bakery,Chinese Restaurant,Comfort Food Restaurant,Donut Shop,Fast Food Restaurant,Italian Restaurant,Pizza Place,Sandwich Place,Thai Restaurant,Vietnamese Restaurant
0,0,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
373,0,1,0,0,0,0,0,0,0,0,0
374,0,0,0,0,0,0,0,0,0,1,0
375,0,0,1,0,0,0,0,0,0,0,0
376,0,0,0,0,0,0,1,0,0,0,0


In [187]:
# Add neighbourhood column back to the new dataframe, MD_onehot
MD_onehot['Neighbourhood'] = MD_venues['Neighbourhood']

MD_onehot.columns.get_loc("Neighbourhood")

11

In [188]:
# Have a variable mid to contain the column, MD_onehot['Neighbourhood'] 
mid = MD_onehot['Neighbourhood']
mid

0      Silver Spring
1      Silver Spring
2      Silver Spring
3      Silver Spring
4      Silver Spring
           ...      
373         Columbia
374         Columbia
375         Columbia
376         Columbia
377         Columbia
Name: Neighbourhood, Length: 378, dtype: object

In [189]:
# Move neighbourhood column to the first column
MD_onehot.drop(labels=['Neighbourhood'], axis=1, inplace=True)
MD_onehot.insert(0, "Neighbourhood", mid)
MD_onehot.head()

Unnamed: 0,Neighbourhood,Bagel Shop,Bakery,Chinese Restaurant,Comfort Food Restaurant,Donut Shop,Fast Food Restaurant,Italian Restaurant,Pizza Place,Sandwich Place,Thai Restaurant,Vietnamese Restaurant
0,Silver Spring,0,0,0,0,0,0,0,1,0,0,0
1,Silver Spring,1,0,0,0,0,0,0,0,0,0,0
2,Silver Spring,0,0,0,0,0,1,0,0,0,0,0
3,Silver Spring,0,0,0,0,0,0,0,0,0,0,1
4,Silver Spring,0,0,0,0,0,0,0,1,0,0,0


In [190]:
MD_onehot.shape

(378, 12)

In [192]:
# Let's group rows by neighbourhood and by taking the mean of the frequency of occurrence of each category
MD_grouped = MD_onehot.groupby('Neighbourhood').mean().reset_index()
MD_grouped

Unnamed: 0,Neighbourhood,Bagel Shop,Bakery,Chinese Restaurant,Comfort Food Restaurant,Donut Shop,Fast Food Restaurant,Italian Restaurant,Pizza Place,Sandwich Place,Thai Restaurant,Vietnamese Restaurant
0,Baltimore,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
1,Catonsville,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
2,Columbia,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
3,Dundalk,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
4,Ellicott City,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
5,Fort Washington,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
6,Frederick,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
7,Gaithersburg,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
8,Germantown,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
9,Glen Burnie,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429


In [193]:
# Confirm the new size
print('MD_grouped.shape: ', MD_grouped.shape)

tempCol = MD_grouped.columns.T
print('tempCol: ', tempCol)
print('tempCol.shape: ', tempCol.shape)
tempCol.columns = ['venue', 'freq']

print('tempCol: ', tempCol)


MD_grouped.shape:  (21, 12)
tempCol:  Index(['Neighbourhood', 'Bagel Shop', 'Bakery', 'Chinese Restaurant',
       'Comfort Food Restaurant', 'Donut Shop', 'Fast Food Restaurant',
       'Italian Restaurant', 'Pizza Place', 'Sandwich Place',
       'Thai Restaurant', 'Vietnamese Restaurant'],
      dtype='object')
tempCol.shape:  (12,)
tempCol:  Index(['Neighbourhood', 'Bagel Shop', 'Bakery', 'Chinese Restaurant',
       'Comfort Food Restaurant', 'Donut Shop', 'Fast Food Restaurant',
       'Italian Restaurant', 'Pizza Place', 'Sandwich Place',
       'Thai Restaurant', 'Vietnamese Restaurant'],
      dtype='object')


Index(['Neighbourhood', 'Bagel Shop', 'Bakery', 'Chinese Restaurant',
       'Comfort Food Restaurant', 'Donut Shop', 'Fast Food Restaurant',
       'Italian Restaurant', 'Pizza Place', 'Sandwich Place',
       'Thai Restaurant', 'Vietnamese Restaurant'],
      dtype='object')

In [199]:
# Create function to sort the restaurant in descending order
def return_most_common_restaurant(row, num_top_restaurant):
    row_restaurant = row.iloc[1:]
    row_restaurant_sorted = row_restaurant.sort_values(ascending=False)
    
    return row_restaurant_sorted.index.values[0:num_top_restaurant]

In [201]:
# Let's create the new dataframe and display the top 10 restaurant for each neighbourhood
num_top_restaurant = 10

indicators = ['st', 'nd', 'rd']

# Create columns according to number of top restaurant
columns = ['Neighbourhood']
for ind in np.arange(num_top_restaurant):
    try:
        columns.append('{}{} Most Common Restaurant'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Restaurant'.format(ind+1))
        
# Create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = MD_grouped['Neighbourhood']

for ind in np.arange(MD_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_restaurant(MD_grouped.iloc[ind, :], num_top_restaurant)
    
neighbourhoods_venues_sorted.head(20)
    


Unnamed: 0,Neighbourhood,1st Most Common Restaurant,2nd Most Common Restaurant,3rd Most Common Restaurant,4th Most Common Restaurant,5th Most Common Restaurant,6th Most Common Restaurant,7th Most Common Restaurant,8th Most Common Restaurant,9th Most Common Restaurant,10th Most Common Restaurant
0,Baltimore,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
1,Catonsville,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
2,Columbia,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
3,Dundalk,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
4,Ellicott City,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
5,Fort Washington,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
6,Frederick,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
7,Gaithersburg,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
8,Germantown,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
9,Glen Burnie,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery


In [202]:
MD_grouped.head()

Unnamed: 0,Neighbourhood,Bagel Shop,Bakery,Chinese Restaurant,Comfort Food Restaurant,Donut Shop,Fast Food Restaurant,Italian Restaurant,Pizza Place,Sandwich Place,Thai Restaurant,Vietnamese Restaurant
0,Baltimore,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
1,Catonsville,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
2,Columbia,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
3,Dundalk,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429
4,Ellicott City,0.071429,0.071429,0.071429,0.071429,0.071429,0.214286,0.071429,0.142857,0.071429,0.071429,0.071429


In [226]:
# Run k-means clustering

# Set number of clusters
kclusters = 7

MD_grouped_clu = MD_grouped.drop('Neighbourhood', 1)
#print('MD_grouped_clu: ', MD_grouped_clu)

# Run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=100).fit(MD_grouped_clu)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [228]:
# Create a new dataframe that includes the cluster as well as the top 10 restaurant
neighbourhoods_venues_backup = neighbourhoods_venues_sorted
neighbourhoods_venues_sorted = neighbourhoods_venues_sorted.drop('Cluster Labels', 1)

# Add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [229]:
neighbourhoods_venues_sorted.head(10)

Unnamed: 0,Cluster Labels,Neighbourhood,1st Most Common Restaurant,2nd Most Common Restaurant,3rd Most Common Restaurant,4th Most Common Restaurant,5th Most Common Restaurant,6th Most Common Restaurant,7th Most Common Restaurant,8th Most Common Restaurant,9th Most Common Restaurant,10th Most Common Restaurant
0,0,Baltimore,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
1,0,Catonsville,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
2,0,Columbia,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
3,0,Dundalk,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
4,0,Ellicott City,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
5,0,Fort Washington,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
6,0,Frederick,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
7,0,Gaithersburg,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
8,0,Germantown,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
9,0,Glen Burnie,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery


In [230]:
MDDataPop.head()

Unnamed: 0,zip,type,primary_city,acceptable_cities,state,county,area_codes,country,latitude,longitude,irs_estimated_population_2015,PopColor
0,20906,STANDARD,Silver Spring,Aspen Hill,MD,Montgomery County,301240,US,39.09,-77.06,68290,red
1,20878,STANDARD,Gaithersburg,"Darnestown, N Potomac, No Potomac, North Potomac",MD,Montgomery County,240301,US,39.12,-77.25,62930,red
2,21234,STANDARD,Parkville,Baltimore,MD,Baltimore County,410443,US,39.38,-76.55,62620,red
3,20874,STANDARD,Germantown,Darnestown,MD,Montgomery County,240,US,39.17,-77.26,59300,yellow
4,21122,STANDARD,Pasadena,"Lake Shore, Millersville, Riviera Beach",MD,Anne Arundel County,410443,US,39.11,-76.55,57620,yellow


In [231]:
MD_merged = MDDataPop

# Merage MD grouped with MDDataPop to add latitude/longitude for the neighbourhood with high population
MD_merged = MD_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='primary_city')

MD_merged.head(10)

Unnamed: 0,zip,type,primary_city,acceptable_cities,state,county,area_codes,country,latitude,longitude,...,1st Most Common Restaurant,2nd Most Common Restaurant,3rd Most Common Restaurant,4th Most Common Restaurant,5th Most Common Restaurant,6th Most Common Restaurant,7th Most Common Restaurant,8th Most Common Restaurant,9th Most Common Restaurant,10th Most Common Restaurant
0,20906,STANDARD,Silver Spring,Aspen Hill,MD,Montgomery County,301240,US,39.09,-77.06,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
1,20878,STANDARD,Gaithersburg,"Darnestown, N Potomac, No Potomac, North Potomac",MD,Montgomery County,240301,US,39.12,-77.25,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
2,21234,STANDARD,Parkville,Baltimore,MD,Baltimore County,410443,US,39.38,-76.55,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
3,20874,STANDARD,Germantown,Darnestown,MD,Montgomery County,240,US,39.17,-77.26,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
4,21122,STANDARD,Pasadena,"Lake Shore, Millersville, Riviera Beach",MD,Anne Arundel County,410443,US,39.11,-76.55,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
5,20904,STANDARD,Silver Spring,Colesville,MD,Montgomery County,301,US,39.07,-76.98,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
6,20902,STANDARD,Silver Spring,Wheaton,MD,Montgomery County,240301,US,39.05,-77.04,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
7,21740,STANDARD,Hagerstown,,MD,Washington County,240301,US,39.63,-77.71,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
8,21117,STANDARD,Owings Mills,Garrison,MD,Baltimore County,410,US,39.41,-76.79,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
9,21061,STANDARD,Glen Burnie,,MD,Anne Arundel County,410443,US,39.16,-76.63,...,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery


In [232]:
# Drop the type, acceptable_cities, area_codes, and country column
MD_merged = MD_merged.drop(columns=['type', 'acceptable_cities', 'area_codes', 'country'])
MD_merged.head()

Unnamed: 0,zip,primary_city,state,county,latitude,longitude,irs_estimated_population_2015,PopColor,Cluster Labels,1st Most Common Restaurant,2nd Most Common Restaurant,3rd Most Common Restaurant,4th Most Common Restaurant,5th Most Common Restaurant,6th Most Common Restaurant,7th Most Common Restaurant,8th Most Common Restaurant,9th Most Common Restaurant,10th Most Common Restaurant
0,20906,Silver Spring,MD,Montgomery County,39.09,-77.06,68290,red,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
1,20878,Gaithersburg,MD,Montgomery County,39.12,-77.25,62930,red,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
2,21234,Parkville,MD,Baltimore County,39.38,-76.55,62620,red,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
3,20874,Germantown,MD,Montgomery County,39.17,-77.26,59300,yellow,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
4,21122,Pasadena,MD,Anne Arundel County,39.11,-76.55,57620,yellow,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery


In [233]:
# Change cluster labels in MD_merged from float to int
MD_merged['Cluster Labels'] = MD_merged['Cluster Labels'].astype(int)

# Visualize the resulting clusters
map_clusters = folium.Map(location=[39.09, -77.06], zoom_start=11)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster, popNum in zip(MD_merged['latitude'],  MD_merged['longitude'], MD_merged['primary_city'], MD_merged['Cluster Labels'], MD_merged['irs_estimated_population_2015']):
    label = folium.Popup(str(poi) + ' Cluster: ' + str(cluster) + ' Population: ' + str(popNum), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters

In [234]:
#Examine clusters

#Cluster 1
MD_merged.loc[MD_merged['Cluster Labels'] == 0, MD_merged.columns[[1] + list(range(5, MD_merged.shape[1]))]]

Unnamed: 0,primary_city,longitude,irs_estimated_population_2015,PopColor,Cluster Labels,1st Most Common Restaurant,2nd Most Common Restaurant,3rd Most Common Restaurant,4th Most Common Restaurant,5th Most Common Restaurant,6th Most Common Restaurant,7th Most Common Restaurant,8th Most Common Restaurant,9th Most Common Restaurant,10th Most Common Restaurant
0,Silver Spring,-77.06,68290,red,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
1,Gaithersburg,-77.25,62930,red,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
2,Parkville,-76.55,62620,red,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
3,Germantown,-77.26,59300,yellow,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
4,Pasadena,-76.55,57620,yellow,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
5,Silver Spring,-76.98,55730,yellow,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
6,Silver Spring,-77.04,52380,yellow,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
7,Hagerstown,-77.71,52380,yellow,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
8,Owings Mills,-76.79,52350,yellow,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery
9,Glen Burnie,-76.63,51090,yellow,0,Fast Food Restaurant,Pizza Place,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Italian Restaurant,Donut Shop,Comfort Food Restaurant,Chinese Restaurant,Bakery


In [235]:
#Cluster 1
MD_merged.loc[MD_merged['Cluster Labels'] == 1, MD_merged.columns[[1] + list(range(5, MD_merged.shape[1]))]]

Unnamed: 0,primary_city,longitude,irs_estimated_population_2015,PopColor,Cluster Labels,1st Most Common Restaurant,2nd Most Common Restaurant,3rd Most Common Restaurant,4th Most Common Restaurant,5th Most Common Restaurant,6th Most Common Restaurant,7th Most Common Restaurant,8th Most Common Restaurant,9th Most Common Restaurant,10th Most Common Restaurant
