### Collect imports

In [142]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
print("Imports complete")

Imports complete


### Scrape the neighborhood table from Wikipedia using Beautiful Soup

In [143]:
# Download the Wikipedia page and locate the neighborhood table.
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get("https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Miami", timeout=30)
# Stop on an HTTP error instead of parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# The string passed as the second argument is matched against the table's CSS class.
table = soup.find('table', 'wikitable sortable')
if table is None:
    raise ValueError("No 'wikitable sortable' table found; the page layout may have changed")

# html.parser does not synthesise a <tbody>; fall back to the table itself
# when the markup has none so the following cells can still find the rows.
tbody = table.find('tbody')
if tbody is None:
    tbody = table


In [144]:

# Column names are the text of the <th> cells in the first table row,
# with embedded newlines removed.
all_rows = tbody.find_all('tr')
header_row = all_rows[0]
headers = header_row.find_all('th')
cols = [header.get_text().replace('\n', '') for header in headers]

# going to skip first row which are th-es
num_rows = len(all_rows) - 1
print("num_rows: ", num_rows)

# Pre-allocate an empty frame: one row per data row, one column per header.
df = pd.DataFrame(index=range(num_rows), columns=cols)
df.shape


num_rows:  26


(26, 6)

### Pull out the actual table data into the dataframe

In [145]:
# Copy the text of every <td> cell into the pre-allocated frame.
# Table row k is written to frame row k, so frame row 0 (the slot that
# lines up with the header row) is left empty.
# NOTE(review): rows with index >= num_rows are skipped, so the final <tr>
# of the table is never copied -- confirm that row is not a neighborhood.
for row_marker, row in enumerate(table.find_all('tr')):
    if row_marker == 0 or row_marker >= num_rows:
        continue
    for column_marker, column in enumerate(row.find_all('td')):
        df.iat[row_marker, column_marker] = column.get_text()
df.head()

Unnamed: 0,Neighborhood,Demonym,Population2010,Population/Km²,Sub-neighborhoods,Coordinates
0,,,,,,
1,Allapattah,,54289.0,4401.0,,"25.815,-80.224\n"
2,Arts & Entertainment District,,11033.0,7948.0,,"25.799,-80.190\n"
3,Brickell,Brickellite,31759.0,14541.0,West Brickell,"25.758,-80.193\n"
4,Buena Vista,,9058.0,3540.0,Buena Vista East Historic District and Design ...,"25.813,-80.192\n"


### Extract the neighborhood names and the coordinates
### Split the coordinate field into latitude and longitude

In [146]:
# Split "lat,lng" at the first comma into two columns (0 = latitude, 1 = longitude)
# and attach both to the frame.
new = df['Coordinates'].str.split(',', n=1, expand=True)
df = df.assign(Latitude=new[0], Longitude=new[1])

In [147]:
# Remove the columns at positions 1-5, keeping position 0 and everything after 5.
drop_cols = [1, 2, 3, 4, 5]
df = df.drop(columns=df.columns[drop_cols])

In [148]:
# Discard the first row of the frame (the empty slot left by the header row).
df = df.drop(index=df.index[0])

In [149]:
# Strip the trailing newline from the longitude text.
# Assign the result back rather than calling replace(..., inplace=True) on the
# column selection: the chained in-place form warns in pandas 2.x and leaves
# the frame unchanged when copy-on-write is enabled.
df['Longitude'] = df['Longitude'].replace('\\n', '', regex=True)

# Treat the literal string 'None' as missing, then keep only complete rows.
df = df.replace(to_replace='None', value=np.nan).dropna()
df = df.reset_index(drop=True)
df.shape


(24, 3)

In [150]:
# Inspect the column dtypes before any numeric conversion.
df.dtypes

Neighborhood    object
Latitude        object
Longitude       object
dtype: object

In [151]:
# Convert only the coordinate columns to numbers.
# The previous form applied pd.to_numeric to every column with errors='ignore'
# so the text column would be skipped; that option is deprecated in recent
# pandas and would also hide a malformed coordinate. Converting the two
# columns explicitly raises if a coordinate cannot be parsed.
df = df.assign(
    Latitude=pd.to_numeric(df['Latitude']),
    Longitude=pd.to_numeric(df['Longitude']),
)
print(df.dtypes)
df.head()

Neighborhood     object
Latitude        float64
Longitude       float64
dtype: object


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Allapattah,25.815,-80.224
1,Arts & Entertainment District,25.799,-80.19
2,Brickell,25.758,-80.193
3,Buena Vista,25.813,-80.192
4,Coconut Grove,25.712,-80.257


### Display the neighborhoods on a map of Miami

In [152]:
import json
import requests
# json_normalize has lived at the top level of pandas since 1.0 and the
# pandas.io.json location is deprecated; fall back to it only on older pandas.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 
import random
print("Imports complete")

Imports complete


In [153]:
# Build a map centred on Miami and add one marker per neighborhood,
# with the neighborhood name as the marker's popup text.
Miami_latitude = 25.7617
Miami_longitude = -80.1918
miami_center = [Miami_latitude, Miami_longitude]
miami_map = folium.Map(location=miami_center, zoom_start=14)
tooltip = "Click me"  # defined here but not passed to any marker in this cell
marker_fields = df[['Latitude', 'Longitude', 'Neighborhood']]
for lat, lng, neighborhood in marker_fields.itertuples(index=False, name=None):
    label = '{}'.format(neighborhood)
    marker = folium.Marker([lat, lng], popup=label)
    marker.add_to(miami_map)

miami_map

### Get location data for Boys Town in Chicago using the address of the KitKat club.

In [154]:
from geopy.geocoders import Nominatim

In [155]:
import time

from geopy.exc import GeocoderServiceError

address = "3700 N Halsted St Chicago IL"
geolocator = Nominatim(user_agent="my_explorer")

# Retry a bounded number of times with a pause between attempts.
# The previous unbounded loop re-sent the request immediately and would
# spin forever if the address could not be resolved.
MAX_GEOCODE_ATTEMPTS = 5
GEOCODE_RETRY_PAUSE_SECONDS = 2

kitkat = None
for attempt in range(MAX_GEOCODE_ATTEMPTS):
    try:
        kitkat = geolocator.geocode(address)
    except GeocoderServiceError:
        # Timeouts and service-unavailable errors are treated as a failed attempt.
        kitkat = None
    if kitkat is not None:
        break
    if attempt < MAX_GEOCODE_ATTEMPTS - 1:
        time.sleep(GEOCODE_RETRY_PAUSE_SECONDS)

if kitkat is None:
    raise RuntimeError(
        "Could not geocode {!r} after {} attempts".format(address, MAX_GEOCODE_ATTEMPTS)
    )

latitude = kitkat.latitude
longitude = kitkat.longitude
print('The geograpical coordinate of KitKat Club are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of KitKat Club are 41.9493618, -87.6498987.


### Append KitKat club to the dataframe

In [156]:
# Add the KitKat location as one more "neighborhood" row.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any KitKat row left by a previous run of this cell is filtered out first,
# which keeps the cell safe to re-run.
cols = ['Neighborhood', 'Latitude', 'Longitude']
new_df = pd.DataFrame([['KitKat', latitude, longitude]], columns=cols)
df = pd.concat([df[df['Neighborhood'] != 'KitKat'], new_df], ignore_index=True)
df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Allapattah,25.815,-80.224
1,Arts & Entertainment District,25.799,-80.19
2,Brickell,25.758,-80.193
3,Buena Vista,25.813,-80.192
4,Coconut Grove,25.712,-80.257
5,Coral Way,25.75,-80.283
6,Design District,25.813,-80.193
7,Downtown,25.774,-80.193
8,Edgewater,25.802,-80.19
9,Flagami,25.762,-80.316


### Gather up the venues for these neighborhoods

In [157]:
import os

# Foursquare credentials are read from the environment so they are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError naming it.
# NOTE: the key pair that was previously committed in this cell should be
# treated as leaked and rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # sent as the `limit` query parameter of each venue request

In [158]:
# Taken from Manhattan notebook
def getNearbyVenues(names, latitudes, longitudes, radius=1000, timeout=10):
    """Fetch venues around each location from the Foursquare venues/explore endpoint.

    One request is made per (name, latitude, longitude) triple. The module-level
    CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT values are sent with every request.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Location label and coordinates; iterated together with zip().
    radius : int, optional
        Value sent as the API's `radius` parameter.
    timeout : float, optional
        Seconds to wait for each HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when no venues are found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params,
                                timeout=timeout)
        # Fail here on an error status rather than with a KeyError on the payload.
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # A venue can come back without any category; record it as missing.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor keeps the schema even with zero rows.
    nearby_venues = pd.DataFrame(rows, columns=venue_columns)

    return(nearby_venues)

### This is the code that fetches venues for all the neighborhoods

In [159]:
# Fetch the venues around every row of df (one API request per row),
# then show the size of the result and its first rows.
miami_venues = getNearbyVenues(df['Neighborhood'], df['Latitude'], df['Longitude'])
print(miami_venues.shape)
miami_venues.head()

Allapattah
Arts & Entertainment District
Brickell
Buena Vista
Coconut Grove
Coral Way
Design District
Downtown
Edgewater
Flagami
Grapeland Heights
Liberty City
Little Haiti
Little Havana
Lummus Park
Midtown
Overtown
Park West
The Roads
Upper Eastside
Venetian Islands
Virginia Key
West Flagler
Wynwood
KitKat
(1494, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Allapattah,25.815,-80.224,Club Tipico Dominicano,25.809557,-80.218593,Nightclub
1,Allapattah,25.815,-80.224,Family Dollar,25.807093,-80.223627,Discount Store
2,Allapattah,25.815,-80.224,Little Caesars,25.809315,-80.22424,Pizza Place
3,Allapattah,25.815,-80.224,Redbox,25.807651,-80.225859,Video Store
4,Allapattah,25.815,-80.224,Winn-Dixie,25.808179,-80.224911,Grocery Store


In [160]:
# Number of distinct venue categories across all fetched venues.
n_categories = len(miami_venues['Venue Category'].unique())
print('There are {} unique categories.'.format(n_categories))

There are 213 unique categories.


### Process the returned data to be "one-hot" on the categories

In [161]:
# One indicator column per venue category; the empty prefix and separator
# leave the column names equal to the category names.
category_dummies = pd.get_dummies(miami_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach each venue's neighborhood so the rows can be grouped afterwards.
# NOTE(review): a venue category literally named 'Neighborhood' would be
# overwritten by this column -- confirm none is present in the data.
miami_onehot = category_dummies.assign(Neighborhood=miami_venues['Neighborhood'])


In [162]:
# Keep only the first level of the column index, then average the indicator
# columns per neighborhood: each value is the share of that neighborhood's
# venues falling in the category.
miami_onehot.columns = miami_onehot.columns.get_level_values(0)
category_means = miami_onehot.groupby('Neighborhood').mean()
miami_grouped = category_means.reset_index()
miami_grouped

Unnamed: 0,Neighborhood,ATM,Airport,Airport Service,American Restaurant,Aquarium,Arcade,Arepa Restaurant,Argentinian Restaurant,Art Gallery,...,Tree,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Video Store,Waterfront,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Allapattah,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0
1,Arts & Entertainment District,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.11,...,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
2,Brickell,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.03
3,Buena Vista,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.02,0.05,...,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0
4,Coconut Grove,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Coral Way,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Design District,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.02,0.06,...,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0
7,Downtown,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01
8,Edgewater,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.01,0.08,...,0.0,0.01,0.0,0.0,0.0,0.01,0.02,0.0,0.0,0.0
9,Flagami,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create a pandas dataframe of the top five venue categories for each neighborhood, ranked by frequency

In [163]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of `row` is skipped; the remaining entries are sorted
    in descending order and the index labels of the first `num_top_venues`
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [164]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
# The first three ranks take their suffix from `indicators`, every later rank
# takes 'th'. An explicit bounds check replaces the former bare `except:`,
# which would have hidden any unrelated error as well.
# NOTE: ranks above 20 still get 'th' (e.g. '21th'), exactly as before.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = miami_grouped['Neighborhood']

# fill each neighborhood's row with its most frequent venue categories
for ind in range(miami_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(miami_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Allapattah,Park,Food & Drink Shop,Grocery Store,Discount Store,Latin American Restaurant
1,Arts & Entertainment District,Art Gallery,Ice Cream Shop,Restaurant,Bar,Peruvian Restaurant
2,Brickell,Hotel,Italian Restaurant,Pizza Place,Bar,Restaurant
3,Buena Vista,Italian Restaurant,Coffee Shop,Art Gallery,Pizza Place,Café
4,Coconut Grove,Park,Boat or Ferry,Cosmetics Shop,Garden,Trail
5,Coral Way,Park,Grocery Store,Café,Tennis Court,IT Services
6,Design District,Art Gallery,Italian Restaurant,Coffee Shop,Furniture / Home Store,Café
7,Downtown,Hotel,Italian Restaurant,Seafood Restaurant,Cocktail Bar,Coffee Shop
8,Edgewater,Art Gallery,Ice Cream Shop,Coffee Shop,Beer Garden,Pizza Place
9,Flagami,Liquor Store,Bakery,Seafood Restaurant,Fast Food Restaurant,Cuban Restaurant


In [166]:
kclusters = 5

# Cluster on the category frequencies only. The axis is given by keyword:
# the positional form drop('Neighborhood', 1) is rejected by pandas 2.0+.
miami_grouped_clustering = miami_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is set explicitly because its default differs between scikit-learn
# releases; 10 restarts matches the long-standing default.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(miami_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 1, 1, 1, 3, 1, 1, 1, 1, 1], dtype=int32)

In [167]:
# Put the cluster label of each neighborhood in the first column.
# DataFrame.insert raises if the column already exists, so on a re-run of
# this cell the existing column is overwritten instead.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# df1 is another name for the same frame, not a copy.
df1 = neighborhoods_venues_sorted
df1.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,3,Allapattah,Park,Food & Drink Shop,Grocery Store,Discount Store,Latin American Restaurant
1,1,Arts & Entertainment District,Art Gallery,Ice Cream Shop,Restaurant,Bar,Peruvian Restaurant
2,1,Brickell,Hotel,Italian Restaurant,Pizza Place,Bar,Restaurant
3,1,Buena Vista,Italian Restaurant,Coffee Shop,Art Gallery,Pizza Place,Café
4,3,Coconut Grove,Park,Boat or Ferry,Cosmetics Shop,Garden,Trail


### Check out the cluster to see which one has KitKat

In [169]:
# Rows assigned to cluster 0, showing every column except the label column.
df1.loc[df1['Cluster Labels'].eq(0), df1.columns[1:]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
12,Liberty City,Sandwich Place,Donut Shop,Seafood Restaurant,Food,Park


In [170]:
# Rows assigned to cluster 1, showing every column except the label column.
df1.loc[df1['Cluster Labels'].eq(1), df1.columns[1:]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,Arts & Entertainment District,Art Gallery,Ice Cream Shop,Restaurant,Bar,Peruvian Restaurant
2,Brickell,Hotel,Italian Restaurant,Pizza Place,Bar,Restaurant
3,Buena Vista,Italian Restaurant,Coffee Shop,Art Gallery,Pizza Place,Café
5,Coral Way,Park,Grocery Store,Café,Tennis Court,IT Services
6,Design District,Art Gallery,Italian Restaurant,Coffee Shop,Furniture / Home Store,Café
7,Downtown,Hotel,Italian Restaurant,Seafood Restaurant,Cocktail Bar,Coffee Shop
8,Edgewater,Art Gallery,Ice Cream Shop,Coffee Shop,Beer Garden,Pizza Place
9,Flagami,Liquor Store,Bakery,Seafood Restaurant,Fast Food Restaurant,Cuban Restaurant
10,Grapeland Heights,Rental Car Location,Bus Station,Hotel Pool,Hotel,Train Station
11,KitKat,Gay Bar,General Entertainment,Mexican Restaurant,Coffee Shop,Sports Bar


In [171]:
# Rows assigned to cluster 2, showing every column except the label column.
df1.loc[df1['Cluster Labels'].eq(2), df1.columns[1:]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
21,Venetian Islands,Cruise,Park,Island,Yoga Studio,Food


In [172]:
# Rows assigned to cluster 3, showing every column except the label column.
df1.loc[df1['Cluster Labels'].eq(3), df1.columns[1:]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Allapattah,Park,Food & Drink Shop,Grocery Store,Discount Store,Latin American Restaurant
4,Coconut Grove,Park,Boat or Ferry,Cosmetics Shop,Garden,Trail


In [173]:
# Rows assigned to cluster 4, showing every column except the label column.
df1.loc[df1['Cluster Labels'].eq(4), df1.columns[1:]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
22,Virginia Key,Park,Moving Target,Food,Dive Bar,Cafeteria
