# In [22]:
# Created on Jan 29, 2020
# Author: Alex Mottus
import pandas as pd # library to process data as dataframes
import numpy as np # library to handle data in a vectorized manner

# Parse every HTML table on the Wikipedia page of Canadian "M" postal codes.
df_boroughes = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
# read_html returns a list of dataframes; work with the first one
df_borough = df_boroughes[0]
# Remove records which have a 'Borough' value of 'Not assigned'
df_borough.drop(df_borough[df_borough['Borough'] == 'Not assigned'].index, inplace=True)
# Where the Neighbourhood is 'Not assigned', fall back to the Borough value.
# The row yielded by iterrows() is a copy, so assigning to it never reaches
# the dataframe; write the replacement back through a boolean mask instead.
unassigned = df_borough['Neighbourhood'] == 'Not assigned'
df_borough.loc[unassigned, 'Neighbourhood'] = df_borough.loc[unassigned, 'Borough']
# Collapse records sharing a Postcode and Borough into one row whose
# Neighbourhood is the comma-delimited list of the merged values
df_b = df_borough.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(','.join).reset_index()
# Rename the postal code column
df_b.rename(columns={'Postcode': 'Postalcode'}, inplace=True)

#print(df_b.shape)

# Read in geospatial data (one row per postal code)
df_data = pd.read_csv('http://cocl.us/Geospatial_data')
#print(df_data.head())
# Merge the two dataframes on their matching postal code fields
df_final = df_b.merge(df_data, how='inner', left_on='Postalcode', right_on='Postal Code')
# Remove the duplicate postal code column brought in by the merge
del df_final['Postal Code']
# Display top 10 records

#df_final.head(10)

# Map Data
#!pip install folium
import folium # plotting library
from sklearn.cluster import KMeans
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Toronto Central 43.6552° N, 79.3641° W
latitude = 43.6552
longitude = -79.3641
# generate map centred on Toronto
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12)
# keep only the rows whose Borough name contains 'Toronto'
tor_data = df_final[df_final['Borough'].str.contains('Toronto')].reset_index(drop=True)

# add one circle marker per row, with the Borough name as its popup
for lat, lng, label in zip(tor_data['Latitude'], tor_data['Longitude'], tor_data['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,  # was misspelled 'poup', so no popup was attached
        fill=True,
        color='blue',
        fill_color='#3186cc',  # hex colours need the leading '#'
        fill_opacity=0.6,
    ).add_to(toronto_map)

#tor_data.head()
toronto_map

# Foursquare credentials and API version used by every request below
CLIENT_ID = ''  # your Foursquare ID
CLIENT_SECRET = ''  # your Foursquare Secret
VERSION = '20180605'  # Foursquare API version

# Search settings
radius = 500  # search radius passed to the API
LIMIT = 100  # limit of number of venues returned by Foursquare API

# Use the first row of tor_data as a sample location
neighborhood_latitude = tor_data.loc[0, 'Latitude']
neighborhood_longitude = tor_data.loc[0, 'Longitude']
neighborhood_name = tor_data.loc[0, 'Borough']

# Values are substituted into the URL template in this exact order
_explore_args = (
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT,
)
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(*_explore_args)

# Send the request and decode the JSON body
results = requests.get(url).json()

# function that collects the venues Foursquare reports around each location
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Return a dataframe of the venues found around each given location.

    For every (name, latitude, longitude) triple, one request is sent to the
    Foursquare ``venues/explore`` endpoint, using the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` values.

    Args:
        names: Iterable of labels, one per location; stored in the
            'Borough' column of the result.
        latitudes: Iterable of latitudes, parallel to ``names``.
        longitudes: Iterable of longitudes, parallel to ``names``.
        radius: Search radius passed to the API (default 500).

    Returns:
        A pandas DataFrame with one row per venue and the columns
        'Borough', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The dataframe is empty (but still has these columns) when no venue
        is found. 'Venue Category' is None for a venue without categories.

    Raises:
        KeyError, IndexError: If a response does not contain the expected
            ``response -> groups[0] -> items`` structure.
    """
    column_names = ['Borough',  # replace neighborhood with Borough
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        items = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry an empty category list
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the result valid even
    # when no venue was collected; assigning 7 names to an empty, column-less
    # dataframe afterwards raises a length-mismatch ValueError.
    return pd.DataFrame(venue_rows, columns=column_names)
    
# Collect the venues around every location in tor_data
tor_venues = getNearbyVenues(
    names=tor_data['Borough'],
    latitudes=tor_data['Latitude'],
    longitudes=tor_data['Longitude'],
)

# One-hot encode the venue category (one indicator column per category)
tor_onehot = pd.get_dummies(tor_venues[['Venue Category']], prefix="", prefix_sep="")
# Attach the Borough of each venue to the encoded rows
tor_onehot['Borough'] = tor_venues['Borough']
# Reorder so that the last column comes first
_last_column = tor_onehot.columns[-1]
_other_columns = tor_onehot.columns[:-1]
fixed_columns = [_last_column, *_other_columns]
tor_onehot = tor_onehot[fixed_columns]
#tor_onehot.head()
# Average the indicator columns per Borough
_borough_means = tor_onehot.groupby('Borough').mean()
tor_grouped = _borough_means.reset_index()

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted
    by value in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

# ordinal suffixes for the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in range(num_top_venues):
    # An explicit bounds check replaces the former bare `except:`, which
    # would also have hidden any unrelated error raised while formatting.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe with one row per Borough in tor_grouped
tor_venues_sorted = pd.DataFrame(columns=columns)
tor_venues_sorted['Borough'] = tor_grouped['Borough']
# fill every column after 'Borough' with that row's top venue categories
for ind in range(tor_grouped.shape[0]):
    tor_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)
#tor_venues_sorted.head()

# set number of clusters
kclusters = 4
# Cluster on the numeric columns only. `columns=` is used because pandas 2.x
# no longer accepts the axis as a second positional argument to drop().
tor_grouped_clustering = tor_grouped.drop(columns='Borough')
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)
# check cluster labels generated for each row in the dataframe
#kmeans.labels_[0:10]

# add clustering labels as the first column
tor_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
# Join the cluster label and top-venue columns onto tor_data by Borough.
# join() returns a new dataframe, so tor_data itself is left unchanged.
tor_merged = tor_data.join(tor_venues_sorted.set_index('Borough'), on='Borough')
#tor_merged.head() # check the last columns!

# create map
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Borough'], tor_merged['Cluster Labels']):
    # A row without a cluster label holds NaN, which also turns the whole
    # column into floats; floats cannot index the colour list, so skip the
    # unlabeled rows and convert the rest back to int.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 keeps the original colour assignment (label 0 wraps around
    # to the last colour in the list)
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

