In [213]:
#!pip install bs4
#!pip install requests
#!pip install lxml
#!pip install geocoder
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab

In [245]:
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
from requests import get
from sklearn.cluster import KMeans

In [184]:
# Download the Wikipedia page that lists the postal codes starting with "M".
response = get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
# Stop here on an HTTP error instead of parsing an error page in the cells below.
response.raise_for_status()
pagina_web = response.content
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the tree could differ between machines.
sopa = bs(pagina_web, 'html.parser')

In [185]:
# Look up a <table> element with class "wikitable" in the parsed page.
# NOTE(review): html_tabla is not referenced again in the visible cells — confirm it is still needed.
html_tabla = sopa.find('table', attrs={'class': 'wikitable'})

In [186]:
# Parse the HTML tables of the downloaded page into a list of DataFrames.
dfs = pd.read_html(pagina_web)

In [187]:
# Keep the first parsed table; the following cells read its 'Borough' column.
tabla_municipios = dfs[0]

In [188]:
# Index labels of the rows whose borough is the placeholder 'Not assigned' ...
indexNames = tabla_municipios.index[tabla_municipios['Borough'] == 'Not assigned']
# ... and a new table with exactly those rows removed.
tabla_municipios_sin_cosas = tabla_municipios.drop(index=indexNames)

In [189]:
# Preview the first rows of the filtered table, then its (rows, columns) shape.
print(tabla_municipios_sin_cosas.head())
print(tabla_municipios_sin_cosas.shape)

  Postcode           Borough     Neighbourhood
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
5      M6A        North York  Lawrence Heights
6      M6A        North York    Lawrence Manor
(210, 3)


In [190]:
# Rows that share both postcode and borough belong to the same group.
grupos = tabla_municipios_sin_cosas.groupby(['Postcode', 'Borough'])

# Collapse each group's neighbourhood names into one comma-separated string.
salida = grupos['Neighbourhood'].apply(lambda nombres: ', '.join(nombres))


In [191]:
# Turn the grouped Series (indexed by postcode and borough) back into a flat
# three-column table and report its shape.
df_agrupado = salida.reset_index()
df_agrupado.columns = ['Postcode', 'Borough', 'Neighbourhood']
print(df_agrupado.shape)

(103, 3)


In [192]:
#type(salida)
def asignar_vecindario(row):
    """Return the neighbourhood of ``row``, falling back to its borough.

    ``row`` is any mapping-like object with 'Neighbourhood' and 'Borough'
    entries. When the neighbourhood is the placeholder 'Not assigned', the
    borough is returned instead.
    """
    vecindario = row['Neighbourhood']
    return row['Borough'] if vecindario == 'Not assigned' else vecindario
    
# Resolve the neighbourhood of every row (borough used as fallback).
columna_nueva = df_agrupado.apply(asignar_vecindario, axis='columns')

# Work on a copy so df_agrupado keeps its original 'Neighbourhood' values.
df_final = df_agrupado.copy()
df_final['Neighbourhood'] = columna_nueva.values


In [193]:
# Rows labelled 'Not assigned' before the replacement ...
print(df_agrupado[df_agrupado['Neighbourhood'] == 'Not assigned'])
# ... the same filter after the replacement (expected to be empty) ...
print(df_final[df_final['Neighbourhood'] == 'Not assigned'])
# ... and the row for postcode M9A in the final table.
print(df_final[df_final['Postcode'] == 'M9A'])

   Postcode       Borough Neighbourhood
93      M9A  Queen's Park  Not assigned
Empty DataFrame
Columns: [Postcode, Borough, Neighbourhood]
Index: []
   Postcode       Borough Neighbourhood
93      M9A  Queen's Park  Queen's Park


In [194]:
# Show the (rows, columns) shape of the final table.
df_final.shape

(103, 3)

In [195]:
# Load the coordinates table; obtener_coordenadas looks rows up by its
# 'Postal Code' column and reads the values at positions 1 and 2.
# NOTE(review): the URL is plain http — confirm whether an https source is available.
csv_coords = pd.read_csv('http://cocl.us/Geospatial_data')



In [196]:
def obtener_coordenadas(postal_code):
    """Return the values at positions 1 and 2 of the matching ``csv_coords`` row.

    The row is found by comparing the 'Postal Code' column with
    ``postal_code``; only the first match is used. Callers unpack the result
    as (latitude, longitude). An IndexError is raised when nothing matches.
    """
    coincidencias = csv_coords.loc[csv_coords['Postal Code'] == postal_code]
    primera_fila = coincidencias.values[0]
    return primera_fila[1], primera_fila[2]


def obtener_coordenadas_row(row):
    """Look up the coordinates for the 'Postcode' entry of ``row``."""
    codigo = row['Postcode']
    return obtener_coordenadas(codigo)


In [197]:
# Collect one latitude and one longitude per postcode, in table order.
lats, lons = [], []
for postal_code in df_final['Postcode']:
    lat, lon = obtener_coordenadas(postal_code)
    lats.append(lat)
    lons.append(lon)


In [198]:
# Attach the coordinates by position. lats/lons were built in df_final's row
# order; wrapping them in pd.Series would make assign() align on index labels
# and silently produce NaN for any row whose label is not 0..n-1.
df_coords = df_final.assign(Latitude=lats, Longitude=lons)

In [199]:
# Spot-check the coordinates attached to postcode M5G.
df_coords[df_coords['Postcode'] == 'M5G']

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [200]:
# Preview the first rows of the table with coordinates.
df_coords.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [202]:
# Foursquare credentials come from the environment so they are never stored
# in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. Credentials that were previously committed in this cell
# should be treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    # Make the problem visible here rather than as an opaque API error later.
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail.')

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself: saved cell output is shared with the notebook.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

Your credentails:
CLIENT_ID: [REDACTED]
CLIENT_SECRET:[REDACTED]


In [307]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One entry per location; they are walked in parallel.
    radius : number, default 500
        Sent unchanged as the ``radius`` query parameter.
    limit : int, default 100
        Sent unchanged as the ``limit`` query parameter. It replaces the
        module-level ``LIMIT`` this function used to read, which no visible
        cell defines; 100 matches the largest per-neighbourhood venue count in
        the recorded output.
        NOTE(review): if ``LIMIT`` was defined elsewhere with another value,
        pass it explicitly.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status.
    """
    columnas = ['Neighborhood',
                'Neighborhood Latitude',
                'Neighborhood Longitude',
                'Venue',
                'Venue Latitude',
                'Venue Longitude',
                'Venue Category']

    filas = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; surface HTTP errors (bad credentials, quota)
        # here instead of as a KeyError while indexing the JSON below
        respuesta = get('https://api.foursquare.com/v2/venues/explore', params=params)
        respuesta.raise_for_status()
        results = respuesta.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categorias = venue['categories']
            filas.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories gets None instead of raising IndexError
                categorias[0]['name'] if categorias else None))

    # Passing the columns to the constructor keeps the schema for empty results.
    return pd.DataFrame(filas, columns=columnas)

In [218]:
# Coordinates used as the centre of the maps in this notebook.
latitude = 43.653908
longitude = -79.384293

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one marker per row of df_coords
for lat, lng, borough, neighborhood in zip(df_coords['Latitude'],
                                           df_coords['Longitude'],
                                           df_coords['Borough'],
                                           df_coords['Neighbourhood']):
    # parse_html=True makes the popup treat the label as text, so names such
    # as "Queen's Park" are rendered safely.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html is a Popup argument, not a marker option, so it is no longer
    # passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [308]:
# Fetch the venues around every neighbourhood of df_coords (radius 500).
lugares_toronto = getNearbyVenues(
    df_coords['Neighbourhood'],
    df_coords['Latitude'],
    df_coords['Longitude'],
    radius=500,
)

In [309]:
# Preview the first venues that were returned.
lugares_toronto.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge, Malvern",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place


In [300]:
# Count the non-null entries of every column per neighbourhood.
lugares_toronto.groupby('Neighborhood').count()


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",2,2,2,2,2,2
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",11,11,11,11,11,11
"Alderwood, Long Branch",9,9,9,9,9,9
...,...,...,...,...,...,...
Willowdale West,6,6,6,6,6,6
Woburn,3,3,3,3,3,3
"Woodbine Gardens, Parkview Hill",12,12,12,12,12,12
Woodbine Heights,9,9,9,9,9,9


In [310]:
# Number of distinct venue categories (a missing value counts as one, as with len(unique())).
n_categorias = lugares_toronto['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(n_categorias))

There are 271 uniques categories.


In [311]:
# One indicator column per venue category, named after the category itself.
toronto_onehot = pd.get_dummies(lugares_toronto[['Venue Category']], prefix="", prefix_sep="")

# Put the neighbourhood name next to the indicators.
# NOTE(review): a venue category literally called 'Neighborhood' would be
# overwritten by this assignment — confirm no such category exists in the data.
toronto_onehot['Neighborhood'] = lugares_toronto['Neighborhood']

# Reorder so that 'Neighborhood' is the first column.
columnas = [columna for columna in toronto_onehot.columns if columna != 'Neighborhood']
fixed_columns = ['Neighborhood'] + columnas
toronto_onehot = toronto_onehot[fixed_columns]

# Mean of each indicator per neighbourhood, i.e. the share of its venues in each category.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [312]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first entry of ``row`` is skipped (callers pass rows whose first
    entry is the neighbourhood name); the remaining entries are ranked from
    largest to smallest and the index labels of the leading ones are
    returned as an array.
    """
    ranking = row.iloc[1:].sort_values(ascending=False)
    return ranking.index.values[:num_top_venues]

In [313]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # Ranks 1-3 take 'st'/'nd'/'rd', every later rank takes 'th'. Picking the
    # suffix explicitly replaces a bare ``except`` that would also have hidden
    # unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# one row per neighbourhood: its name followed by its top venue categories
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:])
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'].values)

# set number of clusters
kclusters = 2

# Keyword form: the positional axis argument (``drop('Neighborhood', 1)``) is
# rejected by pandas 2.x.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state for repeatable labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the cluster label and top venues to every row of df_coords, matching on
# the neighbourhood name. Rows with any missing value after the join (for
# example neighbourhoods without venues) are dropped; df_coords is not modified.
toronto_merged = df_coords.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood'
).dropna()

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    # The join turns the labels into floats; show them as integers ("Cluster 0").
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster - 1 is kept from the original: cluster 0 wraps to the last colour
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [315]:
# Rows of cluster 0, showing column 1 (Borough) plus every column from position 5 on
# (cluster label and ranked venue categories).
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,0.0,Fast Food Restaurant,Print Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Deli / Bodega
1,Scarborough,0.0,Moving Target,Bar,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop,Department Store
2,Scarborough,0.0,Electronics Store,Mexican Restaurant,Pizza Place,Breakfast Spot,Rental Car Location,Medical Center,Intersection,Dim Sum Restaurant,Diner,Discount Store
3,Scarborough,0.0,Coffee Shop,Korean Restaurant,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Dumpling Restaurant
4,Scarborough,0.0,Hakka Restaurant,Gas Station,Bank,Fried Chicken Joint,Thai Restaurant,Athletics & Sports,Caribbean Restaurant,Bakery,Donut Shop,Doner Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...
97,North York,0.0,Baseball Field,Construction & Landscaping,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
99,Etobicoke,0.0,Chinese Restaurant,Pizza Place,Sandwich Place,Coffee Shop,Intersection,Middle Eastern Restaurant,Discount Store,Yoga Studio,Dim Sum Restaurant,Diner
100,Etobicoke,0.0,Pizza Place,Mobile Phone Shop,Sandwich Place,Bus Line,Yoga Studio,Dog Run,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant
101,Etobicoke,0.0,Grocery Store,Beer Store,Pharmacy,Pizza Place,Liquor Store,Fried Chicken Joint,Fast Food Restaurant,Sandwich Place,Japanese Restaurant,Discount Store


In [316]:
# Rows of cluster 1, showing column 1 (Borough) plus every column from position 5 on
# (cluster label and ranked venue categories).
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Scarborough,1.0,Playground,Park,Yoga Studio,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
23,North York,1.0,Convenience Store,Park,Bank,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Yoga Studio
25,North York,1.0,Bus Stop,Park,Food & Drink Shop,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
30,North York,1.0,Park,Airport,Bus Stop,Snack Place,Yoga Studio,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant
31,North York,1.0,Park,Bank,Hotel,Shopping Mall,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run
40,East York,1.0,Park,Convenience Store,Coffee Shop,Yoga Studio,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
44,Central Toronto,1.0,Park,Bus Line,Swim School,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
50,Downtown Toronto,1.0,Park,Playground,Trail,Yoga Studio,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
64,Central Toronto,1.0,Park,Sushi Restaurant,Jewelry Store,Trail,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run
74,York,1.0,Park,Women's Store,Market,Fast Food Restaurant,Colombian Restaurant,Comfort Food Restaurant,Event Space,College Stadium,Ethiopian Restaurant,Electronics Store


When using 3, 4, or 5 centroids, the algorithm still assigns the neighbourhoods only to these two classes, so I chose to use just two clusters.


Cluster 0 is about food and shopping.


Cluster 1 is about open areas and leisure.