## Agrupación y Segmentación de Vecindarios en la Ciudad de Toronto

## Empiecemos por crear un nuevo Notebook e importemos librerias y dependencias 

In [1]:
import requests # librería para manejar las solicitudes
import pandas as pd # librería para análisis de datos
import numpy as np # librería para manejar datos vectorizados
import random # librería para generar números aleatorios
from bs4 import BeautifulSoup

!pip install geopy
from geopy.geocoders import Nominatim # módulo para convertir una dirección en valores de latitud y longitud 

# librerías para mostrar imágenes 
from IPython.display import Image 
from IPython.core.display import HTML 
    
# librería para convertir un archivo json en un dataframe pandas
from pandas.io.json import json_normalize


! pip install folium==0.5.0
import folium # librería para graficar 

print('Folium installed')
print('Libraries imported.')

Collecting geopy
  Downloading geopy-2.3.0-py3-none-any.whl (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting geographiclib<3,>=1.52
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.3.0
Collecting folium==0.5.0
  Downloading folium-0.5.0.tar.gz (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.2/79.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Created wheel for folium: filename=folium-0.5.0-py3-none-any.whl size=76117 sha256=d0129d00b0df78d9cd96e169a12a50e100003efd266761fc4b

## Crear el código para rastrear la siguiente página de Wikipedia, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, usaremos BeatifulSoup que hace Web Scraping para obtener los datos de la página web

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

## Procesaremos únicamente las celdas que tengan un municipio asignado. Ignoramos las celdas con un municipio que esté No asignado.

In [3]:
table = soup.find('table')

In [4]:
file=[]
ncol=list(range(0,9))

for i in ncol :
    
    for row in table.find_all('tr'): # in html table row is represented by the table
    # Get all columns in each row.
        cols = row.find_all('p') # in html a column is represented by the tag td or p
        file.append(cols[i].getText())

    # convert to dataframe:
df= pd.DataFrame(file)

## Por pandas se dará formato a la tabla.

In [5]:
df["Postal Code"]=df[0].str[0:3]
df["Borough"]=df[0].str[3:].str.split("(",n=1,expand=True)[0]
df["Neigh"]=df[0].str.rsplit("(",n=2,expand=True)[1].str.split(")",n=1,expand=True)[0]
df["Neighbourhood"]=df["Neigh"].str.replace(" / ",",")

In [6]:
postalcode_list=df[(df["Borough"]!="Not assigned\n")]
postalcode_list=postalcode_list.drop([0,"Neigh"],axis=1)

Veamos la tabla

In [7]:
postalcode_list.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
1,M1B,Scarborough,"Malvern,Rouge"
2,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
3,M1E,Scarborough,"Guildwood,Morningside,West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae


In [9]:
postalcode_list.shape

(103, 3)

## Coordenadas de latitud y longitud de cada vecindario

Para obtener los datos, utilizaremos un archivo csv que tiene las coordenadas geográficas de cada código postal: http://cocl.us/Geospatial_data

In [16]:
#"http://cocl.us/Geospatial_data"
Geospatial_Coordinates=pd.read_csv("Geospatial_Coordinates.csv")

In [17]:
coord_list=pd.merge(postalcode_list,Geospatial_Coordinates,on="Postal Code")

In [18]:
coord_list.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Agrupación y Segmentación de vecindarios de Toronto

Vamos a usar la libreria geopy para explorar los vecindarios de la ciudad de Toronto y vamos a crear un mapa.

In [24]:
address = 'TORONTO,Ontario'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [28]:
toronto_data = coord_list[coord_list['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.groupby("Borough").size().sort_values(ascending=False)

Borough
Downtown Toronto                                                17
Central Toronto                                                  9
West Toronto                                                     6
East Toronto                                                     4
Downtown TorontoStn A PO Boxes25 The Esplanade                   1
East TorontoBusiness reply mail Processing Centre969 Eastern     1
East YorkEast Toronto                                            1
dtype: int64

In [29]:
# crear un mapa de Manhattan usando los valores de latitud y longitud
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# añadir los marcadores al mapa
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Analisis de vecindarios de Toronto

Vamos a utilizar el API de FourSquare para explorar los barrios y segmentarlos

Definir version y credenciales de Foursquare

In [68]:
#@hidden_cell
CLIENT_ID ='55S2ZJJJVK3T1D1ZDNQLHEAH15HG4AHHMYJZNETKJK5EXKBB' # su ID de Foursquare
CLIENT_SECRET ='NRH3ODLT453CFZY2C0P33DJEDKCXROHBNC1OQSMT52ST3TD4' # su Secreto de Cliente de Foursquare
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 55S2ZJJJVK3T1D1ZDNQLHEAH15HG4AHHMYJZNETKJK5EXKBB
CLIENT_SECRET:NRH3ODLT453CFZY2C0P33DJEDKCXROHBNC1OQSMT52ST3TD4


In [73]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # crear la URL de solicitud de API
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # solicitud GET
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # regresa solo información relevante de cada sitio cercano
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [75]:
radius = 500
LIMIT = 50 # this will retreive data for only 100 venues per neighbourhood
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

The Beaches


KeyError: 'groups'

Veamos el dataset de Venues

In [66]:
print(toronto_venues.shape)
toronto_venues.head(10)

NameError: name 'toronto_venues' is not defined

Numero de venues por vecindario

In [67]:
#pd.set_option('display.max_colwidth', 1)
toronto_venues.groupby('Neighbourhood').count()

NameError: name 'toronto_venues' is not defined