In [3]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be 

In [4]:
## PART 1 SCRAPPING NEIGHBORHOOD INFO 

#define headers for the request
headers = requests.utils.default_headers()
headers.update({"user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"})

In [5]:
#Wikipedia link for the information to scrap
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(url, headers)
soup = BeautifulSoup(req.content, 'html.parser')


In [6]:

#Finding columns name html style
columns_name = str(soup.table.find_all("th")).replace("\n","")

#cleaning tags
columns_string=re.findall('<th>(.+?)</th>',columns_name)

#Transform in a Series to create data frame
columns_name = pd.Series(list(columns_string))


In [7]:

#finding dataframe values
values = str(soup.table.find_all("td")).replace("\n","")
values = re.findall('<td>(.+?)</td>',values)

#finding Neighbourhood clean name
neigh = []
for value in values:
    if len(re.findall('>(.+?)</a>',str(value)))<1:
        neigh.append(value)
    else:
        a = re.findall('>(.+?)<',str(value)) #result type is a list
        neigh.append(a[0])

#We have a list with all values, now we are going to create 3 Series for the dataframe values (1 for Postcodes, 1 for Borough and 1 for Neighbourhoods)
#3 empty list
pc = [] #Postcode
br = [] #Borough
ne = [] #Neighbourhoods

for i in range(len(neigh)):
    if i == 0:
        pc.append(neigh[i])
        br.append(neigh[i+1])
        ne.append(neigh[i+2])
    elif i in range(int(len(neigh)/3)):
        pc.append(neigh[0+i*3])
        br.append(neigh[1+i*3])
        ne.append(neigh[2+i*3])
        

In [8]:
#Creating DataFrame with the data scrapped
d = {"PostalCode":pc,
     columns_name[1]:br,
     "Neighborhood":ne}
df = pd.DataFrame(d)

#Selecting Index with Borough == Not assigned
dropIndex = df[df['Borough']=="Not assigned"].index
#dropping Not Assigned Borough values

df.drop(dropIndex,inplace=True)

In [9]:
#Grouping by Postal Code and Borough
df=df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()
#printing number of rows
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
df.shape

(103, 3)

In [11]:
## PART 2 ADD LATITUDE AND LONGITUDE

#Lets Start with the second part, Getting lat and lon from each Postal Code
#URl to csv file to get Lat and Long

csv = 'https://cocl.us/Geospatial_data'
location = pd.read_csv(csv)


In [12]:
neighborhoods = pd.concat([df,location], axis=1, join='inner')

In [13]:
neighborhoods.drop(columns=['Postal Code'],inplace=True)
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [24]:
#PART 3 EXPLORE AND CLUSTERING

#There is a diference between the job done with New York data and the Toronto Data. Toronto has many Neighborhoods in the same borough, 
#and the principal geolocation is determined by the PostalCode. I going to incluide PostCode and Borough in each analysis.

#Creating Toronto Map

latitude = 43.6532
longitude = -79.3832
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood, postcode in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood'], neighborhoods['PostalCode']):
    label = '{} {}: {}'.format(postcode, borough, neighborhood )
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [15]:
#Creating a dataframe only with borough that contains the word "TORONTO"
Neigh = neighborhoods[neighborhoods['Borough'].str.contains('Toronto',case=False)].reset_index()

Neigh.drop(columns='index',inplace=True)

In [18]:
Neigh.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [23]:
#Creating a Map with only the borough that contains de word "TORONTO"
latitude = 43.6532
longitude = -79.3832
map_Toronto2 = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood, postcode in zip(Neigh['Latitude'], Neigh['Longitude'], Neigh['Borough'], Neigh['Neighborhood'], Neigh['PostalCode']):
    label = '{} {}: {}'.format(postcode, borough, neighborhood )
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto2)  
    
map_Toronto2

In [25]:
#foursquare credentials
CLIENT_ID = 'JRRZNIVCUYW3DIXKI34LPFI11ZQID4MOXAXI4VVIFH3GXBFM' #Foursquare ID
CLIENT_SECRET = 'GHZGQEA2VDYJYEXAJAPQI4SWSW241WXS5QEQDSSS4SERFPL3' #Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius



In [28]:
#Funtion to scrap the foursquare information, I going to include PostalCode and Borough. Because many neighborhood has the same location of one PostalCode.

def getNearbyVenues(postcode, borough, neigh, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for postcode, borough, neigh, lat, lng in zip(postcode, borough, neigh, latitudes, longitudes):
        print('{} {}: {}'.format(postcode,borough,neigh))
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            postcode,
            borough,
            neigh,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode',
                  'Borough',
                  'Neighborhood', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [29]:
Toronto_venues = getNearbyVenues(postcode=Neigh['PostalCode'], 
                                 borough=Neigh['Borough'], 
                                 neigh=Neigh['Neighborhood'], 
                                 latitudes=Neigh['Latitude'], 
                                 longitudes=Neigh['Longitude'],
                                 radius=500)

M4E East Toronto: The Beaches
M4K East Toronto: The Danforth West,Riverdale
M4L East Toronto: The Beaches West,India Bazaar
M4M East Toronto: Studio District
M4N Central Toronto: Lawrence Park
M4P Central Toronto: Davisville North
M4R Central Toronto: North Toronto West
M4S Central Toronto: Davisville
M4T Central Toronto: Moore Park,Summerhill East
M4V Central Toronto: Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
M4W Downtown Toronto: Rosedale
M4X Downtown Toronto: Cabbagetown,St. James Town
M4Y Downtown Toronto: Church and Wellesley
M5A Downtown Toronto: Harbourfront
M5B Downtown Toronto: Ryerson,Garden District
M5C Downtown Toronto: St. James Town
M5E Downtown Toronto: Berczy Park
M5G Downtown Toronto: Central Bay Street
M5H Downtown Toronto: Adelaide,King,Richmond
M5J Downtown Toronto: Harbourfront East,Toronto Islands,Union Station
M5K Downtown Toronto: Design Exchange,Toronto Dominion Centre
M5L Downtown Toronto: Commerce Court,Victoria Hotel
M5N Central Toronto: 

In [33]:
print(Toronto_venues.shape)
Toronto_venues.head()

(1724, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [44]:
Toronto_venues.groupby(['PostalCode','Borough','Neighborhood']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M4E,East Toronto,The Beaches,4,4,4,4,4,4
M4K,East Toronto,"The Danforth West,Riverdale",41,41,41,41,41,41
M4L,East Toronto,"The Beaches West,India Bazaar",17,17,17,17,17,17
M4M,East Toronto,Studio District,42,42,42,42,42,42
M4N,Central Toronto,Lawrence Park,3,3,3,3,3,3
M4P,Central Toronto,Davisville North,7,7,7,7,7,7
M4R,Central Toronto,North Toronto West,25,25,25,25,25,25
M4S,Central Toronto,Davisville,35,35,35,35,35,35
M4T,Central Toronto,"Moore Park,Summerhill East",4,4,4,4,4,4
M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West",15,15,15,15,15,15


In [45]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 238 uniques categories.


In [46]:
#Checking for each PostalCode Borough and Neighboorhood
# one hot encoding
manhattan_onehot = pd.get_dummies(manhattan_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

manhattan_onehot.head()

NameError: name 'manhattan_venues' is not defined