## Task 1: Scrape the Wikipedia page

In [1]:
# Load pandas

import pandas as pd

# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# pd.read_html returns one DataFrame per <table> element found on the page
dfs = pd.read_html(url)

# The postal code table is taken to be the first table on the page
df = dfs[0]

# Only the last expression of a cell is displayed, so the former mid-cell
# `df.head()` (and the commented-out debug print) produced nothing and were removed.
df.shape

(180, 3)

### Remove rows whose 'Borough' is 'Not assigned'

In [2]:
# Keep only the rows whose borough has actually been assigned,
# then renumber the index from 0.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].reset_index(drop=True)
print(df.head(3))
df.shape

  Postal Code           Borough              Neighbourhood
0         M3A        North York                  Parkwoods
1         M4A        North York           Victoria Village
2         M5A  Downtown Toronto  Regent Park, Harbourfront


(103, 3)

### Combine neighbourhoods that share one postal code area

In [3]:
# Collapse rows that share a postal code and borough into one row whose
# 'Neighbourhood' lists every neighbourhood of that area.
# The result is assigned back to df: previously it was only displayed, so
# the combination never reached the later cells.
# ', ' is used as the joiner because it is the separator already present in
# the scraped 'Neighbourhood' values.
df = (
    df.groupby(['Postal Code', 'Borough'], sort=False)['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [4]:
# Select the 'Neighbourhood' values that are still 'Not assigned' and count them;
# an empty result means every row has a real neighbourhood.
unassigned = df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood']
unassigned.value_counts()

Series([], Name: Neighbourhood, dtype: int64)

### No 'Not assigned' neighbourhoods found

In [5]:
# Number of rows (postal code areas) currently in df
len(df)

103

In [6]:
! pip install geocoder



## Task 2: Add coordinates

In [7]:
# define the coordinate columns
add_names = ['Latitude', 'Longitude']

# Flat list of every column name: the existing df columns followed by the
# coordinate columns. (Previously this was a two-element list holding an
# Index object and a list, not a list of names.)
column_names = list(df.columns) + add_names


In [8]:
# Start from an empty frame that has only the coordinate columns
coordinates = pd.DataFrame(data=None, columns=add_names)
coordinates

Unnamed: 0,Latitude,Longitude


In [9]:
# Geocode every postal code and collect the results in `coordinates`.

import geocoder # import geocoder

# Upper bound on lookups per postal code. The previous `while` loop retried
# forever when the service kept returning no result and had to be interrupted
# by hand. NOTE(review): the Google provider presumably needs an API key —
# confirm before relying on this cell.
MAX_GEOCODE_ATTEMPTS = 5

# One (latitude, longitude) tuple per postal code, in df order
records = []

# loop the postal code
for pos in df['Postal Code']:
    lat_lng_coords = None

    # retry a bounded number of times until coordinates come back
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(pos))
        lat_lng_coords = g.latlng
        # treat both None and an empty result as "no coordinates yet"
        if lat_lng_coords:
            break

    if lat_lng_coords:
        latitude, longitude = lat_lng_coords[0], lat_lng_coords[1]
    else:
        # keep the row so coordinates stays aligned with df; values are missing
        latitude, longitude = None, None

    records.append((latitude, longitude))

# Build the frame once from the collected rows. The former per-row
# `coordinates.append(...)` discarded its return value, misspelled
# `ignore_index`, used lowercase keys that did not match the columns, and
# DataFrame.append no longer exists in pandas 2.
coordinates = pd.DataFrame(records, columns=add_names)
coordinates.head()

KeyboardInterrupt: 

In [10]:
# Read the postal code coordinates from the local CSV file
coordinates_csv = 'Geospatial_Coordinates.csv'
coor = pd.read_csv(coordinates_csv)
coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Attach Latitude/Longitude to each postal code area by joining on 'Postal Code'
# (default inner join: only postal codes present in both frames are kept)
df = df.merge(coor, on='Postal Code')
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Task 3: Cluster Neighbourhood

In [12]:
# Keep the boroughs whose name contains 'Toronto' (str.find gives -1 when the
# substring is absent), then renumber the index from 0.
in_toronto = df['Borough'].str.find('Toronto').ne(-1)
df = df.loc[in_toronto].reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [13]:
# one hot encoding
neigh_onehot = df['Neighbourhood'].str.get_dummies(sep=',')
neigh_onehot.head()


Unnamed: 0,Adelaide,Bathurst Quay,Cabbagetown,Chinatown,Deer Park,Design Exchange,Dovercourt Village,Exhibition Place,Forest Hill Road Park,Forest Hill SE,...,Runnymede,St. James Town,Stn A PO Boxes,Studio District,Summerhill West,The Annex,The Beaches,The Danforth West,Toronto Dominion Centre,University of Toronto
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [15]:
# (rows, columns) of df at this point in the notebook
df.shape

(39, 5)

In [19]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: # uncomment this line if you haven't completed the Foursquare API lab


In [24]:
# create map of Toronto, centred on the coordinates of the first row of df
# (columns are looked up by name so a change in column order cannot
# silently swap or replace the values)
latitude = df['Latitude'].iloc[0]
longitude = df['Longitude'].iloc[0]

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of df, with the borough name as its popup
for lat, lng, borough in zip(df['Latitude'], df['Longitude'], df['Borough']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option (set above); it was dropped from the
    # marker call, where it is not one of the circle style options.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto