### Toronto Neighborhood DataFrame

In [132]:
import requests
import numpy as np
import lxml.html as lh
import pandas as pd


In [57]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [58]:
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [59]:
for t in tr_elements[0]:
    print(t)

<Element th at 0x7f51a21712c8>
<Element th at 0x7f51a2171318>
<Element th at 0x7f51a2171368>


In [60]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [61]:
tr_elements = doc.xpath('//tr')
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    col.append((name,[]))

In [62]:
col

[('Postcode', []), ('Borough', []), ('Neighbourhood\n', [])]

In [63]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [64]:
[len(C) for (title,C) in col]

[288, 288, 288]

In [65]:
Dict={title:column for (title,column) in col}


In [66]:
Dict.keys()

dict_keys(['Postcode', 'Borough', 'Neighbourhood\n'])

##### Data Frame

In [67]:
df=pd.DataFrame(Dict)

In [68]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


###### drop \n

In [69]:
df.columns= map(lambda x: x.rstrip('\n'),df.columns)

In [70]:
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [71]:
df['Neighbourhood'] =df['Neighbourhood'].map(lambda x: x.rstrip('\n'))

In [72]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [73]:
df.shape

(288, 3)

In [80]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned
df=df[df.Borough !='Not assigned']

In [81]:
df.shape

(211, 3)

In [120]:
df_final=df.groupby("Postcode" ,as_index=False).agg({'Borough' : 'first', 'Neighbourhood' : lambda x: "%s" % ', '.join(x)})

In [121]:
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [122]:
df_final.shape

(103, 3)

### Get Latitude and Longtitude

In [113]:
! pip install geocoder
import geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 17.1MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [133]:
postal_code=pd.read_csv("http://cocl.us/Geospatial_data")

In [134]:
postal_code.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [135]:
lat_lng_coords=df_final.join(postal_code.set_index('Postal Code'), on='Postcode')

In [136]:
lat_lng_coords.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Explore and Cluster the neighborhoods in Toronto

In [130]:
# import numpy as np # library to handle data in a vectorized manner

# import pandas as pd # library for data analsysis

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
# from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# import requests # library to handle requests

# # Matplotlib and associated plotting modules
# import matplotlib.cm as cm
# import matplotlib.colors as colors

# # import k-means from clustering stage
# from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         868 KB

The following NEW packages will be INSTALLED:

    altair:  3.2.0-py36_0 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge


Downloading and Extracting Packages
folium-0.5.0         | 45 KB    

In [142]:
# work with only boroughs that contain the word Toronto
Toronto_df=lat_lng_coords[lat_lng_coords.Borough.str.contains("Toronto") ]


In [144]:
Toronto_df.shape

(38, 5)

In [157]:
Toronto_df.reset_index(drop=True,inplace=True)

In [158]:
Toronto_df.groupby('Borough').count()

Unnamed: 0_level_0,Postcode,Neighbourhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,18,18,18,18
East Toronto,5,5,5,5
West Toronto,6,6,6,6


In [159]:
print('There are {} uniques Borough.'.format(len(Toronto_df['Borough'].unique())))

There are 4 uniques Borough.


##### Analyze each Neighborhood

In [165]:
# one hot encoding
toronto_onehot = pd.get_dummies(Toronto_df[['Borough']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = Toronto_df['Neighbourhood'] 

#move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,The Beaches,0,0,1,0
1,"The Danforth West, Riverdale",0,0,1,0
2,"The Beaches West, India Bazaar",0,0,1,0
3,Studio District,0,0,1,0
4,Lawrence Park,1,0,0,0


In [166]:
toronto_onehot.shape

(38, 5)

In [173]:
##Cluster
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_onehot.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

  return_n_iter=True)


array([0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 0], dtype=int32)

In [176]:
# add clustering labels
toronto_onehot.insert(0, 'Cluster Labels', kmeans.labels_)



# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = Toronto_df.join(toronto_onehot.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,0,0,1,0
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,0,0,1,0
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,0,0,1,0
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,0,0,1,0
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,1,0,0,0


In [188]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.653963, -79.387207.


In [189]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### 4 Clusters

In [195]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,The Beaches,0,0,0,1,0
1,"The Danforth West, Riverdale",0,0,0,1,0
2,"The Beaches West, India Bazaar",0,0,0,1,0
3,Studio District,0,0,0,1,0
37,Business Reply Mail Processing Centre 969 Eastern,0,0,0,1,0


In [196]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,West Toronto
10,Rosedale,1,0,1,0,0
11,"Cabbagetown, St. James Town",1,0,1,0,0
12,Church and Wellesley,1,0,1,0,0
13,"Harbourfront, Regent Park",1,0,1,0,0
14,"Ryerson, Garden District",1,0,1,0,0
15,St. James Town,1,0,1,0,0
16,Berczy Park,1,0,1,0,0
17,Central Bay Street,1,0,1,0,0
18,"Adelaide, King, Richmond",1,0,1,0,0
19,"Harbourfront East, Toronto Islands, Union Station",1,0,1,0,0


In [197]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,West Toronto
4,Lawrence Park,2,1,0,0,0
5,Davisville North,2,1,0,0,0
6,North Toronto West,2,1,0,0,0
7,Davisville,2,1,0,0,0
8,"Moore Park, Summerhill East",2,1,0,0,0
9,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",2,1,0,0,0
22,Roselawn,2,1,0,0,0
23,"Forest Hill North, Forest Hill West",2,1,0,0,0
24,"The Annex, North Midtown, Yorkville",2,1,0,0,0


In [193]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,West Toronto
31,West Toronto,3,0,0,0,1
32,West Toronto,3,0,0,0,1
33,West Toronto,3,0,0,0,1
34,West Toronto,3,0,0,0,1
35,West Toronto,3,0,0,0,1
36,West Toronto,3,0,0,0,1
