# Toronto Neighborhood Clustering

In [1]:
#taking care of all the imports
import pandas as pd
import numpy as np

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
import requests #for getting webpage data

!conda install -c conda-forge beautifulsoup4 --yes

from bs4 import BeautifulSoup #for web-scraping

#!conda install -c conda-forge geocoder --yes

#import geocoder #for getting addresses

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    certifi-2018.11.29         |        py36_1000         145 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    ca-certificates-2018.11.29 |       ha4d7672_0         143 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

In [2]:
#getting the html page
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_text=requests.get(url)

soup=BeautifulSoup(html_text.text,'html.parser')


### Getting the data from the Soup file into a DataFrame

In [3]:
toronto_base=pd.DataFrame()

#marker to avoid getting the first header data
skip_head=True

#looking for row tags
for tag in soup.find_all('tr'):
    #marker to tell which piece of data we should be looking at
    i=1
    
    #iterate through children within the table row
    for ch in tag.children:
        #skip if this is part of the table header or if it's not a table entry
        if(ch.name!=None and skip_head==False):
            if(i==1):  #then this is the Postal Code
                postcode=ch.string
                i=i+1
            elif(i==2):  #then this is the Borough
                borough=ch.string
                i=i+1
            elif(i==3):  #then this is the Neighborhood
                neigh=ch.string
                if(neigh!='Not assigned\n'):  #then need to dig a little deeper into the hyperlink tag to get the data
                    if(ch.a!=None):
                        neigh=ch.a.string
                
                i=i+1
            
    #re-set marker 
    i=1
    
    if(skip_head==False):  #then this is not the first time cycling through
        if(borough!='Not assigned'):  #using this to screen out postal codes that aren't assigned
            #adding the row to the dataframe
            toronto_base=toronto_base.append([[postcode,borough,neigh]])
            
    #after 1st time through, this marker now goes false now that we've skipped the table header
    skip_head=False
    
#add in column headers
toronto_base.columns=('PostalCode','Borough','Neighborhood')
toronto_base.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
0,M4A,North York,Victoria Village
0,M5A,Downtown Toronto,Harbourfront
0,M5A,Downtown Toronto,Regent Park
0,M6A,North York,Lawrence Heights


### Pre-Processing the Data

In [35]:
#create the new dataframe with the correct headers
toronto_final=pd.DataFrame(columns=('PostalCode','Borough','Neighborhood'))

#getting one copy of each of the Postal Codes and putting them in the new dataframe
toronto_final['PostalCode']=toronto_base['PostalCode'].unique()
#sorting alphabetically and eliminating the 'None' Postal Code
toronto_final=toronto_final.sort_values('PostalCode')[:-1]

#resetting the index and dropping the old index column
toronto_final.reset_index(inplace=True)
toronto_final.drop(['index'],axis=1,inplace=True)
toronto_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,,
1,M1C,,
2,M1E,,
3,M1G,,
4,M1H,,


In [36]:
#iterating through each Postal Code
for i in range(0,toronto_final.shape[0]-1):
    #getting the corresponding Borough from the original dataframe and copying it into the new dataframe
    temp=toronto_base.loc[lambda df: df.PostalCode==toronto_final.loc[i]['PostalCode']]['Borough'].unique()
    toronto_final.loc[i]['Borough']=temp[0]
    
    #getting the list of neighborhoods from the original dataframe
    temp=toronto_base.loc[lambda df: df.PostalCode==toronto_final.loc[i]['PostalCode']]['Neighborhood'].unique()
    s='' #creating a new string object
    
    #iterating through the neighborhood list and adding them into the string object
    for j in range(0,len(temp)):
        if(temp[j][-1]=='\n'): #then need to eliminate \n at the end            
            temp[j]=temp[j][:-1]
        if(temp[j]=='Not assigned'): #then there's no Neighborhood name and need to copy the Borough name
            temp[j]=toronto_final.loc[i]['Borough']
        if(s==''):  #then it's the first neighborhood entered
            s=temp[j]
        else:  #there's already neighborhoods in the list and we need a comma
            s=s+", "+temp[j]
    #put the list of neighborhoods into the new dataframe
    toronto_final.loc[i]['Neighborhood']=s
toronto_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [37]:
print(toronto_final.shape)

(103, 3)


### Getting Latitude/Longitude

In [38]:
####### Tried geocoder, but it didn't work.  Below is the code I was trying.
#lat_lng=None
#got_em_all=True
#c=0

#all_lat_lng=pd.DataFrame()

#for i in range(0,toronto_final.shape[0]):
 #   lat_lng=None
  #  k=0
   
    #while(lat_lng==None) and k<10:
   
        #g=geocoder.google("{}, Toronto, Ontario".format(toronto_final.loc[i]['PostalCode']))
    #    lat_lng=g.latlng
     #   k=k+1

#    if(lat_lng==None):
 #       got_em_all=False
  #  else:
   #     all_lat_lng=all_lat_lng.append([[toronto_final.loc[i]['PostalCode'],lat_lng[0],lat_lng[1]]])
    #    c=c+1
#all_lat_lng.columns=('PostalCode','Latitude','Longitude')

#if(got_em_all==True):
 #   print('Obtained all latitude/longitude pairs')
#else:
 #   print('Missed some latitude/longitude pairs')
  #  print('Only got {} of them.',c)

In [41]:

#loading the latitude/longitude data
geos=pd.read_csv('http://cocl.us/Geospatial_data')
#Updating column headings
geos.columns=('PostalCode','Latitude','Longitude')
geos.head()



Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [42]:
#joining the two dataframes into one
toronto_total=toronto_final.join(geos.set_index('PostalCode'),on='PostalCode')
print(toronto_total.shape)
toronto_total.head()

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
