# C9_wk3_Capstone_Toronto_wiki
Course 9:  Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

##   TASK 1:  Create DataFrame from wiki page (Toronto)

__Import Libraries__

In [35]:
import pandas as pd
from bs4 import BeautifulSoup  # scrape websites
import requests

__steps 1 & 2:  Scrape the WIKI page__

In [36]:
#===================================================
#  STEP 1:  Create notebook
#  STEP 2:  Scrape wiki page
#===================================================
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#  Bound the request with a timeout so an unresponsive server cannot hang the
#  notebook, and fail loudly on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
wiki_html = response.text
soup = BeautifulSoup(wiki_html, 'lxml')

#  Inspecting the wiki page to determine the tags
#  We need 'tr' and 'td' tags in our loop
#soup.tbody.find('tr')
#soup.tbody.find_all('tr')[4]
#soup.tbody.find_all('tr')[4].find_all('td')

#  One list of stripped cell texts per <tr> of the first <tbody> in the page.
#  A row that contains no <td> cells produces an empty list.
df_wiki = [
    [td.get_text().strip() for td in tr.find_all('td')]
    for tr in soup.tbody.find_all('tr')
]

df_wiki[0:4]

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods']]

__step 3:  Setup the DataFrame__

In [37]:
#  Step 3:  Build the DataFrame from the scraped rows
#===================================================
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(df_wiki, columns=column_names)

#  Quick look at the result: summary, first rows, dimensions.
#  df.info() writes its report itself and returns None, which is then printed too.
info_result = df.info()
print(info_result)
print(df.head())
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 3 columns):
PostalCode      288 non-null object
Borough         288 non-null object
Neighborhood    288 non-null object
dtypes: object(3)
memory usage: 6.9+ KB
None
  PostalCode       Borough      Neighborhood
0       None          None              None
1        M1A  Not assigned      Not assigned
2        M2A  Not assigned      Not assigned
3        M3A    North York         Parkwoods
4        M4A    North York  Victoria Village
(289, 3)


In [38]:
#   Keep only the rows whose Borough is something other than 'Not assigned'
#-----------------------------------------
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough]

In [39]:
#  Group Neighborhoods per PostalCode
#-----------------------------------------
#  code example from https://stackoverflow.com/questions/51584363/
#
#  The columns are selected with a list ([[...]]): indexing a GroupBy with a
#  bare tuple of names is rejected by current pandas.
#  dict.fromkeys() removes duplicates while keeping first-seen order, so the
#  joined text is identical on every run (iteration order of set() is arbitrary).
df = (
    df.groupby('PostalCode')[['Borough', 'Neighborhood']]
      .agg(lambda names: ', '.join(dict.fromkeys(names)))
      .reset_index()
)

#  Number of distinct values in each column after grouping
print("PostalCode:\t",len(df['PostalCode'].unique()))
print("Borough:\t",len(df['Borough'].unique()))
print("Neighborhood:\t",len(df['Neighborhood'].unique()))

PostalCode:	 103
Borough:	 11
Neighborhood:	 103


In [40]:
#  "Not assigned" neighborhood, will be the same as the borough
#-----------------------------------------
unassigned = df['Neighborhood'] == 'Not assigned'    # look for "Not assigned"

#  Assign through a single .loc call. The chained form
#  df.Neighborhood[mask] = ... may write to a temporary object
#  (SettingWithCopyWarning) and does not update df under Copy-on-Write.
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']   # replace

df[df['Neighborhood'] == 'Queen\'s Park']  #  check

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [41]:
#  Sanity check: this selection must be empty, i.e. no 'Neighborhood' is still 'Not assigned'
still_unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[still_unassigned]   #  check

Unnamed: 0,PostalCode,Borough,Neighborhood


__Print out the DataFrame__

In [42]:
#  Show every row: head() is asked for as many rows as the frame holds
df.head(len(df))

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Port Union, Highland Creek, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Kennedy Park, Ionview"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffside, Scarborough Village West, Cliffcrest"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


__save dataframe to csv file__

In [43]:
#  Write the Task 1 dataframe (index included) to a UTF-8 encoded CSV file
csv_path = 'C:/Users/ACER/Desktop/JAVA/IBM_Certificate/labs/C9_wk3_Toronto_task1.csv'
df.to_csv(csv_path, encoding='utf-8')

__Print the number of rows using the .shape attribute.__

In [44]:
#  Report the dataframe dimensions and the row count
#-----------------------------------------
dimensions = df.shape               #  (rows, columns)
print("shape:\t", dimensions)
print("rows:\t", dimensions[0])     #  first entry is the number of rows

shape:	 (103, 3)
rows:	 103


## TASK 2: Use GeoCoder to Create DataFrame with Latitude/Longitude

__Import Libraries__

In [45]:
import geocoder
import folium

__Define function to use the Postal Code to get the latitude & longitude__    

(Using the ArcGIS provider of the geocoder package, because geocoder.google is very slow.)

In [46]:
#  Define function to get latitude & longitude using postal codes
def get_latlon(postal_code, max_attempts=10):
    """Look up the coordinates of a Toronto postal code with geocoder.arcgis.

    The query '<postal_code>, Toronto, Ontario' is sent repeatedly until a
    response carries coordinates. The number of requests is capped so that a
    persistent failure (no network, unknown code) cannot loop forever.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. 'M9B'.
    max_attempts : int, optional
        Maximum number of geocoding requests before giving up (default 10).

    Returns
    -------
    The ``latlng`` value of the first response for which it is not None
    (a [latitude, longitude] pair, as in the sample output of this notebook).

    Raises
    ------
    RuntimeError
        If none of the ``max_attempts`` requests returned coordinates.
    """
    address = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(address).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(
            postal_code, max_attempts
        )
    )

get_latlon('M9B')   #  test function

[43.64969222700006, -79.55394499999994]

__Find location information for all postal codes__

In [47]:
#  Geocode every postal code, in the order the codes appear in the dataframe
postal_codes = df['PostalCode']

#  the Series is converted to a plain list before get_latlon is mapped over it
geo_latlon = list(map(get_latlon, postal_codes.tolist()))

__Create new dataframe for location and use it to update the original dataframe__

In [48]:
#  Column labels are passed as a list. A set literal ({...}) has no defined
#  order, so 'Latitude' and 'Longitude' could end up on the wrong columns.
#  NOTE(review): recent pandas versions are believed to reject a set here - confirm.
df_latlon = pd.DataFrame(data=geo_latlon, columns=['Latitude', 'Longitude'])
print(df_latlon.head())

#  Add Latitude and Longitude to the original dataframe
#  (column assignment from a Series aligns on the index labels of df and df_latlon)
df['Latitude']  = df_latlon['Latitude']
df['Longitude'] = df_latlon['Longitude']

    Latitude  Longitude
0  43.811525 -79.195517
1  43.785730 -79.158750
2  43.765690 -79.175256
3  43.768359 -79.217590
4  43.769688 -79.239440


__Print out updated dataframe__

In [49]:
#  Show every row of the dataframe, now including Latitude and Longitude
df.head(len(df.index))

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.811525,-79.195517
1,M1C,Scarborough,"Port Union, Highland Creek, Rouge Hill",43.785730,-79.158750
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765690,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.217590
4,M1H,Scarborough,Cedarbrae,43.769688,-79.239440
5,M1J,Scarborough,Scarborough Village,43.743125,-79.231750
6,M1K,Scarborough,"East Birchmount Park, Kennedy Park, Ionview",43.726245,-79.263670
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713133,-79.285055
8,M1M,Scarborough,"Cliffside, Scarborough Village West, Cliffcrest",43.723575,-79.234976
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.696665,-79.260163


## FOLIUM MAP
__Create folium map of the Borough using the Postal Codes__

In [50]:
#  Generate Map of Boroughs based on Postal Codes
#  The map is centred on the mean latitude/longitude of all rows
map_center = [df.Latitude.mean(), df.Longitude.mean()]
toronto_map = folium.Map(location=map_center, zoom_start=10)

# appearance shared by every marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='blue',
    fill_opacity=0.6,
)

# add one blue circle marker per row; the popup shows the row's borough
for lat, lng, label in zip(df.Latitude, df.Longitude, df.Borough):
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(toronto_map)

# display map
toronto_map