# Week 3 assignment: Segmenting and Clustering Neighborhoods in Toronto

# Question 1

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page be parsed by the cells below.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
# NOTE: despite its name, website_url holds the page's HTML text, not a URL.
website_url = response.text

In [3]:
# Parse the downloaded HTML text with the lxml parser.
soup = BeautifulSoup(markup=website_url, features="lxml")

In [4]:
# Locate the first <table> whose class attribute is "wikitable sortable".
table = soup.find("table", attrs={"class": "wikitable sortable"})
# find() returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on table.find_all(...).
if table is None:
    raise ValueError('No table with class "wikitable sortable" found in the downloaded page')

In [5]:
# Turn the Wikipedia table into a list of rows, each row being the list of
# its <td> cell texts with newline characters removed.
# Rows that contain no <td> cells yield an empty list and are skipped.
table_list = []
for row in table.find_all('tr'):
    cells = [cell.get_text().replace('\n', '') for cell in row.find_all('td')]
    if cells:
        table_list.append(cells)

In [6]:
# When the third field (neighbourhood) is 'Not assigned', reuse the second
# field (borough) as the neighbourhood. Rows are updated in place.
for row in table_list:
    if row[2] == 'Not assigned':
        row[2] = row[1]

In [7]:
# Build the DataFrame from the scraped rows.
column_names = ["PostalCode", "Borough", "Neighbourhood"]
df = pd.DataFrame(table_list, columns=column_names)
# Keep only the rows whose borough is assigned.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]

In [8]:
# Rows sharing the same PostalCode (and Borough) are combined into one entry,
# with their neighbourhoods joined by ", ".
grouped_neighbourhoods = df.groupby(['PostalCode', 'Borough'])['Neighbourhood']
temp = grouped_neighbourhoods.apply(', '.join)

In [9]:
# Final DataFrame for question 1: move the PostalCode and Borough index levels
# of the grouped Series back into ordinary columns.
df = temp.reset_index()

In [10]:
# Report (rows, columns) of the question 1 DataFrame.
print(df.shape)

(103, 3)


In [11]:
# Preview the first ten rows instead of rendering the whole DataFrame.
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


# Question 2

In [12]:
# Load the provided CSV of geographical coordinates.
# The first CSV column becomes the index; the join below matches it against
# df['PostalCode'], so it presumably holds the postal codes (verify against the file).
geo = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv", index_col=0)

In [13]:
# Join the coordinates onto the question 1 DataFrame to get the final answer
# for question 2. geo is matched on its index against df['PostalCode'].
# Any coordinate columns left over from a previous run of this cell are dropped
# first, so re-running the cell gives the same result instead of raising
# "columns overlap but no suffix specified".
df = (
    df.drop(columns=geo.columns, errors='ignore')
      .join(geo, on='PostalCode', how='left')
)

In [14]:
# Check the join result: count missing values per column.
df.isna().sum()

PostalCode       0
Borough          0
Neighbourhood    0
Latitude         0
Longitude        0
dtype: int64

In [15]:
# Report (rows, columns) after the join, then preview the first five rows.
print(df.shape)
df.head(5)

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Question 3

In [16]:
# !conda install -c conda-forge folium=0.5.0 --yes 
# uncomment above line if you haven't completed the Foursquare API lab

In [17]:
import folium

In [21]:
# Geographical coordinates of Toronto, used as the initial map centre.
latitude, longitude = 43.6532, -79.3832

# Create the base map of Toronto centred on those coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per DataFrame row, with a "neighbourhood, borough" popup.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighborhood in df[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)
#map_toronto.save('map_toronto.html')
map_toronto

In [22]:
# This map may not render on GitHub; open map_toronto.png in the same repository instead.