# Part1 - Web Scrapping

## Install BeautifulSoup, lxml, html5lib, requests

In [1]:
!pip install beautifulsoup4



In [2]:
!pip install lxml



In [3]:
!pip install html5lib



In [4]:
!pip install requests



### Import data from URL

In [5]:
from bs4 import BeautifulSoup
import requests

In [6]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

In [7]:
content = soup.find('div', class_='mw-parser-output').table

In [8]:
import pandas as pd

# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


In [9]:
pd.set_option('display.max_rows', None)

## Parse data and populate DataFrame

In [10]:
neighborhoods.drop(labels=None, axis=0, index=neighborhoods.index, inplace=True, errors='raise')
for row in content.findAll("tr"):
    for cell in row.findAll("td"):
        if (cell.p.span.text != 'Not assigned'):
            PostalCode = cell.p.b.text
            BoroughAndNeighborhood = cell.p.span.text
            count = 0
            count = BoroughAndNeighborhood.count("(")
            if (count >= 1):
                BoroughAndNeighborhood = cell.p.span.text.split("(")
                Borough = BoroughAndNeighborhood[0]
                Neighborhood1 = BoroughAndNeighborhood[1].replace(" /", ",")
                Neighborhood = Neighborhood1.split(")")[0]
                neighborhoods = neighborhoods.append({'PostalCode': PostalCode, 'Borough': Borough, 'Neighborhood': Neighborhood}, ignore_index=True)


neighborhoods = neighborhoods.append({'PostalCode': PostalCode, 'Borough': Borough, 'Neighborhood': Neighborhood}, ignore_index=True)                
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M9A,Etobicoke,Islington Avenue
5,M1B,Scarborough,"Malvern, Rouge"
6,M3B,North York,Don Mills
7,M4B,East York,"Parkview Hill, Woodbine Gardens"
8,M5B,Downtown Toronto,"Garden District, Ryerson"
9,M6B,North York,Glencairn


In [11]:
neighborhoods.shape

(103, 3)

In [12]:
# Read Longitude and Lattitude data from CSV file

csv_path = 'https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
df_longlat = pd.read_csv(csv_path)

df_longlat.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
df_longlat.rename(columns = {'Postal Code':'PostalCode'}, inplace = True) 
df_longlat.head(5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
postalcodes = pd.merge(neighborhoods, df_longlat, how='left', on=['PostalCode'])
postalcodes

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
5,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
6,M3B,North York,Don Mills,43.745906,-79.352188
7,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
8,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
9,M6B,North York,Glencairn,43.709577,-79.445073


In [15]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.1.1e             |       h516909a_0         2.1 MB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    ------------------------------------------------------------
                       

In [22]:
!conda install -c conda-forge geopy --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.21.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.21.0         | 58 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


# Create a map of Toronto with neighborhoods superimposed on top

## Find Longitude and Latitude of Toronto

In [24]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Toronto Latitude = 43.6532 and Longitude = 79.3832

In [25]:
import folium # map rendering library

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(postalcodes['Latitude'], postalcodes['Longitude'], postalcodes['Borough'], postalcodes['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

#### Cluster only the neighborhoods in Downtown Toronto. So let's slice the original dataframe and create a new dataframe of the Downtown Toronto data.

In [31]:
DowntownToronto_data = postalcodes[postalcodes['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
DowntownToronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


Let's get the geographical coordinates of Downtown Toronto.

In [32]:
address = 'Downtown Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.


Let's visualize Downtown Toronto and the neighborhoods in it.

In [33]:
# create map of Manhattan using latitude and longitude values
map_DowntownToronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(DowntownToronto_data['Latitude'], DowntownToronto_data['Longitude'], DowntownToronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_DowntownToronto)  
    
map_DowntownToronto