In [1]:
import requests
import pandas as pd

In [2]:
import lxml.html

### Getting the HTML from the wikipedia


In [3]:
html = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [4]:
doc = lxml.html.fromstring(html.content)

In [5]:
##print(html.content)

### Using XPath to scrape the table for postal codes neighborhoods and burroughs 

In [6]:
## using xpath to get all of the postal code, burrough, and neighborhood text data
post_codes = doc.xpath('/html/body/div[3]/div[3]/div[4]/div/table[1]/tbody/tr[*]/td[1]/text()')
burroughs = doc.xpath('/html/body/div[3]/div[3]/div[4]/div/table[1]/tbody/tr[*]/td[2]/text()')
neighborhoods = doc.xpath('/html/body/div[3]/div[3]/div[4]/div/table[1]/tbody/tr[*]/td[3]/text()')
pc = []
b = []
n = []

In [7]:
##print(list(enumerate(post_codes)))
for i in enumerate(post_codes):
    ##print(i)
    pc.append(i[1])

In [8]:
for i in enumerate(burroughs):
    ##print(i)
    b.append(i[1])

In [9]:
del neighborhoods[142]
for i in enumerate(neighborhoods):
    ##print(i)
    n.append(i[1])
    


### Putting the data into a Data Frame

In [10]:
toronto_dic = {'Postal Code':pc, 'Borough':b, 'Neighborhood':n}
df_toronto = pd.DataFrame.from_dict(toronto_dic)
df_toronto.to_csv('toronto_df1')
df_toronto

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n
...,...,...,...
175,M5Z\n,Not assigned\n,\n
176,M6Z\n,Not assigned\n,\n
177,M7Z\n,Not assigned\n,\n
178,M8Z\n,Etobicoke\n,Mimico NW / The Queensway West / South of Bloo...


### Cleaning DataFrame to remove "not assigned" and blank rows


In [11]:

df_toronto['Postal Code'] = df_toronto['Postal Code'].str.replace('\n','')
df_toronto['Borough'] = df_toronto['Borough'].str.replace('\n','')
df_toronto['Neighborhood'] = df_toronto['Neighborhood'].str.replace('\n','')
df_toronto.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


### New clean data frame below

In [12]:
#removing all rows where neighborhood is not assigned and replacing "/" with commas in Neighborhood column
dft_new = df_toronto[df_toronto.Borough != 'Not assigned']
dft_new['Neighborhood'] = dft_new['Neighborhood'].str.replace(' /',',')
dft_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


## reading the csv with Lat/Long  coords 

In [13]:
g_coords = pd.read_csv("https://cocl.us/Geospatial_data")

In [14]:
g_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merging coordinate dataframe with the Toronto dataframe on Postal Code column

In [34]:
merged = g_coords.merge(dft_new, on='Postal Code')
merged = merged.reindex(columns = ['Postal Code','Borough','Neighborhood','Latitude','Longitude'])

In [35]:
merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [17]:
conda install -c conda-forge folium


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [19]:
!conda install -c conda-forge geopy --yes

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\brcruz\Anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.21.0-py_0



Downloading and Extracting Packages

geographiclib-1.50   | 34 KB     |            |   0% 
geographiclib-1.50   | 34 KB     | ####7      |  47% 
geographiclib-1.50   | 34 K

In [20]:
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

## Map of Toronto data with neighborhood, borough labels  

In [28]:
address = 'Scarboro, Toronto'
geolocator = Nominatim(user_agent = "ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geo coordinates of CA are {},{}'.format(latitude,longitude))

The geo coordinates of CA are 43.806122200000004,-79.28893950073736


In [31]:
map_ca = folium.Map(location=[latitude,longitude],zoom_start =10)

for lat, lng, borough, neighborhoods in zip(merged['Latitude'],merged['Longitude'],merged['Borough'],merged['Neighborhood']):
    label = '{},{}'.format(neighborhoods, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius = 5,
    popup = label,
    color = 'blue',
    fill = True,
    fill_color = '#3186cc',
    fill_opacity = 0.7,
    parse_html=False).add_to(map_ca)
map_ca