## Segmenting and Clustering Neighborhoods in Toronto_GR
### Part1
#### For this assignment, you will be required to explore and cluster the neighborhoods in Toronto.

In [2]:
!pip install bs4
import pandas as pd
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 8.8MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from beautifulsoup4->bs4)
  Downloading https://files.pythonhosted.org/packages/41/e7/3617a4b988ed7744743fb0dbba5aa0a6e3f95a9557b43f8c4740d296b48a/soupsieve-2.2-py3-none-any.whl
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jupyterlab/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beauti

In [3]:
# URL of the Wikipedia page whose HTML tables hold the Canadian postal-code
# data; it is downloaded with requests and parsed in the following cells.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [4]:
# Download the page. Fail fast on an HTTP error instead of silently parsing an
# error page, and use a timeout so the cell cannot hang on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [5]:
# Parse the downloaded HTML with the html5lib parser, then gather every
# <table> element found in the document.
soup = BeautifulSoup(data, features="html5lib")
tables = soup.find_all(name="table")

In [6]:
# Find the table(s) whose markup mentions "Neighbourhood".
matching_indices = [
    index for index, table in enumerate(tables) if "Neighbourhood" in str(table)
]
# Without this guard a page with no matching table would leave table_index
# undefined and surface later as an unrelated NameError.
if not matching_indices:
    raise ValueError("No table containing 'Neighbourhood' was found on the page")
# Keep the last match, which is what the original loop ended up selecting.
table_index = matching_indices[-1]
print(table_index)

0


In [7]:
# Scrape the selected table into a DataFrame with one row per postal code.
# Records are collected in a list and the frame is built once at the end:
# growing a DataFrame row by row is quadratic, and DataFrame.append was
# removed in pandas 2.0.
pc_columns = ["Postal Code", "Borough", "Neighbourhood"]
pc_records = []

for row in tables[table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    # Rows without <td> cells are skipped, as before; rows with fewer than
    # three cells are skipped too so indexing below cannot raise IndexError.
    if len(col) < 3:
        continue
    postalcode = col[0].text.strip()
    borough = col[1].text.strip()
    neighbourhood = col[2].text.strip()
    pc_records.append({"Postal Code": postalcode, "Borough": borough, "Neighbourhood": neighbourhood})

# Passing columns= keeps the expected schema even if no rows were scraped.
pc_data = pd.DataFrame(pc_records, columns=pc_columns)

pc_data

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [8]:
# Keep only the rows whose borough has actually been assigned.
has_borough = pc_data["Borough"] != "Not assigned"
pc_data = pc_data[has_borough]
pc_data.shape

(103, 3)

### Part2
#### Get the latitude and longitude coordinates of each neighborhood

In [9]:
# Download the geospatial CSV, save it locally, and load it into a DataFrame.
# Fail fast on an HTTP error, and use a timeout so the cell cannot hang.
data2 = requests.get("http://cocl.us/Geospatial_data", allow_redirects=True, timeout=30)
data2.raise_for_status()
# The context manager guarantees the file is flushed and closed before
# pandas reads it back.
with open('geo.csv', 'wb') as geo_file:
    geo_file.write(data2.content)
geo_data = pd.read_csv('geo.csv')
geo_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [10]:
# Attach the coordinates to each postal code by joining the two frames on
# their shared "Postal Code" column (inner join, the pandas default).
pc_data = pd.merge(pc_data, geo_data, how="inner", on="Postal Code")
pc_data

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Part3
#### Generate maps to visualize your neighborhoods and how they cluster together

In [11]:
import numpy as np
from sklearn.cluster import KMeans
!pip install folium
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors



In [20]:
# Build the clustering frame by removing the borough and postal-code columns.
cn = pc_data.drop(columns=["Borough", "Postal Code"])

In [21]:
# set number of clusters
k = 10

# Select the coordinate columns explicitly rather than dropping 'Neighbourhood':
# if this cell is re-run after 'Cluster Labels' has been added to cn, the old
# labels would otherwise be fed into the clustering as a feature.
cn_clustering = cn[['Latitude', 'Longitude']]

# run k-means clustering
# n_init is pinned so the result does not depend on the scikit-learn version's
# default; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=1).fit(cn_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:15]

array([2, 8, 1, 4, 1, 6, 5, 7, 8, 1, 4, 6, 5, 8, 8], dtype=int32)

In [22]:
# add clustering labels
# Plain column assignment is used instead of DataFrame.insert: insert raises
# ValueError if the column already exists, so the cell could not be re-run.
# On a fresh run the new column is still appended as the last column.
cn['Cluster Labels'] = kmeans.labels_

cn.head() # check the last columns!

Unnamed: 0,Neighbourhood,Latitude,Longitude,Cluster Labels
0,Parkwoods,43.753259,-79.329656,2
1,Victoria Village,43.725882,-79.315572,8
2,"Regent Park, Harbourfront",43.65426,-79.360636,1
3,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,4
4,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1


In [27]:
# create map, centred on the first neighbourhood in the frame
# (.iloc is positional, so this works whatever the index labels are)
map_clusters = folium.Map(location=[cn['Latitude'].iloc[0], cn['Longitude'].iloc[0]], zoom_start=11)

# set color scheme for the clusters: k evenly spaced colors from the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(cn['Latitude'], cn['Longitude'], cn['Neighbourhood'], cn['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to k-1, so they index the palette directly.
    # The previous `cluster-1` only worked because index -1 wraps around to
    # the last color.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters