In [1]:
#pip install geocoder

In [2]:
import numpy as np
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import pandas as pd
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

## TASK-1

In [3]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page with a GET request. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw_text = response.text

# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed, so the parse tree can differ between machines.
soup = BeautifulSoup(raw_text, 'html.parser')

# The cells read in the following cells come from the first <table> on the page.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(url))

###### The 'p' element contains a 'span' element whose 'text' is either 'Not assigned' or starts with the name of the Borough

In [4]:
# find_all() replaces the deprecated findAll() alias.
print(table.find_all('td')[0].p.span.text)   # span text of the first cell
print(table.find_all('td')[11].p.span.text)  # span text of the twelfth cell

Not assigned
North York(Don Mills)North


##### 'PostalCode'

###### PostalCode is contained in first characters of the 'text' within the 'p' element

In [5]:
# Full text of the twelfth cell's <p> element (find_all() replaces the deprecated findAll()).
table.find_all('td')[11].p.text

'M3BNorth York(Don Mills)North\n'

In [6]:
# The first three characters of the <p> text are the postal code
# (find_all() replaces the deprecated findAll()).
table.find_all('td')[11].p.text[:3]

'M3B'

#### 'Borough'

###### The 'span' text before splitting on the symbol '('

In [7]:
# Text of the first <span> in the twelfth cell (find_all() replaces the deprecated findAll()).
table.find_all('td')[11].span.text

'North York(Don Mills)North'

###### Since we need the text before the symbol ( as it is the Borough, we split on that symbol and obtain:

In [8]:
# Everything before the first '(' in the span text is the Borough
# (find_all() replaces the deprecated findAll()).
table.find_all('td')[11].span.text.split('(')[0]

'North York'

##### 'Neighborhood'

###### Neighborhood is within the 'text' of the 'span' element

In [9]:
# Text of the first <span> in the twelfth cell (find_all() replaces the deprecated findAll()).
table.find_all('td')[11].span.text

'North York(Don Mills)North'

###### Split on '(', strip the closing ')' to obtain the text, then replace ' /' with ',' so that all neighborhoods of a postal code sit in one row, separated by commas.

In [10]:
# Take the text between the first and second '(' of the span text, drop the
# closing ')', turn ' /' separators into ',' and replace any ')' left inside
# the text with a space. find_all() replaces the deprecated findAll().
(
    table.find_all('td')[11].span.text
    .split('(')[1]
    .strip(')')
    .replace(' /', ',')
    .replace(')', ' ')
    .strip(' ')
)

'Don Mills North'

##### The above operations have been used in the for loop for all rows

In [11]:
table_contents = []

for row in table.find_all('td'):
    # Skip any cell that lacks the <p><span>...</span></p> layout read below,
    # instead of failing on a None attribute.
    if row.p is None or row.p.span is None:
        continue

    # Drop the cells whose 'Borough' is 'Not assigned'.
    if row.p.span.text == 'Not assigned':
        continue

    # The span text has the form 'Borough(Neighborhood / Neighborhood)'.
    # split() (not partition()) is kept on purpose: only the text between the
    # first and the second '(' is used as the neighborhood part.
    parts = row.span.text.split('(')
    borough = parts[0]

    if len(parts) > 1:
        # Drop the closing ')', turn ' /' separators into ',' and replace any
        # ')' left inside the text with a space.
        neighborhood = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    else:
        neighborhood = ''

    # If a cell has a borough but no (or a 'Not assigned') neighborhood,
    # the neighborhood is the same as the borough.
    if not neighborhood or neighborhood == 'Not assigned':
        neighborhood = borough

    cell = {
        # Postal code is the first 3 characters of the <p> text.
        'PostalCode': row.p.text[:3],
        'Borough': borough,
        'Neighborhood': neighborhood,
    }
    table_contents.append(cell)

# Passing the columns explicitly keeps the frame well-formed even when no cell
# was collected, so the Borough clean-up below cannot fail with a KeyError.
df = pd.DataFrame(table_contents, columns=['PostalCode', 'Borough', 'Neighborhood'])

# Some Borough strings come out of the markup fused with extra text;
# map those exact strings to readable names.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

In [12]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [13]:
# Report the number of distinct Borough values and the number of rows in the frame.
n_boroughs = len(df['Borough'].unique())
n_rows = df.shape[0]
print(f'The dataframe has {n_boroughs} boroughs and {n_rows} neighborhoods.')

The dataframe has 15 boroughs and 103 neighborhoods.


In [14]:
# For each text column, count how many entries are present (True) versus missing (False).
for column in ('Borough', 'Neighborhood', 'PostalCode'):
    present = df[column].notna()
    print(present.value_counts())

True    103
Name: Borough, dtype: int64
True    103
Name: Neighborhood, dtype: int64
True    103
Name: PostalCode, dtype: int64


## TASK-2

In [15]:
import geocoder

def get_latlng(postal_code, max_attempts=10):
    """Geocode a Toronto postal code with the ArcGIS provider of `geocoder`.

    The original implementation retried forever, so a postal code the service
    could not resolve (or a persistent outage) hung the notebook. The retries
    are now bounded.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. 'M4G'. It is queried as
        '<postal_code>, Toronto, Ontario'.
    max_attempts : int, optional
        Maximum number of geocoding requests before giving up (default 10).

    Returns
    -------
    The first non-None ``latlng`` value reported by the geocoder
    (a ``[latitude, longitude]`` list in the sample output of this notebook).

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    RuntimeError
        If no coordinates were obtained within ``max_attempts`` requests.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )
    
# Quick check of the helper on a single postal code; the result is displayed as the cell output.
location = get_latlng('M4G')
location

[43.709020000000066, -79.36348999999996]

In [16]:
# Geocode every postal code; each entry of the new column is the value returned by get_latlng.
df['Latitude_Longitude'] = df['PostalCode'].apply(get_latlng)

def get_lat(latlng):
    """Return the first element of `latlng`, i.e. the latitude of a (latitude, longitude) pair."""
    latitude = latlng[0]
    return latitude

def get_lng(latlng):
    """Return the second element of `latlng`, i.e. the longitude of a (latitude, longitude) pair."""
    longitude = latlng[1]
    return longitude

    
# Split the combined coordinate column into separate Latitude / Longitude columns,
# then discard the combined column.
for column, getter in (('Latitude', get_lat), ('Longitude', get_lng)):
    df[column] = df['Latitude_Longitude'].apply(getter)
df = df.drop(columns=['Latitude_Longitude'])

print(df.shape)
print(df.head())

(103, 5)
  PostalCode           Borough                      Neighborhood  Latitude  \
0        M3A        North York                         Parkwoods  43.75245   
1        M4A        North York                  Victoria Village  43.73057   
2        M5A  Downtown Toronto         Regent Park, Harbourfront  43.65512   
3        M6A        North York  Lawrence Manor, Lawrence Heights  43.72327   
4        M7A      Queen's Park     Ontario Provincial Government  43.66253   

   Longitude  
0  -79.32991  
1  -79.31306  
2  -79.36264  
3  -79.45042  
4  -79.39188  


In [17]:
# Base map centred on Toronto (latitude 43.65, longitude -79.4).
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=12)

X = df['Latitude']
Y = df['Longitude']
Z = np.stack((X, Y), axis=1)  # one (latitude, longitude) row per dataframe row

# One fill colour per cluster. The list is named `cluster_colors` so that it no
# longer rebinds `colors`, the alias under which matplotlib.colors is imported
# at the top of the notebook.
cluster_colors = ['red', 'green', 'blue', 'yellow']

# Cluster the rows with K-means using only their latitude and longitude.
# The number of clusters follows the palette so every label has a colour, and
# n_init is pinned because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=len(cluster_colors), n_init=10, random_state=0).fit(Z)

clusters = kmeans.labels_
df['Cluster'] = clusters

# Draw one circle marker per row, filled with the colour of its cluster.
for latitude, longitude, borough, neighborhood, cluster in zip(
        df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'], df['Cluster']):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=cluster_colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)

toronto_map

In [18]:
# Copy of the table without the K-means label column; the first rows are shown as the cell output.
df_1 = df.drop(columns=['Cluster'])
df_1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188


In [19]:
# (rows, columns) of the table without the 'Cluster' column.
df_1.shape

(103, 5)

### TASK - 3

To view the above Map, view the Notebook here: https://nbviewer.jupyter.org/github/ankitranjan30/Coursera_Capstone/blob/main/Week%203%20-%20Segmenting%20and%20Clustering%20Neighborhoods%20in%20Toronto.ipynb
    
The map is not visible on GitHub because Jupyter notebooks are displayed there as static HTML. They can be opened in nbviewer (https://nbviewer.jupyter.org/) instead, which restores the interactive elements.

# ----------------------------------------------------------------------------------------------------------

### Tasks completed