# Segmenting and Clustering Neighborhoods in Toronto

### Scraping data from Wikipedia

In [92]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page
import pandas as pd

In [93]:
#Due to changes in the wikipedia page I have to use another URL containing the original information needed for this practice
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969"

In [94]:
data  = requests.get(url).text

In [95]:
soup = BeautifulSoup(data,"html5lib")

In [96]:
tables = soup.find_all('table')
len(tables)

3

In [97]:
#print(tables[0].prettify())

In [98]:
Nei_data = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])

for row in tables[0].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        PCode = col[0].text.rstrip('\n')
        Borough = col[1].text.rstrip('\n')
        Neighborhood = col[2].text.rstrip('\n')
        Nei_data = Nei_data.append({"PostalCode":PCode, "Borough":Borough, "Neighborhood":Neighborhood}, ignore_index=True)

Nei_data

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [99]:
#Nei_data[Nei_data['Borough'] == 'Not assigned']

In [100]:
#Nei_data[Nei_data['Neighborhood']== 'Not assigned']

### Pre - procesing Borough Data
#### - Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
#### - More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma.
#### - If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [101]:
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
print("Before: ",Nei_data.shape)

Nei_data.drop(Nei_data[Nei_data['Borough'] == 'Not assigned'].index, inplace = True)

print("After: ",Nei_data.shape)

Before:  (180, 3)
After:  (103, 3)


In [102]:
# More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma

postal_code = Nei_data[['PostalCode','Neighborhood']]
duplicates = postal_code['PostalCode'].value_counts().to_frame()
duplicates[duplicates['PostalCode'] > 1]

# There is no duplicates per Postal Code, maybe someone has changed the source table in wikipedia. It has no sense to combine rows to get all the neighborhoods linked to one postal code in one row and separated by commas because that's the way it is in Wikipedia.

Unnamed: 0,PostalCode


In [103]:
# despite there is no duplicates postal codes, next you can find the code to combine several Neighborhoods under the same postal code.

# We get a list of all postal codes without duplicates.
postal_code_list = list(Nei_data['PostalCode'].drop_duplicates())

# and we create a dicionary getting all the Neighborhoods (separated by commas) linked to each postal code.
postal_code_dic = dict({'PostalCode':postal_code_list,'Neighborhood':[','.join(list(postal_code[postal_code['PostalCode']==item]['Neighborhood'])) for item in postal_code_list]})

#transform the dictionary to a dataframe
postal_code_datafr = pd.DataFrame(postal_code_dic)

postal_code_datafr.head()


Unnamed: 0,PostalCode,Neighborhood
0,M3A,Parkwoods
1,M4A,Victoria Village
2,M5A,"Regent Park, Harbourfront"
3,M6A,"Lawrence Manor, Lawrence Heights"
4,M7A,"Queen's Park, Ontario Provincial Government"


In [104]:
# We drop the original Neighborhood column
Nei_data.drop(columns = ['Neighborhood'], inplace = True)

# delete duplicates
Nei_data.drop_duplicates(inplace = True)

# and place the new column Neighborhood with the values separted by commas 
Nei_data = Nei_data.merge(postal_code_datafr, on = 'PostalCode')

Nei_data.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [105]:
#  If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

Nei_data['Neighborhood'].replace('Not assigned',Nei_data['Borough'], inplace=True)
Nei_data.head(12)
# Please notice that the order of the elements differ from the exam screenshot example due to changes in the wikipedia changes.

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [106]:
# In case you want to check the same records given by the author next the dataframe filtered getting the exam example postal codes same order.

Order = {'M5G':0,'M2H':1,'M4B':2,'M1J':3,'M4G':4,'M4M':5,'M1R':6,'M9V':7,'M9L':8,'M5V':9,'M1B':10,'M5A':11}

Nei_data[Nei_data['PostalCode'].isin(['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A'])]

Nei_data_exam = Nei_data[Nei_data['PostalCode'].isin(['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A'])]
Nei_data_exam['Order'] = [Order[item] for item in Nei_data_exam['PostalCode']]
Nei_data_exam.set_index('Order', inplace = True)
Nei_data_exam.sort_values(by=['Order'], inplace = True)
Nei_data_exam.reset_index(inplace = True)
Nei_data_exam.drop(columns = ['Order'], inplace = True)
Nei_data_exam

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


In [107]:
Nei_data.shape

(103, 3)

### Getting Longitude and Latitude

In [108]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.



import geocoder # import geocoder

def lat_long (postal_code):
    # initialize your variable to None
    print(postal_code)
    for item in postal_code:
        lat_lng_coords = None
        print('item',str(item))
        # loop until you get the coordinates
        print('starting loop',lat_lng_coords)
        while(lat_lng_coords is None):
          g = geocoder.google('{}, Toronto, Ontario'.format(str(item)))
          lat_lng_coords = g.latlng
        print('lat_long',lat_lng_coords)

        latitude.extend(lat_lng_coords[0])
        longitude.extend(lat_lng_coords[1])
    
    return latitude, longitude

Nei_data_test = Nei_data.loc[0:2,:]

Nei_data_test['Latitude'], Nei_data_test['Longitude'] = lat_long(list(Nei_data_test['PostalCode']))

#Nei_data['Latitude'], Nei_data['Longitude'] = lat_long(Nei_data['PostalCode'])

Nei_data_test
    

In [109]:
# As I am not getting anything from geocoderI am going to try with th csv file.
# The URL given by the author is not working.

#lat_long_pd = pd.read_csv('http://cocl.us/Geospatial_data')
#lat_long_pd.head()

# Once I have manually downloaded in my PC the csv file and once I have uploaded it to my IBM Cloud Project:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_bb27e20917e047638a8ac4e4bb04c59d = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='83_0prJpPAE4r_6XaRZ4AbaluezDEXajJo8F1eF-fiRH',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_bb27e20917e047638a8ac4e4bb04c59d.get_object(Bucket='datasciencecapstone-donotdelete-pr-nifdfkgyttirop',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

lat_long_pd = pd.read_csv(body)
lat_long_pd.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [110]:
# Adding latitude and Longitude to the final dataframe

Nei_data = Nei_data.merge(lat_long_pd, left_on = 'PostalCode', right_on = 'Postal Code')
Nei_data.drop(columns='Postal Code', inplace = True)
Nei_data.head(12)
# Please notice that the order of the elements differ from the exam screenshot example due to changes in the wikipedia changes.

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [111]:
# In case you want to check the same records given by the author next the dataframe filtered getting the exam example postal codes same order.

Order = {'M5G':0,'M2H':1,'M4B':2,'M1J':3,'M4G':4,'M4M':5,'M1R':6,'M9V':7,'M9L':8,'M5V':9,'M1B':10,'M5A':11}

Nei_data[Nei_data['PostalCode'].isin(['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A'])]

Nei_data_exam = Nei_data[Nei_data['PostalCode'].isin(['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A'])]
Nei_data_exam['Order'] = [Order[item] for item in Nei_data_exam['PostalCode']]
Nei_data_exam.set_index('Order', inplace = True)
Nei_data_exam.sort_values(by=['Order'], inplace = True)
Nei_data_exam.reset_index(inplace = True)
Nei_data_exam.drop(columns = ['Order'], inplace = True)
Nei_data_exam

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


In [112]:
Nei_data.shape

(103, 5)