<a href="https://colab.research.google.com/github/alesouzaeu/Coursera_Capstone/blob/main/Segmenting_and_Clustering_Neighborhoods_in_Toronto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Segmenting and Clustering Neighborhoods in Toronto

## Install the necessaries packages We will need

In [1]:
!pip install requests2
!pip install beautifulsoup4



## Importing the libraries we will need.

In [119]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

from geopy.geocoders import Nominatim


#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')


#Tirando Warnings do código
import warnings
warnings.filterwarnings('ignore')




Libraries imported.


## Making web scraping with Beautiful Soup 4 in Wikipedia URL 

In [120]:
# I called the URL that contains the table we want.
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')
# Parseamos o Html desejado em variáveis para facilitar o trabalho
table = soup.find('table', {'class':'wikitable sortable'}).tbody

rows = table.find_all('tr')

columns = [v.text.replace('\n', '') for v in rows[0].find_all('th')]
# Defining the columns names with the web scraped texts.
df =  pd.DataFrame(columns=columns)
# Creating a loop to take a data at the right position in the dataframe.
for i in range(1, len(rows)):
  tds = rows[i].find_all('td')

  if len(tds) ==4:
      values = [tds[0].text, tds[1].text, tds[2].text.replace('\n','')]# eliminamos caracteres indesejados por meio do método .replace
  else:
    values = [td.text.replace('\n','') for td in tds ]# eliminamos caracteres indesejados por meio do método .replace
 
  df = df.append(pd.Series(values, index=columns), ignore_index=True)




In [121]:
# Printing the Header

df_header = pd.DataFrame(columns=columns)
df_header

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [122]:
# The Dataframe became like that.

df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [123]:
# We got out the Dataframe at json format.

df.to_json(r'list_of_postal_code_of_canada_m.json')

In [124]:
# We have imported the table to our notebook.
df = pd.read_json('list_of_postal_code_of_canada_m.json')

In [125]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [126]:
df.shape

(180, 3)

### Now, We have to throw out the 'Not assigned' data.

In [127]:
df = df.loc[df['Neighbourhood']!='Not assigned'] 

In [128]:
df.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [129]:
df1 = df

In [130]:
# new data frame with split value columns 
df1["Neighbourhood"]= df1["Neighbourhood"].str.split(", ", n = 1, expand = True) 

# df display 

In [131]:
pd.set_option("display.max_rows", None, "display.max_columns", None)


In [132]:
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park
5,M6A,North York,Lawrence Manor
6,M7A,Downtown Toronto,Queen's Park


In [133]:
#create variable column inside file called `myAddress and assign it to seleted address columns
df1['Query'] = df1['Postal Code'] + ", " +  df1['Borough']+ ", " + df1['Neighbourhood']

#print new column with index -first 5 rows only
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Query
2,M3A,North York,Parkwoods,"M3A, North York, Parkwoods"
3,M4A,North York,Victoria Village,"M4A, North York, Victoria Village"
4,M5A,Downtown Toronto,Regent Park,"M5A, Downtown Toronto, Regent Park"
5,M6A,North York,Lawrence Manor,"M6A, North York, Lawrence Manor"
6,M7A,Downtown Toronto,Queen's Park,"M7A, Downtown Toronto, Queen's Park"


### Now We need to insert new columns to our dataset with Latitude and Longitude

In [134]:

#create 2 new columns to store lat/long - initalise to null
df1['Latitude'] = ""
df1['Longitude'] = ""

#print first rows to sample
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Query,Latitude,Longitude
2,M3A,North York,Parkwoods,"M3A, North York, Parkwoods",,
3,M4A,North York,Victoria Village,"M4A, North York, Victoria Village",,
4,M5A,Downtown Toronto,Regent Park,"M5A, Downtown Toronto, Regent Park",,
5,M6A,North York,Lawrence Manor,"M6A, North York, Lawrence Manor",,
6,M7A,Downtown Toronto,Queen's Park,"M7A, Downtown Toronto, Queen's Park",,


### Here We are must to find the locations for each Postal Code with the Library GeoPy

In [135]:
'''
**Get Lat/Long Data with GeoPy**
---------------------

the code below calls a geopy API using a concatenated column of address values. We use this column as a query key 
to pull back cooresponding lat/long coordinates.
'''

geolocator = Nominatim(user_agent="geoapiExercises")

for i in df1.index:
    try:
        #tries fetch address from geopy
        location = geolocator.geocode(df1['Query'][i])
        
        #append lat/long to column using dataframe location
        df1.loc[i,'Latitude'] = location.latitude
        df1.loc[i,'Longitude'] = location.longitude
    except:
        #catches exception for the case where no value is returned
        #appends null value to column
        df1.loc[i,'Latitude'] = "null"
        df1.loc[i,'Longitude'] = "null"
#print first rows as sample
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Query,Latitude,Longitude
2,M3A,North York,Parkwoods,"M3A, North York, Parkwoods",,
3,M4A,North York,Victoria Village,"M4A, North York, Victoria Village",,
4,M5A,Downtown Toronto,Regent Park,"M5A, Downtown Toronto, Regent Park",,
5,M6A,North York,Lawrence Manor,"M6A, North York, Lawrence Manor",,
6,M7A,Downtown Toronto,Queen's Park,"M7A, Downtown Toronto, Queen's Park",,


In [143]:
address = 'Toronto, Ontario, CA'

geolocator = Nominatim(user_agent="geoapiExercises")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [137]:
df1 = df1.loc[df1['Latitude']!='null'] 

In [138]:
df1

Unnamed: 0,Postal Code,Borough,Neighbourhood,Query,Latitude,Longitude
9,M1B,Scarborough,Malvern,"M1B, Scarborough, Malvern",43.8092,-79.2217
18,M1C,Scarborough,Rouge Hill,"M1C, Scarborough, Rouge Hill",43.7803,-79.1305
26,M9C,Etobicoke,Eringate,"M9C, Etobicoke, Eringate",43.6623,-79.5765
36,M1G,Scarborough,Woburn,"M1G, Scarborough, Woburn",43.7598,-79.2253
49,M5H,Downtown Toronto,Richmond,"M5H, Downtown Toronto, Richmond",43.6512,-79.3813
68,M6K,West Toronto,Brockton,"M6K, West Toronto, Brockton",43.6509,-79.44
82,M2M,North York,Willowdale,"M2M, North York, Willowdale",43.786,-79.416
91,M2N,North York,Willowdale,"M2N, North York, Willowdale",43.7615,-79.4109
95,M6N,York,Runnymede,"M6N, York, Runnymede",43.6682,-79.4793
104,M6P,West Toronto,High Park,"M6P, West Toronto, High Park",43.6539,-79.4669


In [139]:
df1 = df1.reset_index()

In [140]:
df1.drop(['index'], axis=1)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Query,Latitude,Longitude
0,M1B,Scarborough,Malvern,"M1B, Scarborough, Malvern",43.8092,-79.2217
1,M1C,Scarborough,Rouge Hill,"M1C, Scarborough, Rouge Hill",43.7803,-79.1305
2,M9C,Etobicoke,Eringate,"M9C, Etobicoke, Eringate",43.6623,-79.5765
3,M1G,Scarborough,Woburn,"M1G, Scarborough, Woburn",43.7598,-79.2253
4,M5H,Downtown Toronto,Richmond,"M5H, Downtown Toronto, Richmond",43.6512,-79.3813
5,M6K,West Toronto,Brockton,"M6K, West Toronto, Brockton",43.6509,-79.44
6,M2M,North York,Willowdale,"M2M, North York, Willowdale",43.786,-79.416
7,M2N,North York,Willowdale,"M2N, North York, Willowdale",43.7615,-79.4109
8,M6N,York,Runnymede,"M6N, York, Runnymede",43.6682,-79.4793
9,M6P,West Toronto,High Park,"M6P, West Toronto, High Park",43.6539,-79.4669


### So, This is the shape of our dataframe


In [141]:
df1.shape

(13, 7)