<a href="https://colab.research.google.com/github/alesouzaeu/Coursera_Capstone/blob/main/Segmenting_and_Clustering_Neighborhoods_in_Toronto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Segmenting and Clustering Neighborhoods in Toronto

## Install the necessaries packages We will need

In [None]:
!pip install requests2
!pip install beautifulsoup4
!pip install geocoder



## Importing the libraries we will need.

In [130]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

#Tirando Warnings do código
import warnings
warnings.filterwarnings('ignore')




## Making web scraping with Beautiful Soup 4 in Wikipedia URL 

In [133]:
# I called the URL that contains the table we want.
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')
# Parseamos o Html desejado em variáveis para facilitar o trabalho
table = soup.find('table', {'class':'wikitable sortable'}).tbody

rows = table.find_all('tr')

columns = [v.text.replace('\n', '') for v in rows[0].find_all('th')]
# Defining the columns names with the web scraped texts.
df =  pd.DataFrame(columns=columns)
# Creating a loop to take a data at the right position in the dataframe.
for i in range(1, len(rows)):
  tds = rows[i].find_all('td')

  if len(tds) ==4:
      values = [tds[0].text, tds[1].text, tds[2].text.replace('\n','')]# eliminamos caracteres indesejados por meio do método .replace
  else:
    values = [td.text.replace('\n','') for td in tds ]# eliminamos caracteres indesejados por meio do método .replace
 
  df = df.append(pd.Series(values, index=columns), ignore_index=True)




In [134]:
# Printing the Header

df_header = pd.DataFrame(columns=columns)
df_header

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [135]:
# The Dataframe became like that.

df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [136]:
# We got out the Dataframe at json format.

df.to_json(r'list_of_postal_code_of_canada_m.json')

In [137]:
# We have imported the table to our notebook.
df = pd.read_json('list_of_postal_code_of_canada_m.json')

In [138]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [139]:
df.shape

(180, 3)

### Now, We have to throw out the 'Not assigned' data.

In [140]:
df = df.loc[df['Borough']!='Not assigned'] 

In [141]:
df.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [142]:
df.shape

(103, 3)

### So, This is the shape of our dataframe


### Now We need to insert new columns to our dataset with Latitude and Longitude

In [143]:
#create 2 new columns to store lat/long - initalise to null
df['location_lat'] = ""
df['location_long'] = ""

#print first rows to sample
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,location_lat,location_long
2,M3A,North York,Parkwoods,,
3,M4A,North York,Victoria Village,,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


### Here We are must to find the locations for each Postal Code with the Library GeoPy

In [144]:
# Use Geopy to fetch geocode data
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="myApp")

for i in df.index:
    try:
        #tries fetch address from geopy
        location = geolocator.geocode(df['Postal Code'][i])
        
        #append lat/log to column using dataframe location
        df.loc[i, 'location_lat'] = location.latitude
        df.loc[i, 'location_long'] = location.longitude
    except:
        #catches exception for the case where no value is returned
        #appends null value to column
        df.loc[i, 'location_lat'] = "null"
        df.loc[i, 'location_long'] = "null"
        
#print first rows as sample
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,location_lat,location_long
2,M3A,North York,Parkwoods,-12.1983,-76.9623
3,M4A,North York,Victoria Village,49.4843,8.467
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",45.4406,28.018
5,M6A,North York,"Lawrence Manor, Lawrence Heights",53.7942,-1.75201
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",44.4282,26.166


### Here We have to handle with the missing data.

In [145]:
df = df.loc[df['location_lat']!='null'] 

In [146]:
df = df.reset_index()

In [147]:
df = df.drop(['index'], axis=1)

### Nice! This is the Dataset we want!

In [148]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,location_lat,location_long
0,M3A,North York,Parkwoods,-12.1983,-76.9623
1,M4A,North York,Victoria Village,49.4843,8.467
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",45.4406,28.018
3,M6A,North York,"Lawrence Manor, Lawrence Heights",53.7942,-1.75201
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",44.4282,26.166
5,M1B,Scarborough,"Malvern, Rouge",45.2553,-76.2898
6,M3B,North York,Don Mills,45.4393,28.0213
7,M4B,East York,"Parkview Hill, Woodbine Gardens",45.4406,28.0192
8,M5B,Downtown Toronto,"Garden District, Ryerson",45.4408,28.0161
9,M6B,North York,Glencairn,44.4274,26.1654
