# Segmenting and Clustering Neighborhoods in Toronto
## Part 1

### We start by importing the necessary libraries.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

### We get the HTML content of the Wikipedia page.

In [2]:
# Wikipedia page that holds the postal-code table scraped in the next cell.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Send a browser-like User-Agent header with the request.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, headers=headers, timeout=30)
# Stop here on an HTTP error status instead of silently parsing an error page.
response.raise_for_status()
content = response.content
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(content, "html.parser")

### We extract the information we want and put it in a list.

In [3]:
# Locate the postal-code table in the parsed page.
table = soup.find('table', attrs={'class': 'wikitable sortable'})
if table is None:
    # Fail with a clear message if the page layout changed and the table is gone.
    raise ValueError("Could not find a 'wikitable sortable' table in the page")

# Column names for the DataFrame built from `codes`.
col_names = ['PostalCode', 'Borough', 'Neighborhood']

# One list of cell texts per table row.
codes = []
for row in table.find_all('tr'):
    # strip() removes surrounding whitespace (including the trailing newline)
    # without chopping a real character when a cell has no trailing newline.
    cells = [col.get_text().strip() for col in row.find_all('td')]
    # Rows without any <td> (such as a header row made of <th> cells) carry no
    # data; skip them instead of assuming the first row is always the header.
    if cells:
        codes.append(cells)

### We use the previous list to create a DataFrame.

In [4]:
# Build the table from the scraped rows and, in the same expression, turn the
# 'Not assigned' placeholder into NaN so that missing values can be dropped.
df = (
    pd.DataFrame(data=codes, columns=col_names)
    .replace(to_replace='Not assigned', value=np.nan)
)

### We ignore the rows which don't have an assigned borough.

In [5]:
# Keep only the rows whose 'Borough' value is present (not NaN).
df = df.dropna(subset=['Borough'], axis='index')

### We check the DataFrame

In [6]:
# Summary statistics (count / unique / top / freq) for each column.
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so printed a stray "None" afterwards.
df.info()

       PostalCode     Borough Neighborhood
count         103         103          103
unique        103          10           99
top           M1L  North York    Downsview
freq            1          24            4
<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 3 columns):
PostalCode      103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 3.2+ KB
None


### All the postal codes are unique and no neighborhood is left as "Not assigned". Good! Finally, we display the shape of the DataFrame (number of rows, number of columns):

In [7]:
# Shape of df as a (rows, columns) tuple; shown as the cell's output.
df.shape

(103, 3)