#### This project will explore, segment, and cluster the neighborhoods in the city of Toronto

##### Importing required libraries

In [96]:
import pandas as pd
import numpy as np
import urllib.request
import requests
import time
from bs4 import BeautifulSoup

##### Getting the contents of the website with 'requests' library

In [97]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)
response

<Response [200]>

##### Scraping the contents from html using BeautifulSoup and building an initial Pandas dataframe

In [98]:
# instantiating the soup object with response text, and html.parser option
soup = BeautifulSoup(response.text, "html.parser")

# parsing the table part of the response by looking at wikitable sortable class-type
postal_table = soup.find(class_="wikitable sortable")

# building the initial dataframe from table's contents 
table_rows = postal_table.find_all('tr')
row_values = []
for tr in table_rows:
    td = tr.find_all('td')
    row_text = [tr.text.strip() for tr in td if tr.text.strip()]
    if row_text:
        row_values.append(row_text)

toronto_df = pd.DataFrame(row_values, columns=["PostalCode", "Borough", "Neighborhood"])
toronto_df.head(10)
toronto_df.shape


(288, 3)

##### Cleaning the dataframe (dropping, combining, and truncating multiple cells)

In [110]:
# ignoring cells with a Borough that is Not assigned.
toronto_df = toronto_df[toronto_df.Borough != 'Not assigned']
toronto_df.head(10)
toronto_df.shape

# combining neighborhoods with the same PostalCode into single row 
toronto_df.groupby('PostalCode')['Neighborhood'].apply(', '.join).reset_index()
toronto_df.head(10)
toronto_df.shape

# replacing 'not assigned' neighborhood with the value of Borough
toronto_df.Neighborhood[toronto_df.Neighborhood == 'Not assigned'] = toronto_df.Borough
toronto_df.head(10)
toronto_df.shape


(211, 3)