# Segmenting and Clustering Neighborhoods in Toronto
### Adding Geographical Coordinates to the Dataframe

In [1]:
# Import libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup
import requests

### First we have to build the Neighbourhood dataframe as we did in Task 1

In [5]:
# Retrieve neighbourhood data from wikipedia page
nb_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast instead of silently parsing garbage: the timeout stops the cell
# from hanging on a stalled connection, and raise_for_status() turns an HTTP
# error response into an exception rather than letting the error page be
# parsed as if it were the postal-code table.
nb_response = requests.get(nb_url, timeout=30)
nb_response.raise_for_status()
nb_html = nb_response.text
nb_soup = BeautifulSoup(nb_html, 'html.parser')

# One list of stripped cell texts per <tr> of the first <tbody> on the page.
# A row without any <td> cells yields an empty list, which pandas fills
# with None in every column.
nb_data = [
    [td.get_text().strip() for td in tr.find_all('td')]
    for tr in nb_soup.tbody.find_all('tr')
]

# Read data into a dataframe
nb_df = pd.DataFrame(nb_data, columns=['PostalCode', 'Borough', 'Neighbourhood'])

# Find rows of Boroughs that have "Not assigned"
NA_indx = nb_df[nb_df['Borough'] == "Not assigned"].index

# Drop the rows that contain None (the row built from a <tr> with no <td>),
# then drop all rows of Boroughs that have "Not assigned".
# NA_indx holds index labels, so it stays valid after dropna().
nb_df = nb_df.dropna().drop(NA_indx)

# Merge duplicate rows based on PostalCode and Borough
nb_df = (
    nb_df
    .groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)

# Fix Not assigned Neighbourhoods with the Borough Names
def fix_NB(data):
    """Return the neighbourhood label to use for a single row.

    Parameters
    ----------
    data : mapping-like row
        Must support lookup by the keys 'Neighbourhood' and 'Borough'.

    Returns
    -------
    The value stored under 'Borough' when the neighbourhood is the
    placeholder 'Not assigned'; otherwise the 'Neighbourhood' value unchanged.
    """
    neighbourhood = data['Neighbourhood']
    return data['Borough'] if neighbourhood == 'Not assigned' else neighbourhood

# Build the cleaned 'Neighborhood' column by running fix_NB over every row
nb_df['Neighborhood'] = nb_df.apply(fix_NB, axis=1)

# Check whether the fix has worked: count rows still carrying the placeholder
remaining_unassigned = (nb_df['Neighborhood'] == 'Not assigned').sum()
print("Not assigned Neighborhood count = {}".format(remaining_unassigned))

Not assigned Neighborhood count = 0


In [8]:
# We don't need the old Neighbourhood column anymore.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column has already been
# removed no longer raises a KeyError.
nb_df = nb_df.drop(columns='Neighbourhood', errors='ignore')

# Check the final outcome
nb_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Now we can build the geographical dataset and merge the 2 dataframes

In [9]:
# Load the geo data (one latitude/longitude pair per postal code) from the
# hosted CSV file into a dataframe
geo_csv_url = 'http://cocl.us/Geospatial_data'
ll_df = pd.read_csv(geo_csv_url)
ll_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Give the key column the same name in both frames ('Postal Code' -> 'PostalCode')
postal_code_alias = {"Postal Code": "PostalCode"}
ll_df.rename(columns=postal_code_alias, inplace=True)

# Join the neighbourhood table with the coordinates on the shared postal code.
# NOTE(review): how='outer' keeps postal codes that appear in only one of the
# two frames, filling the other frame's columns with NaN -- confirm this is
# intended rather than how='left'.
nbll_df = nb_df.merge(ll_df, how='outer', on='PostalCode')
nbll_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
