# Segmenting and Clustering Neighborhoods in Toronto  
### PART - 1 in brief. Scroll down for PART - 2

##### We keep the Part 1 section as short as possible  

In [30]:
#import libraries
import pandas as pd
import numpy as np

# Source page: Wikipedia's list of Canadian postal codes starting with "M" (Toronto)
wikilink="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns one dataframe per HTML table found on the page;
# the first table (position 0) is the one used here, with its first row as the header
wiki_tables = pd.read_html(wikilink, header=0)
raw_df0 = wiki_tables[0]

# preview the first 5 rows of the dataframe
raw_df0.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [31]:
# Map the scraped column labels onto the names used in the rest of the notebook
header_fixes = {'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}
raw_df0.rename(columns=header_fixes, inplace=True)

# show only the header row (no data rows) to confirm the new labels
raw_df0.head(0)

Unnamed: 0,PostalCode,Borough,Neighborhood


Now let's check the number of rows with the `shape` attribute, and count the 'Not assigned' entries in Borough and Neighborhood

In [32]:
# Report the table size and how many cells hold the 'Not assigned' placeholder.
# An exact equality test is used rather than str.count: str.count treats its
# argument as a regex and counts matches *inside* each cell, so it would also
# count substrings/repeats and yields NaN for missing cells.
print('total number of rows = ', raw_df0.shape[0])
print ('number of \'Not assigned\' in Borough = ', raw_df0['Borough'].eq('Not assigned').sum())
print ('number of \'Not assigned\' in Neighborhood = ', raw_df0['Neighborhood'].eq('Not assigned').sum())

total number of rows =  288
number of 'Not assigned' in Borough =  77
number of 'Not assigned' in Neighborhood =  78


Drop the 77 rows whose Borough is 'Not assigned'

In [33]:
# Index labels of every row whose Borough is still the 'Not assigned' placeholder
indexNames = raw_df0.loc[raw_df0['Borough'].eq('Not assigned')].index

# Remove those rows, then renumber the remaining rows from 0
# (resetting the index after deleting rows avoids gaps in the labels)
raw_df1 = raw_df0.drop(index=indexNames).reset_index(drop=True)

# how many rows are left in the dataframe?
len(raw_df1)

211

Perfect!   
Now find the row with Neighborhood as 'Not assigned' and assign the corresponding Borough name to the Neighborhood

In [34]:
# Display the rows whose Neighborhood is still the 'Not assigned' placeholder
raw_df1[raw_df1['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M7A,Queen's Park,Not assigned


In [35]:
# Rows still carrying the placeholder take their own Borough name as Neighborhood
unassigned = raw_df1['Neighborhood'] == 'Not assigned'
raw_df1.loc[unassigned, 'Neighborhood'] = raw_df1.loc[unassigned, 'Borough']

# confirm the result on postal code M7A
raw_df1[raw_df1['PostalCode'] == 'M7A']

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M7A,Queen's Park,Queen's Park


In [36]:
# Replace any remaining 'Not assigned' Neighborhood with that row's own Borough.
# The Borough column is used instead of a hard-coded "Queen's Park" string so a
# different unassigned row would not be given the wrong borough name, and an
# exact-match mask is used so names merely containing the phrase are untouched.
# Safe to re-run: when nothing is left unassigned the mask is empty.
still_unassigned = raw_df1['Neighborhood'].eq('Not assigned')
raw_df1.loc[still_unassigned, 'Neighborhood'] = raw_df1.loc[still_unassigned, 'Borough']

# check the row again
raw_df1.loc[raw_df1['PostalCode']=='M7A']

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M7A,Queen's Park,Queen's Park


In [18]:
# Collapse to one row per postal code: keep the first Borough seen and
# join all of that code's neighborhoods into a single comma-separated string.
# as_index=False keeps PostalCode as a regular column with a fresh 0..n-1 index.
per_code_rules = {'Borough': 'first', 'Neighborhood': ', '.join}
df = raw_df1.groupby('PostalCode', as_index=False).agg(per_code_rules)
df.head(2)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"


In [13]:
# report how many rows the cleaned dataframe contains
print("Cleaned dataframe 'df' has ", len(df), ' rows')

Cleaned dataframe 'df' has  103  rows


#### --end of part 1--

# PART - 2

Importing `geocoder` failed with an error,  
so we will work directly from the CSV file of geographic coordinates.  
#### Let's get the coordinates of our postal codes from the CSV file

In [47]:
# CSV of latitude/longitude per postal code
geolink = 'https://cocl.us/Geospatial_data'

# load it straight from the URL and preview the first 5 rows
geo_df = pd.read_csv(geolink)
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Standardize the header labels to match our 'df' dataframe

In [49]:
# Rename the key column so it matches the 'PostalCode' label used in 'df'
geo_header_fixes = {'Postal Code': 'PostalCode'}
geo_df.rename(columns=geo_header_fixes, inplace=True)

# show only the header row to confirm the new label
geo_df.head(0)

Unnamed: 0,PostalCode,Latitude,Longitude


#### Let's merge the two dataframes so that each postal code in our 'df' dataframe gets its Latitude and Longitude

In [50]:
# Attach Latitude/Longitude to each postal code.
# Inner join on the shared 'PostalCode' column: only codes present in both frames are kept.
df_coord = pd.merge(df, geo_df, on='PostalCode', how='inner')
df_coord.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Let's check the number of rows and columns

In [51]:
# shape is (rows, columns); unpack it for a readable report
n_rows, n_cols = df_coord.shape
print('This dataframe now has ', n_rows, 'rows and ', n_cols, 'columns')

This dataframe now has  103 rows and  5 columns


#### -- end of part 2 --