# Analyzing Neighborhoods in Toronto

In [99]:
#!conda install lxml --yes

In [100]:
#!conda install html5lib --yes

In [101]:
#!conda install BeautifulSoup4  --yes

In [12]:
import pandas as pd
import numpy as np

### Getting the dataframe from Wikipedia

In [58]:
# Pull every table tagged with the "wikitable" CSS class from the Wikipedia
# page named in the URL below. The result is indexed as a list of tables
# by the following cell.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
df = pd.read_html(wiki_url, attrs={"class": "wikitable"})

In [59]:
# Keep only the first matched table as the working DataFrame and preview it.
first_table = df[0]
df = pd.DataFrame(first_table)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Filtering the dataframe

In [60]:
# Drop postcodes that have no borough assigned.
# The boolean mask gets its own name instead of temporarily living in
# `df_filtered`, and `.copy()` makes the result an independent frame so that
# later column edits on it never write through to (or warn about) `df`.
has_borough = df['Borough'] != 'Not assigned'
df_filtered = df[has_borough].copy()
df_filtered.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [61]:
# Neighbourhoods marked 'Not assigned' fall back to the borough name.
# Series.fillna returns a new Series rather than modifying its caller, so the
# filled column must be written back into the frame; `assign` does that
# without a chained in-place edit on a column selection.
neighbourhood_or_borough = (
    df_filtered['Neighbourhood']
    .replace('Not assigned', np.nan)
    .fillna(df_filtered['Borough'])
)
df_filtered = df_filtered.assign(Neighbourhood=neighbourhood_or_borough)
df_filtered['Neighbourhood']

2                     Parkwoods
3              Victoria Village
4                  Harbourfront
5              Lawrence Heights
6                Lawrence Manor
                 ...           
281    Kingsway Park South West
282                   Mimico NW
283          The Queensway West
284       Royal York South West
285              South of Bloor
Name: Neighbourhood, Length: 210, dtype: object

### Grouping the data so neighbourhoods that share the same postcode are in the same row

In [62]:
# Collapse rows sharing a (Postcode, Borough) pair into a single row whose
# Neighbourhood cell is a comma-separated string of all their neighbourhoods.
group_keys = ['Postcode', 'Borough']
df_grouped_by_postcode = (
    df_filtered
    .groupby(group_keys)['Neighbourhood']
    .apply(list)
)
# Turn the grouped Series into a frame and move the group keys back to columns.
df_grouped = df_grouped_by_postcode.to_frame().reset_index(level=group_keys, col_level=0)
df_grouped['Neighbourhood'] = df_grouped['Neighbourhood'].map(
    lambda names: ", ".join(str(name) for name in names)
)
df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [63]:
# (rows, columns) of the grouped frame: one row per (Postcode, Borough) group.
df_grouped.shape

(103, 3)

### Part 2: Getting latitude and longitude with CSV geospatial data

In [64]:
#!pip install wget

In [65]:
import wget

In [66]:
# Download the geospatial coordinates file and keep whatever wget.download
# returns in `file` (presumably the local path it saved to; confirm in the
# wget docs).
geo_url = 'http://cocl.us/Geospatial_data'
file = wget.download(geo_url)

  0% [                                                                                ]    0 / 2891100% [................................................................................] 2891 / 2891

In [84]:
# Load the coordinates from the path held in `file` (set by the download cell)
# instead of a hard-coded filename, so this cell still reads the downloaded
# data if wget saved it under a different name.
# The key column is renamed to 'Postcode' so it matches the grouped frame;
# doing it in the same expression (no `inplace`) keeps the cell idempotent.
df_geo = pd.read_csv(file).rename(columns={'Postal Code': 'Postcode'})
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [85]:
# Left join: keep every grouped postcode row and attach the matching columns
# from the coordinates table, joining on the shared 'Postcode' column.
df_final = df_grouped.merge(df_geo, on='Postcode', how='left')
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
