## Capstone Project Assignment - Part 1

### Download data from wiki and assign to pandas dataframe using Pandas read_html function

In [1]:
import pandas as pd
import numpy as np
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Shape of Actual Dataframe

In [2]:
print("Actual shape of Frame",df.shape)

Actual shape of Frame (180, 3)


###  Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned

In [3]:
df.drop(df[df['Borough'] == 'Not assigned'].index, inplace = True) 

print("Shape of Frame after filtering ",df.shape)

Shape of Frame after filtering  (103, 3)


### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma

####  -- Dataframe Already have the neighbour combined per postal code

In [4]:
df[df["Postal Code"]=="M5A"]

Unnamed: 0,Postal Code,Borough,Neighbourhood
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Check if any Postal Code have duplicate entry in dataframe

In [5]:
df['Postal Code'].duplicated().any()

False

### If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough


In [6]:
df["Neighbourhood"].loc[df["Neighbourhood"]=="Not assigned"] = df["Borough"]

### Checking if any row exists with Neighbourhood = "Not assigned"

In [7]:
df[df["Neighbourhood"]=="Not assigned"].count()

Postal Code      0
Borough          0
Neighbourhood    0
dtype: int64

### Final shape of Dataframe after Cleanup

In [8]:
print("Final shape",df.shape)

Final shape (103, 3)


### Reset Index of Dataframe after filtering

In [9]:
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [25]:
pip install geocoder

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 7.2 MB/s  eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Note: you may need to restart the kernel to use updated packages.


In [18]:
df[df["Postal Code"]=="M5G"]

Unnamed: 0,Postal Code,Borough,Neighbourhood
24,M5G,Downtown Toronto,Central Bay Street


In [16]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
print ("latitude:", latitude)
print ("longitude:", longitude)

KeyboardInterrupt: 

## As geocoder package not returning latitude and longitude for a given postal code, using the csv link to create dataframe.

In [21]:
import requests
import io
datastr = requests.get("http://cocl.us/Geospatial_data",allow_redirects=True).text
data_file = io.StringIO(datastr)
geo=pd.read_csv(data_file)
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Creating combined Dataframe using df and geo 

In [23]:
df_geo=pd.merge(df, geo, on='Postal Code')
df_geo.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
