# Capstone Project File

I am not importing the libraries here to keep track of the purpose of various libraries, will import them based on the need

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
import requests # To request the page text 

html_file = requests.get(url).text

In [4]:
#Using BeautifulSoup to parse this file

from bs4 import BeautifulSoup

In [5]:
soup = BeautifulSoup(html_file,'lxml')

## Finding the table in html file

In [6]:
html_table = soup.find('tbody')
features = []
total_features = html_table.find_all('th')
for feature in total_features:
    features.append(feature.text.strip('\n')) 
    
features

['Postcode', 'Borough', 'Neighbourhood']

In [7]:
# Now we have list of features in the data, lets create  a data frame

import pandas as pd
df = pd.DataFrame(columns=features)
df

Unnamed: 0,Postcode,Borough,Neighbourhood


In [8]:
html_table_allrows = html_table.find_all('tr')

for r_index, rows in enumerate(html_table_allrows):
    columns = rows.find_all('td') #Finding the total data entries in a single row
    for col,feat in zip(columns,features):
        df.loc[r_index,feat] = col.text.strip('\n')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


## Cleaning the data


In [9]:
#Ignore cells with a Borough that is Not assigned.

df1=df[df.Borough != 'Not assigned']
df1.reset_index(inplace=True)
df1.drop('index',axis=1,inplace=True)
df1.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [10]:
# Combining the Neighbourhoods having same Postcode
df_clean = df1.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()
df_clean.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


##  Assigning Borough where neighbourhood is not assigned

In [11]:
t=(df_clean.loc[df_clean['Neighbourhood']=='Not assigned']).index.values
for ii in t:
    df_clean.replace(to_replace=df_clean.iloc[ii]['Neighbourhood'], value=df_clean.iloc[ii]['Borough'],inplace=True)
df_clean.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
print(df_clean.shape)

(103, 3)


## Getting the Latitude and Longitude using the given file as geocoder seems to not working propoerly

In [13]:
df_latlong = pd.read_csv('Geospatial_Coordinates.csv')
df_latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
df_latlong.rename(columns={'Postal Code':'Postcode'},inplace=True)
df_latlong.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
df_latlong.shape  # Just to make sure the entries in the csv file

(103, 3)

In [16]:
df_final = pd.merge(df_clean,df_latlong,on='Postcode')
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Lets see how many postcodes are there in each Borough

In [17]:
df_final.groupby(['Borough']).count().Postcode

Borough
Central Toronto      9
Downtown Toronto    18
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Queen's Park         1
Scarborough         17
West Toronto         6
York                 5
Name: Postcode, dtype: int64

### North York has the maximum Post codes, lets select North York as a central point and analyze its surroundings 

In [20]:
# Finding the coordinates of the North York

!conda install -c conda-forge geopy --yes # Installing geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

central_point = 'North York, Ontario, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(central_point)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

The geograpical coordinate of North York are 43.7708175, -79.4132998.


In [21]:
#Creating a data frame containing only North York data
df_NY = df_final[df_final['Borough'] == 'North York'].reset_index().drop('index',axis=1)
df_NY.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"Silver Hills,York Mills",43.75749,-79.374714
4,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493


In [22]:
# Lets show the points on the map
!conda install -c conda-forge folium=0.5.0 --yes
import folium 

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [23]:
map_NY = folium.Map(location=[latitude,longitude],zoom_start=11)
for lat, lng, label in zip(df_NY['Latitude'], df_NY['Longitude'], df_NY['Postcode']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NY)
    
map_NY

## Defining FourSquare Credentials

In [24]:
# @hidden_cell
CLIENT_ID = '3ZOLFC23B0ORXRPPPQ2LLXZFJFNRGD1VB2VSQR02VIAXBYIZ' # your Foursquare ID
CLIENT_SECRET = 'YXQQI3I3FLOCI3UZQ5ME0G1KDDENJUYHFTI5D5JISLWSV4YD' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3ZOLFC23B0ORXRPPPQ2LLXZFJFNRGD1VB2VSQR02VIAXBYIZ
CLIENT_SECRET:YXQQI3I3FLOCI3UZQ5ME0G1KDDENJUYHFTI5D5JISLWSV4YD


In [25]:
neighborhood_latitude = df_NY.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_NY.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_NY.loc[0, 'Postcode'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of M2H are 43.8037622, -79.3634517.


## Now lets find the top 100 venues within M2H postcode

In [26]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=3ZOLFC23B0ORXRPPPQ2LLXZFJFNRGD1VB2VSQR02VIAXBYIZ&client_secret=YXQQI3I3FLOCI3UZQ5ME0G1KDDENJUYHFTI5D5JISLWSV4YD&v=20180605&ll=43.8037622,-79.3634517&radius=500&limit=100'

In [27]:
import requests # library to request the data
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d97201ea87921002cf76c6e'},
 'response': {'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.808262204500004,
    'lng': -79.3572281853783},
   'sw': {'lat': 43.7992621955, 'lng': -79.3696752146217}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad9dce6f964a520651b21e3',
       'name': "Eagle's Nest Golf Club",
       'location': {'address': '10000 Dufferin Rd',
        'lat': 43.805454826002794,
        'lng': -79.36418592243415,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.805454826002794,
          'lng': -79.36418592243415}],
        'distance': 197,
        'cc': 'CA',
        'city': 'Toronto

In [28]:
venues = results['response']['groups'][0]['items']
venues

[{'reasons': {'count': 0,
   'items': [{'summary': 'This spot is popular',
     'type': 'general',
     'reasonName': 'globalInteractionReason'}]},
  'venue': {'id': '4ad9dce6f964a520651b21e3',
   'name': "Eagle's Nest Golf Club",
   'location': {'address': '10000 Dufferin Rd',
    'lat': 43.805454826002794,
    'lng': -79.36418592243415,
    'labeledLatLngs': [{'label': 'display',
      'lat': 43.805454826002794,
      'lng': -79.36418592243415}],
    'distance': 197,
    'cc': 'CA',
    'city': 'Toronto',
    'state': 'ON',
    'country': 'Canada',
    'formattedAddress': ['10000 Dufferin Rd', 'Toronto ON', 'Canada']},
   'categories': [{'id': '4bf58dd8d48988d1e6941735',
     'name': 'Golf Course',
     'pluralName': 'Golf Courses',
     'shortName': 'Golf Course',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/golfcourse_',
      'suffix': '.png'},
     'primary': True}],
   'photos': {'count': 0, 'groups': []}},
  'referralId': 'e-0-4ad9dce6f964a5206

In [29]:
import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [30]:
nearby_venues = json_normalize(venues) # flatten JSON
nearby_venues

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,venue.location.distance,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.state,venue.name,venue.photos.count,venue.photos.groups
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4ad9dce6f964a520651b21e3-0,"[{'id': '4bf58dd8d48988d1e6941735', 'name': 'G...",4ad9dce6f964a520651b21e3,10000 Dufferin Rd,CA,Toronto,Canada,,197,"[10000 Dufferin Rd, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.80545482600279...",43.805455,-79.364186,ON,Eagle's Nest Golf Club,0,[]
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4e1fad69fa761d67106a9989-1,"[{'id': '4bf58dd8d48988d15e941735', 'name': 'P...",4e1fad69fa761d67106a9989,50 Francine drive,CA,North York,Canada,Leslie,231,"[50 Francine drive (Leslie), North York ON, Ca...","[{'label': 'display', 'lat': 43.80451541369957...",43.804515,-79.366138,ON,AY Jackson Pool,0,[]
2,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-5331a799498e3bfbae4a5a04-2,"[{'id': '4bf58dd8d48988d1c0941735', 'name': 'M...",5331a799498e3bfbae4a5a04,,CA,Toronto,Canada,,234,"[Toronto ON, Canada]","[{'label': 'display', 'lat': 43.80168506227364...",43.801685,-79.363938,ON,Villa Madina,0,[]
3,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4c64ca1b772bef3bfadbc2aa-3,"[{'id': '4bf58dd8d48988d1e5941735', 'name': 'D...",4c64ca1b772bef3bfadbc2aa,Aspenwood Drive,CA,Toronto,Canada,btwn Don Mills & Leslie,296,"[Aspenwood Drive (btwn Don Mills & Leslie), To...","[{'label': 'display', 'lat': 43.80553928816196...",43.805539,-79.360695,ON,Duncan Creek Park,0,[]


In [31]:
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

In [32]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [33]:
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Eagle's Nest Golf Club,Golf Course,43.805455,-79.364186
1,AY Jackson Pool,Pool,43.804515,-79.366138
2,Villa Madina,Mediterranean Restaurant,43.801685,-79.363938
3,Duncan Creek Park,Dog Run,43.805539,-79.360695


In [34]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [35]:
NY_venues = getNearbyVenues(names=df_NY['Postcode'],
                                   latitudes=df_NY['Latitude'],
                                   longitudes=df_NY['Longitude']
                                  )

M2H
M2J
M2K
M2L
M2M
M2N
M2P
M2R
M3A
M3B
M3C
M3H
M3J
M3K
M3L
M3M
M3N
M4A
M5M
M6A
M6B
M6L
M9L
M9M


In [36]:
NY_venues.head()

Unnamed: 0,Postcode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M2H,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
1,M2H,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
2,M2H,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
3,M2H,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run
4,M2J,43.778517,-79.346556,The LEGO Store,43.778207,-79.343483,Toy / Game Store


In [37]:
NY_venues.groupby('Postcode').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M2H,4,4,4,4,4,4
M2J,63,63,63,63,63,63
M2K,4,4,4,4,4,4
M2M,1,1,1,1,1,1
M2N,33,33,33,33,33,33
M2P,4,4,4,4,4,4
M2R,5,5,5,5,5,5
M3A,2,2,2,2,2,2
M3B,4,4,4,4,4,4
M3C,22,22,22,22,22,22


In [38]:
print('There are {} uniques categories.'.format(len(NY_venues['Venue Category'].unique())))

There are 103 uniques categories.


In [39]:
# one hot encoding
NY_onehot = pd.get_dummies(NY_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
NY_onehot['Postcode'] = NY_venues['Postcode'] 

# move neighborhood column to the first column
fixed_columns = [NY_onehot.columns[-1]] + list(NY_onehot.columns[:-1])
NY_onehot = NY_onehot[fixed_columns]

NY_onehot.head()

Unnamed: 0,Postcode,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,M2H,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M2H,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M2H,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M2H,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M2J,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [40]:
NY_grouped = NY_onehot.groupby('Postcode').mean().reset_index()
NY_grouped

Unnamed: 0,Postcode,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,M2H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M2J,0.0,0.0,0.015873,0.0,0.031746,0.031746,0.015873,0.0,0.015873,...,0.0,0.015873,0.0,0.015873,0.031746,0.015873,0.0,0.0,0.015873,0.031746
2,M2K,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M2N,0.0,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,...,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.0
5,M2P,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M2R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M3A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M3B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M3C,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
num_top_venues = 5

for hood in NY_grouped['Postcode']:
    print("----"+hood+"----")
    temp = NY_grouped[NY_grouped['Postcode'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M2H----
                      venue  freq
0               Golf Course  0.25
1                      Pool  0.25
2  Mediterranean Restaurant  0.25
3                   Dog Run  0.25
4                  Pharmacy  0.00


----M2J----
                  venue  freq
0        Clothing Store  0.14
1  Fast Food Restaurant  0.08
2           Coffee Shop  0.08
3         Women's Store  0.03
4                Bakery  0.03


----M2K----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Bank  0.25
2                 Café  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----M2M----
               venue  freq
0                Gym   1.0
1  Accessories Store   0.0
2          Juice Bar   0.0
3          Pet Store   0.0
4               Park   0.0


----M2N----
              venue  freq
0       Coffee Shop  0.09
1  Ramen Restaurant  0.09
2       Pizza Place  0.06
3    Sandwich Place  0.06
4              Café  0.06


----M2P----
               venue  freq
0              