In [1]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
!pip install geocoder



In [3]:
!pip install folium



In [4]:
from geopy.geocoders import Nominatim

In [5]:
import folium

In [6]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [7]:
# Reading the page into a object
wikipage = urllib.request.urlopen(url)

In [8]:
# Read page using Beautiful Soup
wikipage_soup = BeautifulSoup(wikipage, "lxml")

In [9]:
# Importing all avaibale tables from the page into a variable
right_table=wikipage_soup.find('table', class_='wikitable sortable')
right_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park, Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor, Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park, Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue, Humber Valley Village
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern, Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3B
</td>
<td>

In [10]:
# The next step would be to retrive column values from the above table data
# We will ignore the headings and only take the actual row values
A=[]
B=[]
C=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        if cells[1].text.strip()!= "Not assigned": #this is done to only pasrse values for which Borough is Not assigned
            A.append(cells[0].text.strip())
            B.append(cells[1].text.strip())
            C.append(cells[2].text.strip())

In [11]:
# converting above data into a dataframe using Pandas
torronto_data=pd.DataFrame(A,columns=['Postal Code'])
torronto_data['Borough']=B
torronto_data['Neighborhood']=C
torronto_data

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [12]:
# shape of the dataset
torronto_data.shape

(103, 3)

In [13]:
# analysing values based on Borough
torronto_data['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
East Toronto         5
York                 5
Mississauga          1
Name: Borough, dtype: int64

In [14]:
# furthermore grouping data using borough and analysing the results
torronto_grouped = torronto_data.groupby('Borough').count()
torronto_grouped.head()

Unnamed: 0_level_0,Postal Code,Neighborhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,9,9
Downtown Toronto,19,19
East Toronto,5,5
East York,5,5
Etobicoke,12,12


In [15]:
geolocator = Nominatim(user_agent="segment_clustering")

In [16]:
torronto_data['Latitude'] = ""
torronto_data['Longitude'] = ""

In [17]:
i = 0
for postal_code in torronto_data['Postal Code']:
#     print(postal_code)
    location = geolocator.geocode('{}, Toronto, Ontario'.format(postal_code))
    if(location):
        torronto_data.at[0,'Latitude'] = location.latitude
        torronto_data.at[0,'Longitude'] = location.longitude
    i = i+1

In [18]:
torronto_data.notna()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,True,True,True,True,True
...,...,...,...,...,...
98,True,True,True,True,True
99,True,True,True,True,True
100,True,True,True,True,True
101,True,True,True,True,True


In [19]:
#As most of the above values resulted into NaN for Latitude and Longitude
# we will be utilising the given excel to fill this data

csv_url = "https://cocl.us/Geospatial_data"
data = pd.read_csv(csv_url)

In [20]:
data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
torronto_merged = torronto_data.drop(["Latitude", "Longitude"], axis=1)

In [22]:
torronto_merged = torronto_merged.join(data.set_index('Postal Code'), on='Postal Code')
torronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [23]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(torronto_merged['Borough'].unique()),
        torronto_merged.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [24]:
#Taking out the data from Downtown Toronto for analysis
dt_data = torronto_merged[torronto_merged['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
dt_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [25]:
#taking geo location of Downtown Toronto
address = 'Downtown Toronto, Ontario'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6563221, -79.3809161.


In [26]:
map_dt = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, label in zip(dt_data['Latitude'], dt_data['Longitude'], dt_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dt)  
    
map_dt

In [27]:
#getting credentials for Foursquare

CLIENT_ID = 'SJS2KZSN4VLUN2CC0VHJDEUPHYMD55VDC3EJ3OKZHLTI0XAW' # your Foursquare ID
CLIENT_SECRET = '0J5HHRXS3BFOQC4PBKDBYNQRVMM2FHLCHODF4FJOCJE2ZHS1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: SJS2KZSN4VLUN2CC0VHJDEUPHYMD55VDC3EJ3OKZHLTI0XAW
CLIENT_SECRET:0J5HHRXS3BFOQC4PBKDBYNQRVMM2FHLCHODF4FJOCJE2ZHS1


In [28]:
import requests # library to handle requests
from pandas.io.json import json_normalize

In [29]:
neighborhood_latitude = dt_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = dt_data.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = dt_data.loc[0, 'Neighborhood'] # neighborhood name

In [30]:
radius =500
LIMIT =100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)

In [31]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5eef8b64618f43001b03dc52'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 45,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

In [32]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [33]:
venues = results['response']['groups'][0]['items']    
nearby_venues = json_normalize(venues) # flatten JSON
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  


Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Dominion Pub and Kitchen,Pub,43.656919,-79.358967


In [34]:
#As done in the lab, repeating the same process for all boroughs

def getNearvyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, long in zip(names,latitudes,longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, lat, long, VERSION, radius, LIMIT)        
        results = requests.get(url).json()['response']['groups'][0]['items']
    
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)
        

In [35]:
#We will be analysing the data for Boroughs having the word Torront
dt_data = torronto_merged
dt_torr = dt_data[dt_data['Borough'].str.contains("Toronto", regex=True)]

torronto_venues = getNearvyVenues(names=dt_torr['Borough'],
                                   latitudes=dt_torr['Latitude'],
                                   longitudes=dt_torr['Longitude']
                                  )

In [36]:
torronto_venues.head()

Unnamed: 0,Borough,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Downtown Toronto,43.65426,-79.38316,Roselle Desserts,43.653447,-79.362017,Bakery
1,Downtown Toronto,43.65426,-79.38316,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Downtown Toronto,43.65426,-79.38316,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,Downtown Toronto,43.65426,-79.38316,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Downtown Toronto,43.65426,-79.38316,Dominion Pub and Kitchen,43.656919,-79.358967,Pub


In [37]:
#Grouping data using venue category
torronto_venues.groupby("Venue Category").count()

#Found that there are a total of 236 unique venue categories

Unnamed: 0_level_0,Borough,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghan Restaurant,1,1,1,1,1,1
Airport,1,1,1,1,1,1
Airport Food Court,1,1,1,1,1,1
Airport Gate,1,1,1,1,1,1
Airport Lounge,2,2,2,2,2,2
...,...,...,...,...,...,...
Vietnamese Restaurant,6,6,6,6,6,6
Wine Bar,10,10,10,10,10,10
Wine Shop,2,2,2,2,2,2
Women's Store,1,1,1,1,1,1


In [38]:
#getting data as onehot encoded based on venue_category
torr_onehot = pd.get_dummies(torronto_venues[['Venue Category']], prefix="", prefix_sep="")
torr_onehot['Borough'] = torronto_venues['Borough'] 
fixed_columns = [torr_onehot.columns[-1]] + list(torr_onehot.columns[:-1])
torr_onehot = torr_onehot[fixed_columns]
torr_onehot.head()

Unnamed: 0,Borough,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
#We find the occurences of each category in a Borough
torr_grouped = torr_onehot.groupby('Borough').sum().reset_index()
torr_grouped

Unnamed: 0,Borough,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Central Toronto,0,0,0,0,0,0,0,2,0,...,1,2,0,1,0,1,0,0,0,1
1,Downtown Toronto,1,1,1,1,2,2,1,17,2,...,0,1,3,13,2,4,8,1,1,7
2,East Toronto,0,0,0,0,0,0,0,3,0,...,0,2,0,0,0,0,1,0,0,2
3,West Toronto,0,0,0,0,0,0,0,0,1,...,0,0,0,3,0,1,1,1,0,2


In [71]:
torr_grouped.set_index("Borough", inplace=True)

In [102]:
#Analysing which category has the max value for each borough
torr_max_borough = pd.DataFrame(torr_grouped.idxmax(axis=1))

In [103]:
torr_max_borough.reset_index(inplace=True)

In [104]:
#These are the most occuring spots in each Borough
torr_max_borough.rename(columns={0:"Popular Spot"})

Unnamed: 0,Borough,Popular Spot
0,Central Toronto,Coffee Shop
1,Downtown Toronto,Coffee Shop
2,East Toronto,Greek Restaurant
3,West Toronto,Café
