### Installing required packages

In [6]:
!pip install bs4
!pip install requests
!pip install html5lib
!pip install folium

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 15.4MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.

### Loading required packages

In [7]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans

<h1><font color = 'green'>Section 1: Obtain dataframe from Wikipedia page</font></h1>

In [8]:
#Link of the Wikipedia page listing Toronto postal codes
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the postal-code table.
webpage = requests.get(URL, timeout=30)
webpage.raise_for_status()

In [9]:
#Convert page to BeautifulSoup object and extract the table
soup = BeautifulSoup(webpage.content, 'html5lib')

# find() returns the first <tbody> in the document, or None when there is none.
table = soup.find('tbody')

# Fail here with a clear message rather than with an AttributeError on None
# when the rows are read later.
if table is None:
    raise ValueError('No <tbody> element found in the downloaded page')

In [10]:
#Load table data to a dataframe
postalArr = []
boroughArr = []
hoodArr = []

for ind, row in enumerate(table.find_all('tr')):
    # The first row is the table header.
    if ind == 0:
        continue

    # Only the row's direct cell children are wanted. A recursive search would
    # also return tags nested inside a cell (e.g. links) and shift the indices.
    cells = row.find_all(['td', 'th'], recursive=False)
    if len(cells) < 3:
        # Not a (postal code, borough, neighborhood) row - nothing to read.
        continue

    # strip() removes the surrounding whitespace/newline without cutting off a
    # real character when the cell text has no trailing newline.
    postal, borough, hood = (cell.get_text().strip() for cell in cells[:3])
    postalArr.append(postal)
    boroughArr.append(borough)
    hoodArr.append(hood)

torontoDict = {'Postal_Code':postalArr, 'Borough':boroughArr, 'Neighborhood':hoodArr}
df = pd.DataFrame(torontoDict)

#Dataframe from wikipedia Table is created

<h1><font color = 'red'>Problem 1: Dataframe from Wikipedia page after cleaning</font></h1>

In [30]:
#Clean the dataframe
# Keep only the rows whose borough is assigned, then order them by postal code
# and renumber the index from 0.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].sort_values('Postal_Code').reset_index(drop=True)

display(df)
print('Shape of dataframe is : {}'.format(df.shape))

#Dataframe is cleaned and sorted

Unnamed: 0,Postal_Code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


Shape of dataframe is : (103, 3)


<h1><font color = 'green'>Section 2: Get latitude and longitude data</font></h1>

In [12]:
#This dataframe splits all the Neighborhood
# Each '/'-separated neighborhood of a postal code becomes its own row.

split_columns = ['Postal_Code', 'Borough', 'Neighborhood']
split_rows = []

# Collect plain records first and build the frame once: appending to a
# DataFrame inside a loop copies the whole frame on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
for postal_code, borough, hoodlist in zip(df['Postal_Code'], df['Borough'], df['Neighborhood']):
    for hood in hoodlist.split('/'):
        split_rows.append({'Postal_Code': postal_code, 'Borough': borough, 'Neighborhood': hood.strip()})

# Passing columns explicitly keeps the three columns even when there are no rows.
df_new = pd.DataFrame(split_rows, columns=split_columns)

#Dataframe df_new is created
        

In [13]:
#NOT USED DUE TO A LOT OF ERRORS IN RETRIEVING POSTAL CODE LATITUDE AND LONGITUDE DATA

def getLocation(postal_code):
    """Geocode a Toronto postal code with Nominatim.

    The query sent to the geocoder is '<postal_code>, Toronto, Ontario'.

    Returns a (latitude, longitude) tuple, or (None, None) when the
    geocoder returns no match.
    """
    query = postal_code+', Toronto, Ontario'
    match = Nominatim(user_agent="ny_explorer").geocode(query)

    # No match: report both coordinates as missing.
    if match is None:
        return None, None

    return match.latitude, match.longitude


In [14]:
#Load Latitude and Longitude Values from online CSV file

# Read the CSV, rename 'Postal Code' to the 'Postal_Code' spelling used by the
# other frames, and order the rows by postal code with a fresh 0-based index.
df_ll = (
    pd.read_csv('https://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code':'Postal_Code'})
    .sort_values('Postal_Code')
    .reset_index(drop=True)
)

#Data loaded to dataframe df_ll

<h1><font color = 'red'>Problem 2: Dataframe with Latitude and Longitude Values</font></h1>

In [29]:
#Combine the two dataframes
# Inner join on the postal code: only codes present in both frames are kept.
df2 = df.merge(df_ll, on='Postal_Code', how='inner')
df2

Unnamed: 0,Postal_Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848


<h1><font color = 'green'>Section 3: Create Clusters</font></h1>

In [16]:
#Get Latitude and longitude values of Toronto
torontoStr = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="t_explorer")

# geocode() returns None when the service finds no match; stop with a clear
# message instead of an AttributeError on None below.
toronto_match = geolocator.geocode(torontoStr)
if toronto_match is None:
    raise ValueError('Geocoder returned no result for {!r}'.format(torontoStr))

# [latitude, longitude] pair, used as the centre of the maps drawn later.
locationToronto = [toronto_match.latitude, toronto_match.longitude]
print(locationToronto)

[43.6534817, -79.3839347]


In [17]:
#Create map of toronto and mark the postal codes along with 500m radius around them

# folium.Circle takes its radius in metres. The circles previously used 1000,
# which contradicted the 500 m stated above and the 500 m venue search radius
# used further down in the notebook.
AREA_RADIUS_M = 500

map_Toronto = folium.Map(location= locationToronto, zoom_start=10)

# First pass: a translucent circle around every postal-code coordinate.
for lat, long in zip(df2['Latitude'], df2['Longitude']):

    folium.Circle([lat, long],
                  radius=AREA_RADIUS_M,
                  color = '#ffd633',
                  opacity = 0.2,
                  fill_color = '#ffd633',
                  fill_opacity = 0.2,
                   ).add_to(map_Toronto)

# Second pass: a solid marker with a popup label per postal code. The markers
# are added after all circles so that they are not added underneath them.
for pc, bor, neigh, lat, long in zip(df2['Postal_Code'], df2['Borough'], df2['Neighborhood'], df2['Latitude'], df2['Longitude']):
    label = '{}, {}, {}'.format(pc, bor, neigh)

    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color = '#ff3333',
        fill_color = '#ff3333',
        fill_opacity = 1,
        parse_html=False).add_to(map_Toronto)

map_Toronto

In [None]:
#Credentials to access Foursquare API (Hidden)

In [33]:
# The code was removed by Watson Studio for sharing.

In [19]:
#Function to get URL for each postal code (explore)

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # Radius of Search

def getURL(lat, long, search_radius=None, limit=None):
    """Build the Foursquare 'venues/explore' request URL for one coordinate.

    Parameters
    ----------
    lat, long : latitude and longitude placed in the 'll' query parameter.
    search_radius : optional value for the 'radius' query parameter. When
        omitted, the module-level ``radius`` is read at call time.
    limit : optional value for the 'limit' query parameter. When omitted,
        the module-level ``LIMIT`` is read at call time.

    Returns
    -------
    str : the request URL. CLIENT_ID, CLIENT_SECRET and VERSION are read
        from module-level names that must be defined before calling.
    """
    # Resolve the defaults at call time, so that changing the module-level
    # values after this definition still takes effect.
    if search_radius is None:
        search_radius = radius
    if limit is None:
        limit = LIMIT

    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        search_radius,
        limit)
    return url

In [20]:
#Create a new dataframe to store the venue data along with the previous data
df3Columns = list(df2.columns)
addColumns = ['Venue', 'VLatitude', 'VLongitude', 'Category']
df3Columns.extend(addColumns)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: appending to a DataFrame per venue copies the whole frame every time,
# and DataFrame.append no longer exists in pandas 2.x.
venue_rows = []
skipped_postal_codes = []

#Loop over each Postal Code
for pc, bor, neigh, lat, long in zip(df2['Postal_Code'], df2['Borough'], df2['Neighborhood'], df2['Latitude'], df2['Longitude']):

    # Best effort: a postal code whose request fails or whose response lacks
    # the expected structure is skipped, but only for the errors those cases
    # raise - a bare except would also swallow KeyboardInterrupt.
    try:
        url = getURL(lat,long)
        results = requests.get(url, timeout=30).json()
        itemsDict = results['response']['groups'][0]['items']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        skipped_postal_codes.append(pc)
        continue

    #Loop over every venue in each postal code
    for item in itemsDict:
        # A venue missing any of the expected fields is skipped.
        try:
            venueName = item['venue']['name']
            vLat = float(item['venue']['location']['lat'])
            vLong = float(item['venue']['location']['lng'])
            category = item['venue']['categories'][0]['name']
        except (KeyError, IndexError, TypeError, ValueError):
            continue

        venue_rows.append([pc, bor, neigh, lat, long, venueName, vLat, vLong, category])

df3 = pd.DataFrame(venue_rows, columns = df3Columns)

# Make skipped postal codes visible instead of dropping them silently.
if skipped_postal_codes:
    print('No venue data retrieved for: {}'.format(', '.join(str(code) for code in skipped_postal_codes)))

In [21]:
# Render the venue table, then print its (rows, columns) shape.
display(df3)
print('Shape of the dataframe is : {}'.format(df3.shape))

Unnamed: 0,Postal_Code,Borough,Neighborhood,Latitude,Longitude,Venue,VLatitude,VLongitude,Category
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353,T Hamilton & Son Roofing Inc,43.807985,-79.198194,Construction & Landscaping
2,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
4,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,SEBS Engineering Inc. (Sustainable Energy and ...,43.782371,-79.156820,Construction & Landscaping
5,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
6,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711,Big Bite Burrito,43.766299,-79.190720,Mexican Restaurant
7,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711,Enterprise Rent-A-Car,43.764076,-79.193406,Rental Car Location
8,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711,RBC Royal Bank,43.766790,-79.191151,Bank
9,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711,Woburn Medical Centre,43.766631,-79.192286,Medical Center


Shape of the dataframe is : (2192, 9)


In [22]:
#Display all venue categories
# Count the distinct values of the Category column (a missing value, if any,
# is counted as one value).
total_categories = df3['Category'].nunique(dropna=False)
print('Total Venue categories are : {}'.format(total_categories))

Total Venue categories are : 270


In [23]:
#Create new dataframe of the postal codes and the venue categories
# One indicator column per venue category; the empty prefix and separator keep
# the bare category name as the column name.
category_flags = pd.get_dummies(df3[['Category']], prefix="", prefix_sep="")

# Averaging the indicators within a postal code gives, per category, the share
# of that postal code's venues. reset_index() puts 'Postal_Code' back as the
# first column.
torontoVenues = category_flags.groupby(df3['Postal_Code']).mean().reset_index()

In [24]:
# Show the first rows of the per-postal-code category table and print its shape.
display(torontoVenues.head())
print('The shape of this dataframe is {}'.format(torontoVenues.shape))

Unnamed: 0,Postal_Code,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The shape of this dataframe is (99, 271)


In [25]:
#Model to create clusters

kclusters = 6

# The features are the per-category columns; the postal-code identifier is
# left out of the clustering input.
df_clustering = torontoVenues.drop(columns=['Postal_Code'])

# random_state is fixed so that repeated runs give the same labels.
kMeansModel = KMeans(n_clusters = kclusters, random_state = 0).fit(df_clustering)

# One label per row of df_clustering, in row order.
clustered_arr = kMeansModel.labels_

<h1><font color = 'red'>Problem 3: Cluster Dataframe and Map</font></h1>

In [26]:
#Final dataframe showing which postal code belongs to which cluster

# One row per postal code with the mean of its Latitude / Longitude values.
postal_coords = df3.groupby('Postal_Code')[['Latitude', 'Longitude']].mean()

# The labels are attached by position, so this relies on the rows being in the
# same postal-code order as the frame the model was fitted on.
final_df = postal_coords.reset_index().assign(Clusters=clustered_arr)
display(final_df)

Unnamed: 0,Postal_Code,Latitude,Longitude,Clusters
0,M1B,43.806686,-79.194353,2
1,M1C,43.784535,-79.160497,0
2,M1E,43.763573,-79.188711,0
3,M1G,43.770992,-79.216917,0
4,M1H,43.773136,-79.239476,0
5,M1J,43.744734,-79.239476,0
6,M1K,43.727929,-79.262029,0
7,M1L,43.711112,-79.284577,0
8,M1M,43.716316,-79.239476,0
9,M1N,43.692657,-79.264848,0


In [27]:
#Show the different clusters on map

clusters_map = folium.Map(location = locationToronto, zoom_start = 10)

# One colour per cluster label; the label is used as the index into this list.
colors = ['#ff4d4d', '#4da6ff', '#5cd65c', '#ff66ff', '#ffc34d', '#333333']

for entry in final_df.itertuples(index=False):
    marker_color = colors[entry.Clusters]
    label = 'Cluster:{}, {}'.format(entry.Clusters, entry.Postal_Code)

    marker = folium.CircleMarker(
        [entry.Latitude, entry.Longitude],
        radius=5,
        popup=label,
        color = marker_color,
        fill_color = marker_color,
        fill_opacity = 1,
        parse_html=False)
    marker.add_to(clusters_map)

clusters_map

In [28]:
#The number of rows in final dataframe
print('The number of rows in final dataframe is {}'.format(len(final_df)))

The number of rows in final dataframe is 99
