# Import Libraries

In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Parsing 

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook here instead of silently parsing an HTTP error page.
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()
source = wiki_response.text

# Parse the HTML with the lxml backend.
soup = BeautifulSoup(source, "lxml")


# Table on Wikipedia

In [3]:
# Take the first <table> element of the parsed page ...
table = soup.find('table')

# ... and flatten it into the list of all of its <td> cells.
col_value = table.find_all('td')

# Number of cells found; the parsing loop steps through them three at a time.
count_ele = len(col_value)


In [4]:
# One accumulator list per output column; each is an independent empty list.
postcode, borough, neighborhood = [], [], []


In [5]:
# Empty the three column lists first so that re-running this cell rebuilds them
# instead of appending a second copy of every row.
del postcode[:], borough[:], neighborhood[:]

# The cells are read as consecutive triples (postcode, borough, neighborhood).
# Only complete triples are consumed: a trailing partial row would otherwise
# raise IndexError half-way through and leave the lists with unequal lengths.
complete_cells = count_ele - count_ele % 3
for i in range(0, complete_cells, 3):
    postcode.append(col_value[i].text.strip())
    borough.append(col_value[i + 1].text.strip())
    neighborhood.append(col_value[i + 2].text.strip())

In [6]:
# Stack the three lists as labelled rows, then transpose so that each list
# becomes one named column of the resulting frame.
df_post = pd.DataFrame(
    [postcode, borough, neighborhood],
    index=['Postcode', 'Borough', 'Neighborhood'],
).T

In [7]:
# Preview the first rows of the scraped table before any cleaning.
df_post.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [8]:
# Keep only the rows whose borough is assigned (original index labels are preserved).
has_borough = df_post['Borough'] != 'Not assigned'
df_post = df_post[has_borough].copy()

# Where a neighborhood is still unassigned, fall back to the borough name.
needs_name = df_post['Neighborhood'] == 'Not assigned'
df_post.loc[needs_name, 'Neighborhood'] = df_post.loc[needs_name, 'Borough']

In [9]:
# Preview the table again after the "Not assigned" rows were handled.
df_post.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


# Grouping the data

In [10]:
# Collapse rows that share a postcode and borough into a single row whose
# neighborhood names are joined with ", ".
df_grp = (
    df_post
    .groupby(['Postcode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
df_grp.columns = ['Postcode', 'Borough', 'Neighborhood']

# The First Frame


In [11]:
# Preview the grouped table (one row per postcode/borough pair).
df_grp.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Now dealing with latitude and longitude

In [12]:
# Load the latitude/longitude lookup table straight from its CSV URL.
geospatial_csv_url = 'http://cocl.us/Geospatial_data'
df_latlng = pd.read_csv(geospatial_csv_url)

df_latlng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Relabel the three columns positionally so the key column is called
# 'Postcode', matching the name used in df_grp for the merge.
df_latlng.columns = [
    'Postcode',
    'Latitude',
    'Longitude',
]

In [14]:
# Attach coordinates to every grouped postcode; the inner join keeps only
# postcodes that are present in both tables.
df_join = df_grp.merge(df_latlng, on='Postcode', how='inner')

df_join

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Exploring and forming Clusters

In [17]:
# Restrict the analysis to the 'West Toronto' borough and renumber the rows from 0.
is_west_toronto = df_join['Borough'] == 'West Toronto'
toront_data = df_join.loc[is_west_toronto].reset_index(drop=True)

toront_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
1,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975
2,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
3,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
4,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325


# Geographical coordinates of West Toronto

In [25]:
from geopy.geocoders import Nominatim


In [22]:
# Free-text address handed to the geocoder.
address = 'West Toronto, TO'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

# Coordinates used as the centre of the maps drawn later in the notebook.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of West Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of West Toronto are 43.653963, -79.387207.


# Visualization of West Toronto neighborhoods

In [27]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    altair-2.2.2               |           py35_1         462 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.0 MB

The following NEW packages will

In [28]:
import folium

In [29]:
# Base map centred on the geocoded coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one circle marker per row of toront_data; the popup shows the
# neighborhood names of that row.
marker_rows = zip(
    toront_data['Latitude'],
    toront_data['Longitude'],
    toront_data['Neighborhood'],
)
for lat, lng, hood_names in marker_rows:
    popup = folium.Popup(hood_names, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

# Foursquare credentials and API version

In [30]:
# Foursquare API credentials are read from the environment so that no secret is
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# published with the notebook and should be treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')

# API version date sent as the `v` query parameter.
VERSION = '20180605'

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

# Explore neighborhoods in West Toronto

In [33]:
# Maximum number of venues requested per location.
LIMIT = 100


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed in
        parallel with ``zip``.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these seven columns even when no venue was found. A venue that
        comes back without any category gets ``None`` as its category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for example when the
        credentials are rejected).

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each location name as a progress indicator.
    """
    venue_columns = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string; the timeout keeps a
        # stalled connection from blocking the notebook forever.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface HTTP errors here rather than as a KeyError on the payload.
        response.raise_for_status()

        # Only the first result group is used; tolerate a payload without groups.
        groups = response.json()['response'].get('groups') or []
        results = groups[0]['items'] if groups else []

        # Keep only the relevant information for each nearby venue.
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names up front keeps the schema stable for empty results.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [34]:
# Fetch the venues around every row of toront_data (default search radius).
toronto_venues = getNearbyVenues(
    names=toront_data['Neighborhood'],
    latitudes=toront_data['Latitude'],
    longitudes=toront_data['Longitude'],
)

Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The Junction South
Parkdale, Roncesvalles
Runnymede, Swansea


Size of the resulting DataFrame

In [35]:
# Print the (rows, columns) shape of the venues table, then preview its first rows.
print(toronto_venues.shape)
toronto_venues.head()

(179, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Dovercourt Village, Dufferin",43.669005,-79.442259,The Greater Good Bar,43.669409,-79.439267,Bar
1,"Dovercourt Village, Dufferin",43.669005,-79.442259,Parallel,43.669516,-79.438728,Middle Eastern Restaurant
2,"Dovercourt Village, Dufferin",43.669005,-79.442259,Happy Bakery & Pastries,43.66705,-79.441791,Bakery
3,"Dovercourt Village, Dufferin",43.669005,-79.442259,FreshCo,43.667918,-79.440754,Supermarket
4,"Dovercourt Village, Dufferin",43.669005,-79.442259,Planet Fitness Toronto Galleria,43.667588,-79.442574,Gym / Fitness Center


In [37]:
# Count the non-null entries of every column per neighborhood, i.e. how many
# venues were returned for each one.
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Brockton, Exhibition Place, Parkdale Village",21,21,21,21,21,21
"Dovercourt Village, Dufferin",20,20,20,20,20,20
"High Park, The Junction South",23,23,23,23,23,23
"Little Portugal, Trinity",62,62,62,62,62,62
"Parkdale, Roncesvalles",15,15,15,15,15,15
"Runnymede, Swansea",38,38,38,38,38,38


# Analyze each neighborhood

In [38]:
# One-hot encode the venue categories: one indicator column per category,
# named exactly after the category (no prefix).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# If a venue category is itself called "Neighborhood", assigning the label
# column below would silently overwrite that indicator. Rename it first so both
# pieces of information survive.
if 'Neighborhood' in toronto_onehot.columns:
    toronto_onehot = toronto_onehot.rename(
        columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# Put the neighborhood label in the first column directly, instead of appending
# it and then reordering by "whatever column happens to be last".
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

toronto_onehot.head()

Unnamed: 0,Neighborhood,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Bistro,Bookstore,...,Supermarket,Sushi Restaurant,Tapas Restaurant,Tea Room,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"Dovercourt Village, Dufferin",0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Dovercourt Village, Dufferin",0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


New Data Frame size

In [40]:
# (rows, columns) of the one-hot table: one row per venue, one column per
# category plus the neighborhood label.
toronto_onehot.shape

(179, 90)

GROUPING OF NEIGHBORHOODS

In [43]:
# Average the indicator columns per neighborhood, which gives the share of
# each venue category among that neighborhood's venues.
toronto_group = (
    toronto_onehot
    .groupby('Neighborhood')
    .mean()
    .reset_index()
)

toronto_group

Unnamed: 0,Neighborhood,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Bistro,Bookstore,...,Supermarket,Sushi Restaurant,Tapas Restaurant,Tea Room,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.047619,0.0,0.047619,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619
1,"Dovercourt Village, Dufferin",0.0,0.0,0.0,0.0,0.1,0.05,0.05,0.0,0.0,...,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"High Park, The Junction South",0.043478,0.0,0.043478,0.0,0.043478,0.0,0.086957,0.0,0.043478,...,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0
3,"Little Portugal, Trinity",0.0,0.016129,0.0,0.048387,0.032258,0.0,0.129032,0.016129,0.0,...,0.0,0.0,0.016129,0.0,0.0,0.016129,0.016129,0.032258,0.016129,0.016129
4,"Parkdale, Roncesvalles",0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.0,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Runnymede, Swansea",0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.026316,...,0.0,0.052632,0.0,0.026316,0.0,0.0,0.026316,0.0,0.0,0.0


TOP 5 MOST COMMON VENUES

In [44]:
# How many categories to list per neighborhood.
num_top_venues = 5

for hood in toronto_group['Neighborhood']:
    print("----" + hood + "----")

    # Turn this neighborhood's single row into a two-column (venue, freq) table.
    hood_row = toronto_group[toronto_group['Neighborhood'] == hood]
    freq_table = hood_row.T.reset_index()
    freq_table.columns = ['venue', 'freq']

    # The first entry is the neighborhood name itself, not a category.
    freq_table = freq_table.iloc[1:].copy()
    freq_table['freq'] = freq_table['freq'].astype(float)
    freq_table = freq_table.round({'freq': 2})

    # Highest frequency first, renumbered from 0 for display.
    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Brockton, Exhibition Place, Parkdale Village----
               venue  freq
0     Breakfast Spot  0.10
1        Coffee Shop  0.10
2               Café  0.10
3      Grocery Store  0.05
4  Convenience Store  0.05


----Dovercourt Village, Dufferin----
                  venue  freq
0                Bakery  0.10
1              Pharmacy  0.10
2           Supermarket  0.10
3  Gym / Fitness Center  0.05
4               Brewery  0.05


----High Park, The Junction South----
                       venue  freq
0         Mexican Restaurant  0.09
1                        Bar  0.09
2                       Café  0.09
3               Antique Shop  0.04
4  Cajun / Creole Restaurant  0.04


----Little Portugal, Trinity----
              venue  freq
0               Bar  0.13
1       Men's Store  0.05
2  Asian Restaurant  0.05
3       Coffee Shop  0.05
4        Restaurant  0.03


----Parkdale, Roncesvalles----
            venue  freq
0       Gift Shop  0.13
1  Breakfast Spot  0.13
2      Restaurant  0

# Now let's put that into a pandas DataFrame

In [45]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, largest first.

    The first element of ``row`` is skipped (callers pass a row whose first
    entry is the neighborhood name). The remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues`` of
    them are returned as an array. Fewer labels come back when the row is
    shorter than that.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [47]:
import numpy as np

In [48]:
# Number of ranked category columns to build.
num_top_venues = 10

# Ordinal suffixes for the first three ranks; every later rank uses "th".
indicators = ['st', 'nd', 'rd']

# Create the column names according to the number of top venues. The suffix is
# chosen with an explicit bounds check rather than a bare `except`, which would
# also have hidden unrelated errors (including KeyboardInterrupt).
columns = ['Neighborhood']
for ind in range(num_top_venues):
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind + 1))

# Create the result frame with one row per neighborhood of toronto_group.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_group['Neighborhood']

# Fill the ranked columns row by row with the most frequent categories.
for ind in range(toronto_group.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_group.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Breakfast Spot,Yoga Studio,Burrito Place,Grocery Store,Gym,Furniture / Home Store,Italian Restaurant,Performing Arts Venue
1,"Dovercourt Village, Dufferin",Bakery,Pharmacy,Supermarket,Gym / Fitness Center,Brewery,Music Venue,Park,Discount Store,Coffee Shop,Pool
2,"High Park, The Junction South",Mexican Restaurant,Café,Bar,Antique Shop,Speakeasy,Diner,Park,Fast Food Restaurant,Cajun / Creole Restaurant,Flea Market
3,"Little Portugal, Trinity",Bar,Asian Restaurant,Coffee Shop,Men's Store,Cocktail Bar,Vietnamese Restaurant,Pizza Place,Bakery,Restaurant,New American Restaurant
4,"Parkdale, Roncesvalles",Gift Shop,Breakfast Spot,Cuban Restaurant,Restaurant,Movie Theater,Italian Restaurant,Bank,Bar,Dog Run,Bookstore


# Clustering of Neighborhoods

In [50]:
from sklearn.cluster import KMeans

In [51]:
# Number of clusters to form.
kclusters = 3

# Feature matrix: the per-neighborhood category frequencies without the label
# column. The axis is passed by keyword because newer pandas versions no longer
# accept it positionally.
toronto_group_clustering = toronto_group.drop('Neighborhood', axis=1)

# Run k-means clustering with a fixed seed so the labels are reproducible.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_group_clustering)

# Check the cluster labels generated for the first rows of the frame.
kmeans.labels_[0:10]

array([1, 2, 1, 1, 0, 1], dtype=int32)

In [52]:
# Attach the k-means label of every neighborhood as the first column.
# insert() refuses to add a column that already exists, so remove a label
# column left over from a previous run first; this keeps the cell re-runnable.
cluster_column = 'Cluster Labels'
if cluster_column in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(cluster_column, axis=1)
neighborhoods_venues_sorted.insert(0, cluster_column, kmeans.labels_)

# Add the cluster label and ranked venue columns to toront_data (which holds
# postcode, borough and coordinates), matching rows on the neighborhood name.
# join() returns a new frame, so toront_data itself is left unchanged.
toronto_merged = toront_data.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259,2,Bakery,Pharmacy,Supermarket,Gym / Fitness Center,Brewery,Music Venue,Park,Discount Store,Coffee Shop,Pool
1,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975,1,Bar,Asian Restaurant,Coffee Shop,Men's Store,Cocktail Bar,Vietnamese Restaurant,Pizza Place,Bakery,Restaurant,New American Restaurant
2,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,1,Coffee Shop,Café,Breakfast Spot,Yoga Studio,Burrito Place,Grocery Store,Gym,Furniture / Home Store,Italian Restaurant,Performing Arts Venue
3,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763,1,Mexican Restaurant,Café,Bar,Antique Shop,Speakeasy,Diner,Park,Fast Food Restaurant,Cajun / Creole Restaurant,Flea Market
4,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325,0,Gift Shop,Breakfast Spot,Cuban Restaurant,Restaurant,Movie Theater,Italian Restaurant,Bank,Bar,Dog Run,Bookstore


FINALLY LET'S VISUALIZE THE RESULTING CLUSTERS

In [54]:
import matplotlib.cm as cm
import matplotlib.colors as colors


In [55]:
# Base map centred on the geocoded coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one marker per row, coloured by its cluster.
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # A row that found no match in the join has no cluster label (NaN), and the
    # label column then holds floats; skip such rows and index with a real int.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)

    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 is kept from the original colour assignment: for cluster 0 the
    # index -1 wraps to the last colour, so each cluster still gets its own colour.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters