In [6]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

 

## Download data and parse it

In [7]:

# Fetch the Wikipedia page listing the "M" postal codes and locate its data table.
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Stop here on an HTTP error instead of parsing an error page in the cells below.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
table=soup.find('table', attrs={'class':'wikitable sortable'})
# soup.find returns None when nothing matches; fail with a clear message right away.
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

## Get Headers

In [8]:
# Turn every <th> cell into its bare text by removing the tags and line feeds.
headers = [
    str(cell).replace("<th>","").replace("</th>","").replace("\n","")
    for cell in table.findAll('th')
]

## Find all table rows and skip the first one

In [9]:
# Collect every <tr> of the table except the first one (presumably the header row).
rows = table.findAll('tr')[1:]

## Strip the surrounding HTML markup and line feeds from each row

In [10]:
# Serialise each row and cut off its opening "<tr><td>" and closing "</td></tr>" markup,
# leaving the cell values separated only by the inner "</td>\n<td>" delimiters.
rows = [
    str(entry).replace("\n</td></tr>","").replace("<tr>\n<td>","")
    for entry in rows
]

## make dataframe and expand rows

In [11]:

# One raw string per table row goes into column 0 ...
df = pd.DataFrame(rows)
# ... which is split on the inner cell delimiter into one column per header.
df[headers] = df[0].str.split("</td>\n<td>", n=2, expand=True)
# The raw string column is no longer needed once it has been expanded.
df = df.drop(columns=[0])

## skip not assigned boroughs

In [12]:
# Remove every row whose Borough is the literal placeholder "Not assigned".
df = df.drop(index=df.loc[df.Borough == "Not assigned"].index)

## Give "Not assigned" neighbourhoods the name of their borough

In [13]:
# Where the neighbourhood is the placeholder "Not assigned", copy the borough name
# from the same row. An explicit mask with .loc writes into df itself; the previous
# chained `df.Neighbourhood.replace(..., inplace=True)` acted on an intermediate
# Series and passed a Series as the replacement value, which `replace` does not
# document as a supported form.
unassigned = df['Neighbourhood'] == "Not assigned"
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

## copy borough value to neighbourhood if NaN

In [14]:
# Fill missing neighbourhoods with the borough of the same row (aligned on the index).
# Assigning the result back replaces the chained `inplace=True` call, which operates
# on an intermediate Series and does not update df under pandas copy-on-write.
df['Neighbourhood'] = df['Neighbourhood'].fillna(df['Borough'])

## drop duplicate rows

In [15]:
# Remove rows that are exact duplicates of an earlier row (all columns compared).
df=df.drop_duplicates()

In [16]:
## Replace cells that still contain an HTML link with the text of the link's title attribute.
## Only cells containing 'title' are touched; df.update aligns on the index, so every
## other row keeps its value.
for column in ('Neighbourhood', 'Borough'):
    linked_cells = df[column].loc[lambda s: s.str.contains('title')]
    df.update(linked_cells.str.extract('title=\"([^\"]*)',expand=False))

In [17]:
# Collapse the table to one row per postcode.
# sort=False keeps postcodes in order of first appearance, like df.Postcode.unique().
by_postcode = df.groupby('Postcode', sort=False)
df2 = pd.DataFrame({
    # Each postcode is expected to map to a single borough; take the first one seen.
    'Borough': by_postcode['Borough'].first(),
    # Join the distinct neighbourhoods in order of appearance. dict.fromkeys removes
    # duplicates while keeping that order, so the result is the same on every run
    # (joining a set is not).
    'Neighborhood': by_postcode['Neighbourhood'].agg(lambda names: ', '.join(dict.fromkeys(names))),
}).reset_index()
df2.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park (Toronto)
5,M9A,Queen's Park (Toronto),Queen's Park (Toronto)
6,M1B,"Scarborough, Toronto","Malvern, Toronto, Rouge, Toronto"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [19]:
df2.shape

(103, 3)

# Geo location

In [21]:
from geopy.geocoders import Nominatim

In [22]:
# Load the latitude/longitude of every postcode and align the key column name with df2.
dfll = (
    pd.read_csv("http://cocl.us/Geospatial_data")
    .rename(columns={'Postal Code': 'Postcode'})
)
# Join on the postcode explicitly. The earlier set_index() calls discarded their result
# and had no effect, so they are dropped; 'Postcode' is the only shared column, which
# means naming it here selects the same rows as the implicit merge did.
toronto_data = pd.merge(df2, dfll, on='Postcode')
toronto_data.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park (Toronto),43.662301,-79.389494
5,M9A,Queen's Park (Toronto),Queen's Park (Toronto),43.667856,-79.532242
6,M1B,"Scarborough, Toronto","Malvern, Toronto, Rouge, Toronto",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


# Clustering of Neighbourhoods

In [24]:
# Keep only the postcodes whose borough name contains the word "Toronto".
geo_data = toronto_data.loc[lambda frame: frame['Borough'].str.contains("Toronto")]
geo_data.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M7A,Downtown Toronto,Queen's Park (Toronto),43.662301,-79.389494
5,M9A,Queen's Park (Toronto),Queen's Park (Toronto),43.667856,-79.532242
6,M1B,"Scarborough, Toronto","Malvern, Toronto, Rouge, Toronto",43.806686,-79.194353
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
12,M1C,"Scarborough, Toronto","Port Union, Toronto, Highland Creek (Toronto),...",43.784535,-79.160497
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
18,M1E,"Scarborough, Toronto","West Hill, Toronto, Morningside, Toronto, Guil...",43.763573,-79.188711
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [25]:
import os
import warnings

# Foursquare credentials are read from the environment so that no secret is stored in
# the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting
# the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been published and
# should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604'

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare API requests will be rejected.'
    )

In [35]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100, timeout=30):
    """Look up venues around each location via the Foursquare ``venues/explore`` endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Iterated in parallel; one request is sent per (name, latitude, longitude) triple.
    radius : int, default 500
        Sent as the ``radius`` query parameter.
    LIMIT : int, default 100
        Sent as the ``limit`` query parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame has these columns even when
        no venue was found at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # surface quota/credential problems as an HTTP error instead of a KeyError below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come without any category; record None rather than crash
            categories = venue.get('categories') or []
            records.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero records, where
    # assigning .columns afterwards fails with a length mismatch.
    return pd.DataFrame(records, columns=columns)
        

In [36]:
# Fetch nearby venues for every row of toronto_data (one API request per row).
# NOTE(review): this passes the full toronto_data frame, not the "Toronto"-only
# geo_data subset built above -- confirm that this is intended.
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park
Lawrence Heights, Lawrence Manor
Queen's Park (Toronto)
Queen's Park (Toronto)
Malvern, Toronto, Rouge, Toronto
Don Mills North
Woodbine Gardens, Parkview Hill
Ryerson, Garden District
Glencairn
Cloverdale, Islington, Toronto, West Deane Park, Martin Grove, Princess Gardens
Port Union, Toronto, Highland Creek (Toronto), Rouge Hill
Flemingdon Park, Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
West Hill, Toronto, Morningside, Toronto, Guildwood
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn, Toronto
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Downsview North, Wilson Heights, Toronto
Thorncliffe Park
Adelaide, Richmond, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Oriole, Henry Farm
York University, Northwood Park
East Toronto
Harbourfront East, Toronto Islands, Union Station (Toronto)
Trinity–Bellwoods, Littl

In [37]:
toronto_venues.head(12)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,Bella Vita Catering & Private Chef Service,43.756651,-79.331524,BBQ Joint
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
5,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
6,Victoria Village,43.725882,-79.315572,The Frig,43.727051,-79.317418,French Restaurant
7,Victoria Village,43.725882,-79.315572,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection
8,Victoria Village,43.725882,-79.315572,Pizza Nova,43.725824,-79.31286,Pizza Place
9,Regent Park,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery


In [38]:

toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, Richmond, King",100,100,100,100,100,100
"Agincourt, Toronto",4,4,4,4,4,4
"Albion Gardens, Beaumond Heights, Silverstone, Toronto, Mount Olive-Silverstone-Jamestown, South Steeles, Humbergate, Thistletown",11,11,11,11,11,11
"Alderwood, Toronto, Long Branch, Toronto",9,9,9,9,9,9
"Bathurst Manor, Downsview North, Wilson Heights, Toronto",20,20,20,20,20,20
Bayview Village,4,4,4,4,4,4
"Bedford Park, Toronto, Lawrence Manor East",25,25,25,25,25,25
Berczy Park,55,55,55,55,55,55
"Birch Cliff, Cliffside West",4,4,4,4,4,4
"Brockton, Exhibition Place, Parkdale Village",22,22,22,22,22,22


In [39]:
# One indicator column per venue category (empty prefix: columns are the category names).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# A venue category that is itself called 'Neighborhood' would collide with the column
# inserted below, so remove it if present. errors='ignore' keeps the cell working
# when no such category was returned (a plain drop raises KeyError in that case).
toronto_onehot = toronto_onehot.drop(columns=['Neighborhood'], errors='ignore')
# Put the neighbourhood name first; later cells rely on it being column 0.
toronto_onehot.insert(loc=0, column='Neighborhood', value=toronto_venues['Neighborhood'])
toronto_onehot.shape

(2213, 271)

In [40]:

# Average the one-hot indicators per neighbourhood: each value becomes the share of
# that neighbourhood's venues falling into the category. reset_index turns
# 'Neighborhood' back into the first regular column.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, Richmond, King",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,"Agincourt, Toronto",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Albion Gardens, Beaumond Heights, Silverstone,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Alderwood, Toronto, Long Branch, Toronto",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bathurst Manor, Downsview North, Wilson Height...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (the callers keep the neighbourhood name
    there); the remaining entries are ordered by value, largest first, and the index
    labels of the first ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [42]:
num_top_venues = 10


def _ordinal(n):
    """Return n with its English ordinal suffix, e.g. 1 -> '1st', 11 -> '11th', 22 -> '22nd'."""
    # 11, 12 and 13 take 'th' although they end in 1, 2 and 3
    if 11 <= n % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# create columns according to number of top venues
# (explicit suffix logic replaces the bare `except:` that was used as control flow)
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill in the top categories row by row; column 0 keeps the neighbourhood name
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(12)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, Richmond, King",Coffee Shop,Café,Steakhouse,Cosmetics Shop,Bakery,Restaurant,Bar,Asian Restaurant,Thai Restaurant,Gym
1,"Agincourt, Toronto",Lounge,Breakfast Spot,Latin American Restaurant,Skating Rink,Yoga Studio,Eastern European Restaurant,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,"Albion Gardens, Beaumond Heights, Silverstone,...",Grocery Store,Fried Chicken Joint,Pizza Place,Coffee Shop,Sandwich Place,Fast Food Restaurant,Pharmacy,Beer Store,Liquor Store,Video Store
3,"Alderwood, Toronto, Long Branch, Toronto",Pizza Place,Skating Rink,Gym,Coffee Shop,Pool,Pub,Sandwich Place,Pharmacy,General Travel,General Entertainment
4,"Bathurst Manor, Downsview North, Wilson Height...",Coffee Shop,Ice Cream Shop,Frozen Yogurt Shop,Sandwich Place,Diner,Middle Eastern Restaurant,Restaurant,Deli / Bodega,Supermarket,Sushi Restaurant
5,Bayview Village,Café,Bank,Chinese Restaurant,Japanese Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Yoga Studio
6,"Bedford Park, Toronto, Lawrence Manor East",Fast Food Restaurant,Coffee Shop,Sandwich Place,Italian Restaurant,Thai Restaurant,Pizza Place,Pub,Restaurant,Café,Butcher
7,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Seafood Restaurant,Bakery,Cheese Shop,Café,Steakhouse,Beer Bar,Basketball Stadium
8,"Birch Cliff, Cliffside West",College Stadium,Skating Rink,General Entertainment,Café,Ethiopian Restaurant,Empanada Restaurant,Event Space,Electronics Store,Eastern European Restaurant,Dim Sum Restaurant
9,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Breakfast Spot,Grocery Store,Bakery,Performing Arts Venue,Pet Store,Office,Nightclub,Climbing Gym


In [44]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
# make_blobs is imported from its public location; the private
# sklearn.datasets.samples_generator module no longer exists in current scikit-learn.
from sklearn.datasets import make_blobs
from bs4 import BeautifulSoup
import lxml
kclusters = 5
# Cluster on the category frequencies only. The keyword form of drop is used because
# the positional axis argument was removed in pandas 2.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')
# run k-means clustering (n_init is pinned to the long-standing default of 10 so the
# result does not change with the installed scikit-learn version)
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [45]:
# add clustering labels as the first column; a copy left over from a previous run of
# this cell is dropped first, because insert() raises when the column already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and top venues of each neighbourhood to toronto_data, which
# carries the latitude/longitude; rows without a matching neighbourhood get NaN
toronto_merged = toronto_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Food & Drink Shop,Park,BBQ Joint,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Pizza Place,Portuguese Restaurant,French Restaurant,Coffee Shop,Hockey Arena,Intersection,Electronics Store,Eastern European Restaurant,Empanada Restaurant,Ethiopian Restaurant
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,0.0,Coffee Shop,Bakery,Park,Café,Pub,Restaurant,Mexican Restaurant,Beer Store,Bank,French Restaurant
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,0.0,Furniture / Home Store,Clothing Store,Accessories Store,Arts & Crafts Store,Gift Shop,Event Space,Miscellaneous Shop,Coffee Shop,Boutique,Women's Store
4,M7A,Downtown Toronto,Queen's Park (Toronto),43.662301,-79.389494,0.0,Coffee Shop,Park,Gym,Beer Bar,Seafood Restaurant,Burger Joint,Burrito Place,Sandwich Place,Salad Place,Café


In [46]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,"Adelaide, Richmond, King",Coffee Shop,Café,Steakhouse,Cosmetics Shop,Bakery,Restaurant,Bar,Asian Restaurant,Thai Restaurant,Gym
1,0,"Agincourt, Toronto",Lounge,Breakfast Spot,Latin American Restaurant,Skating Rink,Yoga Studio,Eastern European Restaurant,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,0,"Albion Gardens, Beaumond Heights, Silverstone,...",Grocery Store,Fried Chicken Joint,Pizza Place,Coffee Shop,Sandwich Place,Fast Food Restaurant,Pharmacy,Beer Store,Liquor Store,Video Store
3,0,"Alderwood, Toronto, Long Branch, Toronto",Pizza Place,Skating Rink,Gym,Coffee Shop,Pool,Pub,Sandwich Place,Pharmacy,General Travel,General Entertainment
4,0,"Bathurst Manor, Downsview North, Wilson Height...",Coffee Shop,Ice Cream Shop,Frozen Yogurt Shop,Sandwich Place,Diner,Middle Eastern Restaurant,Restaurant,Deli / Bodega,Supermarket,Sushi Restaurant


ModuleNotFoundError: No module named 'folium'

In [47]:
address = 'Toronto, CA'

# Look up the coordinates of the address with the Nominatim geocoder.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message here rather
# than with an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# The message now names the city that is actually looked up (it used to say Manhattan).
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.653963, -79.387207.


In [52]:
import folium
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# colour for rows that have no cluster label (NaN after the join)
unclustered_color = '#808080'

# add markers to the map, coloured by cluster (every marker used to be drawn in blue,
# so the clusters could not be told apart)
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    if pd.isna(cluster):
        marker_color = unclustered_color
        cluster_text = 'n/a'
    else:
        # labels arrive as floats when the column contains NaN; use them as list indices
        marker_color = rainbow[int(cluster)]
        cluster_text = str(int(cluster))
    label = folium.Popup(str(poi) + ' Cluster ' + cluster_text, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        parse_html=False).add_to(map_clusters)

map_clusters

ModuleNotFoundError: No module named 'folium'