# Scraping the Wiki Page

In [2]:
#Load Panda libraries
import requests
import lxml.html as lh
import pandas as pd
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes
import folium
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

#Load the URL
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Load the Page
page = requests.get(url)

#Get the document content
doc = lh.fromstring(page.content)

#Get the rows
tr_elements = doc.xpath('//tr')

#Get the first n (20) number of rows
[len(T) for T in tr_elements[:20]]

#Initialize the empty data frame with column names
columnsTitles = ['PostalCode', 'Borough', 'Neighborhood']
neigh_df=pd.DataFrame(columns=['PostalCode', 'Borough', 'Neighborhood'])
neigh_df = neigh_df.reindex(columns=columnsTitles)

#Iterate through the rows
for i in range(0,20):
    
    #Iterate throwugh the columns
    for t in tr_elements[i]:
        
        #Get cell content as text
        row_content=t.text_content()
        
        #Verify if borough is not assigned and if at least one neighborhood exists
        if(('Not assigned' not in row_content) and '(' in row_content):
            
            #Get the street name and replace any new line characters
            street_name=row_content.replace("\n","")
            
            #Get the postal code
            postal_code=street_name[:3]            
            
            #Get the Borough
            borough=street_name[3:street_name.index('(')]
            
            #Get the Neighborhood
            neighborhood=street_name[street_name.index('(')+1:street_name.index(')')]
            
            #Replace slashes by commas in neighborhood
            neighborhood=neighborhood.replace("/",",")
            
            #Create a new row with above details and add to data frame
            neigh_df.loc[i]=[postal_code,borough,neighborhood]

#Print the data frame            
neigh_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M9A,Etobicoke,Islington Avenue
1,M9B,Etobicoke,"West Deane Park , Princess Gardens , Martin Gr..."
2,M9C,Etobicoke,"Eringate , Bloordale Gardens , Old Burnhamthor..."
3,M6E,York,Caledonia-Fairbanks
4,M6G,Downtown Toronto,Christie
5,M6H,West Toronto,"Dufferin , Dovercourt Village"
6,M6J,West Toronto,"Little Portugal , Trinity"
7,M6K,West Toronto,"Brockton , Parkdale Village , Exhibition Place"
8,M9L,North York,Humber Summit
9,M9M,North York,"Humberlea , Emery"


In [3]:
#Print the shape of the data frame
print("Shape of the Data Frame:",neigh_df.shape)

Shape of the Data Frame: (20, 3)


# Applying Geo Co-ordinates

In [4]:
#Load the GeoCode CSV
geocode_csv=pd.read_csv("http://cocl.us/Geospatial_data")

#Insert Latitude, Longitude Columns in Neighborhhod Data Frame
neigh_df.insert(3,"Latitude","")
neigh_df.insert(4,"Longitude","")

#Iterate through neighborhood and geocode dataframes and put geocode values for matching postal codes
for index1,row1 in neigh_df.iterrows():
    for index2,row2 in geocode_csv.iterrows():
        if(row1[0] == row2[0]):
            row1[3] = row2[1]
            row1[4] = row2[2]
neigh_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
1,M9B,Etobicoke,"West Deane Park , Princess Gardens , Martin Gr...",43.6509,-79.5547
2,M9C,Etobicoke,"Eringate , Bloordale Gardens , Old Burnhamthor...",43.6435,-79.5772
3,M6E,York,Caledonia-Fairbanks,43.689,-79.4535
4,M6G,Downtown Toronto,Christie,43.6695,-79.4226
5,M6H,West Toronto,"Dufferin , Dovercourt Village",43.669,-79.4423
6,M6J,West Toronto,"Little Portugal , Trinity",43.6479,-79.4197
7,M6K,West Toronto,"Brockton , Parkdale Village , Exhibition Place",43.6368,-79.4282
8,M9L,North York,Humber Summit,43.7563,-79.566
9,M9M,North York,"Humberlea , Emery",43.7248,-79.5322


In [5]:
toronto_data = neigh_df[neigh_df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M6G,Downtown Toronto,Christie,43.6695,-79.4226
1,M6H,West Toronto,"Dufferin , Dovercourt Village",43.669,-79.4423
2,M6J,West Toronto,"Little Portugal , Trinity",43.6479,-79.4197
3,M6K,West Toronto,"Brockton , Parkdale Village , Exhibition Place",43.6368,-79.4282
4,M6S,West Toronto,"Runnymede , Swansea",43.6516,-79.4844


In [6]:
#Display the geopgraphical co-ordinates of Toronto

address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [8]:
# create map of Toronto using latitude and longitude values
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
  label = folium.Popup(label, parse_html=True)
  folium.CircleMarker(
       [lat, lng],
       radius=5,
       popup=label,
       color='blue',
      fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

# Exploring Toronto Neighborhoods

In [9]:
CLIENT_ID = 'JPW0ZSUCPJH3FBTS4CZ25NNRBV0OBFYR5VRZ3OH5A0UTUCRO' # Foursquare ID
CLIENT_SECRET = 'KKHTO2IUOBI5W4GRTK2D3ZYAM0NHCQARP0XUOLFPJS1BZFM0' # Foursquare Secret
VERSION = '20200324' # Foursquare API version
LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [10]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )
print(toronto_venues.shape)
toronto_venues.head()

Christie
Dufferin , Dovercourt Village
Little Portugal , Trinity
Brockton , Parkdale Village , Exhibition Place
Runnymede , Swansea
Kensington Market , Chinatown , Grange Park
(239, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Christie,43.669542,-79.422564,Fiesta Farms,43.668471,-79.420485,Grocery Store
1,Christie,43.669542,-79.422564,Contra Cafe,43.669107,-79.426105,Café
2,Christie,43.669542,-79.422564,Vinny’s Panini,43.670679,-79.426148,Italian Restaurant
3,Christie,43.669542,-79.422564,Starbucks,43.67153,-79.4214,Coffee Shop
4,Christie,43.669542,-79.422564,Scout and Cash Caffe,43.66736,-79.419938,Café


#### Let's find out how many unique categories can be curated from all the returned venues

In [137]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 107 uniques categories.


# 3. Analyze Each Neighborhood

In [12]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()
toronto_onehot.shape
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped
toronto_grouped.shape

(6, 108)

#### Let's print each neighborhood along with the top 5 most common venues

In [13]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Brockton , Parkdale Village , Exhibition Place----
            venue  freq
0     Coffee Shop  0.09
1            Café  0.09
2  Breakfast Spot  0.09
3   Burrito Place  0.04
4         Stadium  0.04


----Christie----
           venue  freq
0  Grocery Store  0.22
1           Café  0.17
2           Park  0.11
3          Diner  0.06
4     Restaurant  0.06


----Dufferin , Dovercourt Village----
                  venue  freq
0              Pharmacy  0.12
1                Bakery  0.12
2  Gym / Fitness Center  0.06
3  Fast Food Restaurant  0.06
4               Brewery  0.06


----Kensington Market , Chinatown , Grange Park----
                           venue  freq
0                            Bar  0.07
1                           Café  0.06
2          Vietnamese Restaurant  0.06
3  Vegetarian / Vegan Restaurant  0.05
4                         Bakery  0.05


----Little Portugal , Trinity----
              venue  freq
0               Bar  0.13
1       Coffee Shop  0.07
2  Asian Restaurant  0

### Let's put that into a pandas dataframe and Sort the venues in descending order

In [14]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Create the new dataframe and display the top 10 venues for each neighborhood

In [15]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Brockton , Parkdale Village , Exhibition Place",Coffee Shop,Café,Breakfast Spot,Furniture / Home Store,Restaurant,Climbing Gym,Japanese Restaurant,Italian Restaurant,Nightclub,Intersection
1,Christie,Grocery Store,Café,Park,Nightclub,Candy Store,Coffee Shop,Athletics & Sports,Baby Store,Gas Station,Italian Restaurant
2,"Dufferin , Dovercourt Village",Bakery,Pharmacy,Brewery,Supermarket,Pool,Café,Park,Fast Food Restaurant,Gym / Fitness Center,Bar
3,"Kensington Market , Chinatown , Grange Park",Bar,Vietnamese Restaurant,Café,Bakery,Vegetarian / Vegan Restaurant,Coffee Shop,Mexican Restaurant,Dumpling Restaurant,Noodle House,Burger Joint
4,"Little Portugal , Trinity",Bar,Coffee Shop,Restaurant,Asian Restaurant,Pizza Place,Café,Men's Store,Vietnamese Restaurant,Wine Bar,Mac & Cheese Joint


# Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [16]:
# set number of clusters

kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M6G,Downtown Toronto,Christie,43.6695,-79.4226,2,Grocery Store,Café,Park,Nightclub,Candy Store,Coffee Shop,Athletics & Sports,Baby Store,Gas Station,Italian Restaurant
1,M6H,West Toronto,"Dufferin , Dovercourt Village",43.669,-79.4423,1,Bakery,Pharmacy,Brewery,Supermarket,Pool,Café,Park,Fast Food Restaurant,Gym / Fitness Center,Bar
2,M6J,West Toronto,"Little Portugal , Trinity",43.6479,-79.4197,0,Bar,Coffee Shop,Restaurant,Asian Restaurant,Pizza Place,Café,Men's Store,Vietnamese Restaurant,Wine Bar,Mac & Cheese Joint
3,M6K,West Toronto,"Brockton , Parkdale Village , Exhibition Place",43.6368,-79.4282,3,Coffee Shop,Café,Breakfast Spot,Furniture / Home Store,Restaurant,Climbing Gym,Japanese Restaurant,Italian Restaurant,Nightclub,Intersection
4,M6S,West Toronto,"Runnymede , Swansea",43.6516,-79.4844,4,Pizza Place,Café,Coffee Shop,Italian Restaurant,Sushi Restaurant,Gourmet Shop,Diner,Latin American Restaurant,Juice Bar,Indie Movie Theater


### Visualize the resulting clusters

In [17]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#  Examine Clusters

### Determine the discriminating venue categories that distinguish each cluster

#### Cluster 1

In [18]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,West Toronto,0,Bar,Coffee Shop,Restaurant,Asian Restaurant,Pizza Place,Café,Men's Store,Vietnamese Restaurant,Wine Bar,Mac & Cheese Joint
5,Downtown Toronto,0,Bar,Vietnamese Restaurant,Café,Bakery,Vegetarian / Vegan Restaurant,Coffee Shop,Mexican Restaurant,Dumpling Restaurant,Noodle House,Burger Joint


#### Cluster 2

In [21]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,West Toronto,1,Bakery,Pharmacy,Brewery,Supermarket,Pool,Café,Park,Fast Food Restaurant,Gym / Fitness Center,Bar


#### Cluster 3

In [24]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,2,Grocery Store,Café,Park,Nightclub,Candy Store,Coffee Shop,Athletics & Sports,Baby Store,Gas Station,Italian Restaurant


#### Cluster 4

In [25]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,West Toronto,3,Coffee Shop,Café,Breakfast Spot,Furniture / Home Store,Restaurant,Climbing Gym,Japanese Restaurant,Italian Restaurant,Nightclub,Intersection


#### Cluster 5

In [26]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,West Toronto,4,Pizza Place,Café,Coffee Shop,Italian Restaurant,Sushi Restaurant,Gourmet Shop,Diner,Latin American Restaurant,Juice Bar,Indie Movie Theater
