<a href="https://colab.research.google.com/github/arohanajit/Coursera_Capstone/blob/master/Toronto_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import copy
import json
import os
import time

import folium
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
print('Libraries imported.')

### Extracting Data
* The Wikipedia page is downloaded with `requests.get`
* The postal-code table is located and extracted with the BeautifulSoup library

In [0]:
# Wikipedia page whose table is parsed in the following cells.
pageUrl = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use a timeout so a stalled connection cannot hang the notebook, and raise on
# HTTP errors so an error page is never handed to the HTML parser as if it
# were the real article.
response = requests.get(pageUrl, timeout=30)
response.raise_for_status()
page = response.text

In [0]:
# Parse the downloaded HTML and locate the table carrying the
# "wikitable sortable" class.
soup = BeautifulSoup(page, 'html.parser')
myTable = soup.find('table', class_="wikitable sortable")
if myTable is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("No table with class 'wikitable sortable' was found in the downloaded page")
# Header cells of the table; their text becomes the DataFrame column names.
headings = myTable.find_all('th')

### Pandas DataFrame
* An empty dataframe is created using the extracted table headings
* Each table row is split into its cells, cleaned up, and appended to the dataframe
  * Rows whose borough is "Not assigned" are skipped
  * Blank entries left over from splitting the row text are dropped
  * Multiple neighbourhoods listed for one postal code are turned into a comma-separated list

In [0]:
# Column names are taken from the header cells. The last character of each
# header's text is dropped (presumably a trailing newline - verify against the page).
column = [heading.text[:-1] for heading in headings]

# Empty frame that only carries the column layout; displayed as the cell output.
toronto_hood = pd.DataFrame(columns=column)
toronto_hood

In [0]:
# All <tr> elements of the table except the first one (the header row).
values = myTable.find_all('tr')[1:]

# Collect the rows as plain dicts and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; building from records also makes this cell safe to re-run
# without duplicating rows.
records = []
for table_row in values:
    temp = table_row.text.split('\n')
    # Three cells need at least six pieces after splitting on newlines;
    # shorter rows cannot be parsed and are skipped instead of raising.
    # temp[3] is the second cell (stored as 'Borough' below).
    if len(temp) < 6 or temp[3] == 'Not assigned':
        continue
    # The odd positions hold the cell texts; the even ones are the gaps
    # between the newlines.
    cells = temp[1::2]
    # Turn the ' /' separator in the last cell into a comma.
    cells[-1] = cells[-1].replace(' /', ',')
    records.append({'Postal code': cells[0],
                    'Borough': cells[1],
                    'Neighborhood': cells[2]})

toronto_hood = pd.DataFrame(records, columns=['Postal code', 'Borough', 'Neighborhood'])

        

In [0]:
# One row per (postal code, borough); the remaining column values of each
# group are joined into a single comma-separated string.
toronto_hood = toronto_hood.groupby(['Postal code', 'Borough'], as_index=False).agg(
    lambda names: ", ".join(names))

# Where the neighbourhood is still "Not assigned", fall back to the borough name.
# This must be written through .loc: assigning to the row yielded by
# iterrows() only changes a temporary copy and leaves the frame untouched.
not_assigned = toronto_hood["Neighborhood"] == "Not assigned"
toronto_hood.loc[not_assigned, "Neighborhood"] = toronto_hood.loc[not_assigned, "Borough"]
toronto_hood.head(10)

In [0]:
# Sanity check: (rows, columns) of the grouped postal-code table.
toronto_hood.shape

In [0]:
# Load the geospatial coordinates CSV from the course repository and preview it.
coordinates = pd.read_csv(
    "https://raw.githubusercontent.com/arohanajit/Coursera_Capstone/master/"
    "Geospatial_Coordinates.csv"
)
coordinates.head()

In [0]:
# Give the key column the same spelling as in toronto_hood ("Postal code")
# so the two frames can be merged on it.
coordinates = coordinates.rename(columns={"Postal Code": "Postal code"})
coordinates.head()

In [0]:
# Left join: keep every row of toronto_hood and attach the matching
# coordinates columns by postal code.
toronto_new = pd.merge(toronto_hood, coordinates, how="left", on="Postal code")

In [0]:
# Number of distinct boroughs and number of rows in the merged table.
# NOTE(review): the second figure is the row count of toronto_new, not a count
# of distinct neighbourhood names - confirm the wording of the message.
borough_count = len(toronto_new['Borough'].unique())
row_count = toronto_new.shape[0]
print("Toronto has {} Boroughs and {} unique neighborhoods".format(borough_count, row_count))

In [0]:
# Look up the coordinates used to centre the maps below.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match; stop with a clear
    # message instead of an AttributeError on `.latitude`.
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

In [0]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per row of toronto_new, with a "<neighborhood>, <borough>" popup.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in zip(*(toronto_new[name] for name in marker_columns)):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [0]:
# Boroughs whose name contains the word 'Toronto'.
highlight_hood = [borough for borough in toronto_new['Borough'].unique() if 'Toronto' in borough]
print(highlight_hood)

# Keep only the rows belonging to those boroughs, with a fresh 0..n-1 index.
in_highlighted = toronto_new['Borough'].isin(highlight_hood)
subtoronto_data = toronto_new[in_highlighted].reset_index(drop=True)
print(subtoronto_data.shape)
subtoronto_data.tail()

In [0]:
# Foursquare credentials are read from the environment so that secrets are
# never stored in (or printed by) the notebook. Export FOURSQUARE_CLIENT_ID
# and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair previously committed in this cell should be
# treated as leaked and revoked.
try:
    CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
    CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
except KeyError as missing:
    raise RuntimeError(
        'Environment variable {} is not set; it is required for the Foursquare API'.format(missing))
VERSION = '20180605' # Foursquare API version

# Confirm that the credentials were found without echoing their values.
print('Foursquare credentials loaded (API version ' + VERSION + ')')

In [0]:
LIMIT = 100        # value sent as the `limit` query parameter
radius = 500       # value sent as the `radius` query parameter
MAX_TRIES = 60     # give up on a location after this many empty/failed responses
RETRY_DELAY = 0.5  # seconds to wait between attempts
count = 0
venues_list = []
# The loop variable is called `lng` (not `long`) because that is the name used
# when the result tuples are built below. Previously the tuples picked up a
# stale `lng` left behind by the map cell, so every record carried the same
# longitude.
for lat, lng, name, bor, code in zip(subtoronto_data['Latitude'], subtoronto_data['Longitude'],
                                     subtoronto_data['Neighborhood'], subtoronto_data['Borough'],
                                     subtoronto_data['Postal code']):

    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)

    # Progress message every fifth location.
    if count%5==0:
        print("{} done".format(count))

    # Retry until the API returns a non-empty "response" object, but with an
    # upper bound and a short pause so a persistent failure cannot spin forever.
    GET = {}
    for tries in range(1, MAX_TRIES + 1):
        if tries%20==0:
            print("Try : {}...".format(tries))
        try:
            GET = requests.get(url, timeout=30).json().get("response", {})
        except (requests.RequestException, ValueError):
            # Network problem or a body that is not JSON: count it as a failed attempt.
            GET = {}
        if len(GET)!=0:
            break
        time.sleep(RETRY_DELAY)
    else:
        # The URL is deliberately left out of the message: it contains the API secret.
        raise RuntimeError(
            "No venue data for {} ({}) after {} attempts".format(name, code, MAX_TRIES))

    results = GET['groups'][0]['items']
    count+=1
    # One tuple per venue: location identifiers and coordinates first, then the
    # venue's name, coordinates and first category name.
    venues_list.append([(
            code,
            bor,
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])

In [0]:
# Independent copy of the fetched venue lists, so that edits made to `x`
# do not alter venues_list. (`copy` is imported with the other modules at
# the top of the notebook.)
x = copy.deepcopy(venues_list)

In [0]:
# Relabel venues whose category is literally 'Neighborhood' as 'Community'.
for venue_group in x:
    for position, record in enumerate(venue_group):
        if record[-1] == 'Neighborhood':
            # Concatenate tuples so the record keeps all of its fields.
            # `(record[:-1], 'Community')` would build a nested 2-tuple instead.
            venue_group[position] = record[:-1] + ('Community',)
            print(venue_group[position])

In [0]:
# Flatten the per-location lists into one row per venue.
venues = pd.DataFrame(
    [item for venue_list in venues_list for item in venue_list],
    columns=[
        'Postal code',
        'Borough',
        'Neighborhood',
        'Borough Latitude',
        'Borough Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category'])

# A venue category literally named 'Neighborhood' would clash with the
# 'Neighborhood' column once the categories are one-hot encoded. This frame is
# built from venues_list (not from the relabelled copy `x`), so the relabel is
# applied here directly.
venues['Venue Category'] = venues['Venue Category'].replace('Neighborhood', 'Community')

In [0]:
# Print the (rows, columns) of the flattened venue table, then preview its first rows.
print(venues.shape)
venues.head()

In [0]:
# Non-null entry counts per column for each (postal code, borough, neighbourhood) group.
venues.groupby(["Postal code", "Borough", "Neighborhood"]).count()

In [0]:
# Count the distinct venue categories returned by the API.
category_count = len(venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))

In [0]:
# One indicator column per venue category (no prefix, so the column name is
# the category itself).
toronto_onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the identifying columns from the venue table.
id_columns = ['Postal code', 'Borough', 'Neighborhood']
for id_column in id_columns:
    toronto_onehot[id_column] = venues[id_column]

# Move the identifying columns to the front. The column list must be flat:
# `[columns[-3:]] + list(...)` puts a whole Index object in as the first
# element instead of three column names.
fixed_columns = id_columns + [col for col in toronto_onehot.columns if col not in id_columns]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

In [0]:
# (rows, columns) of the one-hot encoded venue table.
toronto_onehot.shape

In [0]:
# Mean of each category indicator per neighbourhood, i.e. the share of that
# neighbourhood's venues falling in each category.
# numeric_only=True leaves the text columns ('Postal code', 'Borough') out of
# the mean. Older pandas dropped them silently; pandas 2.x raises a TypeError.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean(numeric_only=True).reset_index()
toronto_grouped

In [0]:
num_top_venues = 5

# For every neighbourhood print its most frequent venue categories.
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Transpose the neighbourhood's row so each category becomes a row of its own.
    profile = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    profile.columns = ['venue','freq']
    # Drop the first row (the 'Neighborhood' label itself), then make the
    # frequencies numeric and round them to two decimals.
    profile = profile.iloc[1:].assign(freq=lambda frame: frame['freq'].astype(float))
    profile = profile.round({'freq': 2})
    ranked = profile.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

In [0]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped. The remaining entries are sorted by
    value in descending order, and the index labels of the first
    ``num_top_venues`` of them are returned as a NumPy array. Fewer labels are
    returned if the row has fewer entries.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [0]:
num_top_venues = 10

# Ordinal suffixes for the first three ranks; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Pick the suffix explicitly instead of relying on a bare `except:` around
    # an out-of-range index, which would also hide unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# Fill each neighbourhood's row with its top venue categories, best first.
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [0]:
# Number of clusters to fit.
kclusters = 5

# Cluster on the category frequencies only. `columns=` replaces the positional
# axis argument (`drop('Neighborhood', 1)`), which pandas 2.0 no longer accepts.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# random_state is fixed so repeated runs give the same labels.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# Labels assigned to the first ten neighbourhoods.
kmeans.labels_[0:10]

In [0]:
# Attach the cluster label of every neighbourhood as the first column.
# insert() raises ValueError if the column already exists, so on a re-run of
# this cell the existing column is overwritten instead.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the labels and top-venue columns to the location table, matched by
# neighbourhood name. join() returns a new frame; subtoronto_data is unchanged.
toronto_merged = subtoronto_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')


In [0]:
# Rows without a cluster label (no match in the join, e.g. a neighbourhood for
# which no venues were returned) hold NaN, which cannot be cast to int.
# Drop them before converting.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)
toronto_merged.head()

In [0]:
# Preview the rows kept for boroughs whose name contains 'Toronto'.
subtoronto_data.head()

In [0]:
# Base map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex colour per cluster, evenly spaced along the rainbow colormap.
# The former `x` / `ys` helper arrays were only used for their length (which
# equals kclusters) and `x` overwrote the earlier deep copy of the venue
# lists, so they are dropped together with the unused `markers_colors` list.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# One circle per neighbourhood, coloured by its cluster label.
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # `cluster-1` is kept from the original colour mapping: label 0 wraps
    # around to the last colour, and every label still gets a distinct colour.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters