# Capstone Project Notebook - The Battle of Neighborhoods

Importing the required dependencies

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize 
from bs4 import BeautifulSoup 
import requests 
import json
import seaborn as sns
import matplotlib.pyplot as plt
import folium 
from folium import plugins
import matplotlib.cm as cm
import matplotlib.colors as col
from geopy.geocoders import Nominatim 
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
import sklearn.utils
from sklearn.preprocessing import StandardScaler

Collecting package metadata (current_repodata.json): / 

Scraping the list of postal codes of Toronto and creating our first dataframe

In [None]:
# Download a pinned revision (fixed `oldid`) of the Wikipedia list of Canadian "M" postal codes.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050'
data = requests.get(url).text
# Parse the HTML with the lxml backend and pick the first table whose class is 'wikitable sortable'.
soup = BeautifulSoup(data, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
# Start from an empty frame with the three target columns; the rows are added in a later cell.
columns_name = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns = columns_name)
df

Filling the dataframe with the data we scraped from Wikipedia

In [None]:
# Collect every table row that has exactly three <td> cells (rows with any
# other cell count, e.g. header rows, are skipped) and build the frame once.
# Building from a list avoids growing the DataFrame one row at a time and makes
# this cell idempotent: re-running it no longer appends a second copy of every
# row. The loop variable is also renamed so it no longer overwrites the global
# `data` that holds the downloaded HTML.
records = []
for row in table.find_all('tr'):
    row_data = [cell.text.strip() for cell in row.find_all('td')]
    if len(row_data) == 3:
        records.append(row_data)
df = pd.DataFrame(records, columns=columns_name)

Here's what the dataframe looks like now

In [None]:
# Preview the first ten rows of the scraped postal-code table.
df.head(10)

Removing 'Not assigned' values from the Borough column

In [None]:
# Rows whose borough is 'Not assigned' are dropped in place.
de = df.index[df['Borough'] == 'Not assigned']
df.drop(de, inplace=True)
# Where only the neighbourhood is unassigned, fall back to the borough name.
unnamed_neighborhood = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed_neighborhood, 'Neighborhood'] = df['Borough']
# One row per (PostalCode, Borough) in first-seen order; the remaining
# column's values are joined with ', '.
r_df = df.groupby(['PostalCode', 'Borough'], sort=False).agg(', '.join)
df = r_df.reset_index()
df.head(10)

Reading a csv file locally stored in IBM cloud, the file contains geospatial coordinates of Toronto

In [None]:
import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

# Placeholder iterator method attached to the response body further down when
# the body object does not already define `__iter__`.
def __iter__(self): return 0

# Use the public endpoint when running outside IBM Cloud, the private one otherwise.
if os.environ.get('RUNTIME_ENV_LOCATION_TYPE') == 'external':
    endpoint_45422a1df12848e1b6dd8cefeb73dedf = 'https://s3.eu.cloud-object-storage.appdomain.cloud'
else:
    endpoint_45422a1df12848e1b6dd8cefeb73dedf = 'https://s3.private.eu.cloud-object-storage.appdomain.cloud'

# The API key is read from the IBM_API_KEY_ID environment variable instead of
# being hardcoded in the notebook (a missing variable raises KeyError).
# Any key that was ever committed in this cell should be treated as leaked and rotated.
client_45422a1df12848e1b6dd8cefeb73dedf = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=os.environ['IBM_API_KEY_ID'],
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url=endpoint_45422a1df12848e1b6dd8cefeb73dedf)

# Fetch the CSV object and make sure the streaming body exposes `__iter__`
# before passing it to pandas (presumably read_csv requires it — confirm).
body = client_45422a1df12848e1b6dd8cefeb73dedf.get_object(Bucket='courseracapstone-donotdelete-pr-wx2u4brhxefhzw',Key='Geospatial_Coordinates.csv')['Body']
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )
geo = pd.read_csv(body)
geo.head(10)


Merging DF that contains Boroughs and Neighborhoods with GEO that contains geospatial coordinates

In [None]:
# Rename the coordinate columns so the postal-code key matches `df`, then
# inner-join the two frames on that key.
geo.columns = ['PostalCode', 'Latitude', 'Longitude']
t_df = df.merge(geo, on='PostalCode')
t_df.head(10)

Downloading the GeoJSON files

In [None]:
# GeoJSON from the Toronto open-data portal; used further down as the
# geometry of the neighbourhood choropleth.
url = 'https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/a083c865-6d60-4d1d-b6c6-b0c8a85f9c15?format=geojson&projection=4326'
response = requests.get(url)
# Fail with a clear HTTP error instead of an obscure JSON decoding error.
response.raise_for_status()
geo_toronto = response.json()
# Show only the feature count: echoing the whole GeoJSON floods the cell output.
len(geo_toronto['features'])

In [None]:
# GeoJSON from the Toronto open-data portal; drawn later as a grey overlay
# layer on the folium maps.
url_ward = 'https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/7672dac5-b383-4d7c-90ec-291dc69d37bf?format=geojson&projection=4326'
response_ward = requests.get(url_ward)
# Fail with a clear HTTP error instead of an obscure JSON decoding error.
response_ward.raise_for_status()
ward_geo = response_ward.json()
# Show only the feature count: echoing the whole GeoJSON floods the cell output.
len(ward_geo['features'])

Defining my Foursquare credentials and version for the API request URL, then the Foursquare category

In [None]:
import os

# Foursquare credentials come from the environment rather than being
# hardcoded in the notebook; set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before running (a missing variable raises KeyError).
# Credentials that were ever committed in this cell should be regenerated.
CLIENT_ID, CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_ID'], os.environ['FOURSQUARE_CLIENT_SECRET']
# API version date sent as the `v` parameter of every request.
VERSION = '20180605'
# Foursquare category id sent as `categoryId` to the venues/explore endpoint.
plc = '4bf58dd8d48988d1c3941735'

Defining a function that gets the category type

In [None]:
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    The category list is read from ``row['categories']``; when that key is
    missing, ``row['venue.categories']`` is used instead.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide one of the two keys above, holding a list of dicts
        that each have a ``'name'`` entry.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    try:
        ctg_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being silently swallowed by a bare `except`.
        ctg_list = row['venue.categories']

    if len(ctg_list) == 0:
        return None
    return ctg_list[0]['name']

Defining a function that obtains Moroccan restaurants of every Toronto neighbourhood

In [None]:
def getNearbyVenues(names, latitudes, longitudes, LIMIT = 300, radius = 2000, categoryId = plc, timeout = 30):
    """Query Foursquare's venues/explore endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One label and one coordinate pair per location to search around.
    LIMIT : int
        Sent as the ``limit`` query parameter.
    radius : int
        Sent as the ``radius`` query parameter.
    categoryId : str
        Sent as the ``categoryId`` query parameter.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns listed in ``columns``
        below. When no venue is found at all, an empty frame with those same
        columns is returned (previously this raised a length-mismatch error).

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue Name',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # Let requests build and escape the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
                'categoryId': categoryId,
            },
            timeout=timeout,
        )
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category no longer raises IndexError.
                categories[0]['name'] if categories else None))

    return pd.DataFrame(rows, columns=columns)

Storing the neighbourhoods in a new dataframe

In [None]:
# Query venues around every postal-code centroid in `t_df`, using the
# function's default limit, radius and category.
nwm_df = getNearbyVenues(names = t_df['Neighborhood'],
                         latitudes = t_df['Latitude'],
                         longitudes = t_df['Longitude'])

Defining and setting up the coordinates of Toronto

In [None]:
# Geocode the city once; the resulting coordinates centre every folium map below.
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

Importing neighbourhood profiles dataframe

In [None]:
# Fetch the neighbourhood-profiles CSV from the same bucket, reusing the
# object-storage client created in the geospatial-coordinates cell.
body = client_45422a1df12848e1b6dd8cefeb73dedf.get_object(Bucket='courseracapstone-donotdelete-pr-wx2u4brhxefhzw',Key='neighbourhood-profiles-2016-csv.csv')['Body']
# Attach the placeholder `__iter__` when the body object lacks one, then parse the CSV.
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )
neighbourhood_profiles = pd.read_csv(body)
neighbourhood_profiles.head(10)

Locating the row with African origins' data

In [None]:
# Take rows 0 and 1138 of the profile table by position, transpose so every
# original column becomes a row, and turn the former column labels into a
# regular column.
# NOTE(review): 1138 is a hard-coded row position, presumably the African-origins
# row of this particular CSV — confirm it still matches if the file changes.
test = neighbourhood_profiles.iloc[[0,1138]].transpose().reset_index()
test.head(10)

Dropping the unnecessary rows, renaming the columns, and converting the Population and Neighbourhood Number columns to numeric types

In [None]:
# Skip the first six transposed rows and give the three columns readable names.
new_df = test.iloc[6:]
new_df.columns = ['Neighborhood','Neighbourhood Number','Population']
# Index by neighbourhood name, strip thousands separators from the population
# strings and convert both value columns to float.
new_df = (
    new_df
    .set_index('Neighborhood')[['Neighbourhood Number','Population']]
    .assign(**{
        'Population': lambda frame: frame['Population'].replace(',','', regex = True).astype(float),
        'Neighbourhood Number': lambda frame: frame['Neighbourhood Number'].astype(float),
    })
)
new_df.head(10)

Scraping Business Improvement Areas data

In [None]:
# Download the GeoJSON that the maps below draw as 'Business Improvement Areas'.
url = 'https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/d173e644-ace0-45e0-be43-8ba02fb116eb?format=geojson&projection=4326'
geo_area = requests.get(url).json()

Creating a dataframe which contains number of neighborhoods for each borough

In [None]:
# Count the neighbourhood entries of each borough, largest first, as a one-column frame.
borough_counts = t_df.groupby('Borough')['Neighborhood'].count()
nn = borough_counts.sort_values(ascending=False).to_frame()
nn

Setting up a plot that represents the above dataframe

In [None]:
# Bar chart of the per-borough counts, styled through the Axes object that
# the pandas plot call returns.
sns.set_style('whitegrid')
sns.set_palette('Dark2')
ax = nn.plot(kind='bar', figsize=(20,10), color=(0.2, 0.4, 0.6, 0.6))
ax.set_ylabel('Number of neighborhoods', size=25)
ax.set_xlabel('Boroughs', size=25)
ax.set_title('The number of neighborhoods for each borough', size=25)
plt.setp(ax.get_xticklabels(), rotation=45, size=15)
ax.legend(fontsize=20)
plt.show()

Now we create a map of Toronto and its neighborhoods

In [None]:
# Base map centred on the geocoded Toronto coordinates, with one circle
# marker per postal-code row of `t_df`.
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, borough, neighborhood in zip(t_df['Latitude'], t_df['Longitude'], t_df['Borough'],
                                          t_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        # The stroke colour must be a colour string; '#336699' is the hex
        # form of the RGB tuple (0.2, 0.4, 0.6) that was passed before.
        color='#336699',
        fill=True,
        fill_color='purple',
        fill_opacity=0.6).add_to(toronto_map)
toronto_map

Let's get back to our dataframe that contains the population and neighbourhood number data

In [None]:
# Preview the population frame (indexed by neighbourhood name).
new_df.head(10)

Let's make a new dataframe out of it that contains the population of African origins of each neighborhood 

In [None]:
# Total population per neighbourhood (grouping on the index level), largest first.
population_totals = new_df.groupby('Neighborhood')['Population'].sum()
nd_gp = population_totals.sort_values(ascending=False).to_frame()
nd_gp

And make a new dataframe that contains the neighborhoods with more than 1000 residents of African origin

In [None]:
# Neighbourhoods above the 1000 threshold; `at` keeps the first ten of them
# (the frame is already sorted in descending order) for the bar chart.
above_threshold = nd_gp['Population'] > 1000
africa_top = nd_gp[above_threshold]
at = africa_top.head(10)
africa_top

Let's plot the 10 most populated of these neighborhoods

In [None]:
# Bar chart of the ten neighbourhoods held in `at`.
sns.set_style('white')
sns.set_context('poster')
at.plot(kind='bar', figsize=(25,10), color=(0.2, 0.4, 0.6, 0.6))
plt.xlabel('Neighborhood', size=25)
plt.ylabel('Population', size=25)
plt.title('10 most populated African origins neighborhoods in Toronto', size=25)
plt.xticks(rotation=45, size=25)
# The y tick size is set once; a preceding size=20 call would only be
# overridden by this one.
plt.yticks(size=25)
plt.show()

In [None]:
# Neighbourhoods whose population value is at most 105.
# NOTE(review): 105 is a hard-coded cut-off, presumably chosen so that exactly ten
# rows remain for the "10 least populated" chart below — confirm against the data.
africa_bot = nd_gp[nd_gp['Population'] <= 105]
africa_bot

The same plot, but for the least populated neighborhoods

In [None]:
# Bar chart of the low-population neighbourhoods held in `africa_bot`.
# NOTE(review): the title says "10", but `africa_bot` is selected by a
# population threshold, not by row count — confirm the two agree.
sns.set_style('white')
sns.set_context('poster')
africa_bot.plot(kind='bar', figsize=(25,10), color=(0.2, 0.4, 0.6, 0.6))
plt.xlabel('Neighborhoods', size=25)
plt.ylabel('Population', size=25)
plt.title('10 least populated African origins neighborhoods in Toronto', size=25)
plt.xticks(rotation=45, size=25)
# The y tick size is set once; a preceding size=20 call would only be
# overridden by this one.
plt.yticks(size=25)
plt.show()

After exploring the population, let's explore the restaurants so let's get back to our neighborhood dataframe

In [None]:
# Preview the venues returned by getNearbyVenues.
nwm_df.head(10)

In [None]:
# Number of returned venues per category, most frequent first.
(
    nwm_df
    .groupby('Venue Category')['Venue Category']
    .count()
    .sort_values(ascending=False)
    .to_frame()
)

Let's remove rows with other venue categories and keep the ones with Moroccan Restaurant category

In [None]:
# Keep only the Moroccan restaurants. The explicit copy matters because later
# cells add columns ('xm', 'ym', 'Cluster_DB') to this frame; assigning to a
# filtered slice of the original frame triggers pandas' SettingWithCopyWarning.
nwm_df = nwm_df[nwm_df['Venue Category'] == 'Moroccan Restaurant'].copy()
nwm_df.head(10)

Let's make a new dataframe to see how many Moroccan restaurants are in each Neighborhood

In [None]:
# Count the restaurants of each neighbourhood, order the counts from highest
# to lowest, and index the result by neighbourhood name.
mrc_nh = (
    nwm_df[['Neighborhood', 'Venue Name']]
    .groupby(['Neighborhood'])
    .size()
    .reset_index(name='Number of Moroccan Restaurants')
    .sort_values('Number of Moroccan Restaurants', ascending=False)
    .reset_index(drop=True)
    .set_index('Neighborhood')
)
mrc_nh

Let's merge the above dataframe with our t_df which we previously created to contain data about Toronto

In [None]:
# Right join: keep every neighbourhood that has a restaurant count and attach
# its borough / postal-code / coordinate columns from `t_df`.
mrc = t_df.merge(mrc_nh, how = 'right', on = ['Neighborhood'])
mrc

In [None]:
# Sum the restaurant counts of each borough and list the boroughs from most to fewest.
borough_totals = mrc[['Borough', 'Number of Moroccan Restaurants']].groupby(['Borough']).sum()
mrc_boro = borough_totals.sort_values('Number of Moroccan Restaurants', ascending=False)
mrc_boro

It looks like Downtown Toronto has the most Moroccan restaurants, followed by East Toronto and East York. Let's make a map to see where these restaurants are concentrated

In [None]:
# Light base map centred on Toronto.
restaurants_map = folium.Map(location=[latitude,longitude], tiles='cartodbpositron', zoom_start=11)
# Choropleth layer: shades each GeoJSON feature by the 'Population' value whose
# 'Neighbourhood Number' matches the feature's AREA_SHORT_CODE property.
# NOTE(review): the geometry here is `geo_area` (loaded as Business Improvement Areas),
# whereas the later choropleth with the same key and data uses `geo_toronto` — confirm
# that `geo_area` is intended for a layer named 'Choropleth Map of Neighborhoods'.
restaurants_map.choropleth(
    geo_data=geo_area,
    data = new_df,
    columns=['Neighbourhood Number','Population'],
    key_on='feature.properties.AREA_SHORT_CODE',
    line_weight = 0.2,
    fill_color='YlOrBr',
    fill_opacity= 0.3,
    line_opacity= 0.2, 
    name = 'Choropleth Map of Neighborhoods')
# Faint grey overlay of the `ward_geo` boundaries.
ward_style = lambda x: {'fillColor': '#aeaeae', 'color': '#aeaeae','fillOpacity':'0.1'}
folium.GeoJson(
    ward_geo,
    style_function = ward_style,
).add_to(restaurants_map)
# One purple circle per venue row of `nwm_df`.
for index,row in nwm_df.iterrows():
    folium.CircleMarker(
        [row["Venue Latitude"], row["Venue Longitude"]],
        radius=5,
        color='purple',
        fill= True,
        fill_color='purple',
        fill_opacity= 0.1,
        popup='Moroccan Restaurant',
    ).add_to(restaurants_map)
restaurants_map

In [None]:
# [latitude, longitude] pairs of the venues as a plain nested list for the heat map.
nwm_arr = nwm_df[['Venue Latitude', 'Venue Longitude']].values.tolist()

Let's turn it into a heat map

In [None]:
from folium import plugins
# Overlay a heat map of the venue coordinates on the existing restaurants map.
heat_layer = plugins.HeatMap(nwm_arr, radius=15)
heat_layer.add_to(restaurants_map)
restaurants_map

Now let's create a map for business improvement area

In [None]:
# Light base map centred on Toronto.
business_improvement_map = folium.Map(location=[latitude,longitude], tiles='cartodbpositron', zoom_start=10)
# One purple circle per venue row of `nwm_df`. The frame holds Moroccan
# restaurants, so the popup now says so (it previously read 'Japanese Restaurant').
for index,row in nwm_df.iterrows():
    folium.CircleMarker(
        [row["Venue Latitude"], row["Venue Longitude"]],
        radius=5,
        color='purple',
        fill= True,
        fill_color='purple',
        fill_opacity= 0.1,
        popup='Moroccan Restaurant',
    ).add_to(business_improvement_map)
# Grey overlay of the `ward_geo` boundaries.
ward_style = lambda x: {'fillColor': 'grey', 'color': 'grey','fillOpacity':'0.1'}
folium.GeoJson(
    ward_geo,
    style_function = ward_style
).add_to(business_improvement_map)
# Darker overlay of the Business Improvement Areas.
area_style = lambda x: {'fillColor': 'black', 'color': 'black','fillOpacity':'0.2'}
folium.GeoJson(
    geo_area,
    style_function = area_style,
    name='Business Improvement Areas'
).add_to(business_improvement_map)
# Only the final expression of a cell is displayed, so the map is shown once, here.
business_improvement_map

Now let's prepare our data for DBSCAN 

In [None]:
# Venue latitudes (xs) and longitudes (ys) as NumPy arrays.
xs, ys = (np.asarray(nwm_df[column]) for column in ('Venue Latitude', 'Venue Longitude'))

In [None]:
# Store the coordinate arrays back on the frame as plain Python lists.
nwm_df['xm'], nwm_df['ym'] = xs.tolist(), ys.tolist()

In [None]:
# Two-column coordinate matrix for clustering, with NaN replaced by zero.
cluster_data = np.nan_to_num(nwm_df[['xm', 'ym']])
cluster_data

Now let's move to DBSCAN modeling

In [None]:
# Ask for as many neighbours as there are points, so `distances` holds, for
# every point, its distance to every point in the data set.
# NOTE(review): k-distance plots for DBSCAN are conventionally built with k close to
# min_samples rather than the full data set size — confirm this choice is intended.
neigh = NearestNeighbors(n_neighbors=len(cluster_data))
nbrs = neigh.fit(cluster_data)
distances, indices = nbrs.kneighbors(cluster_data)
# Mean over the whole distance matrix.
distances.mean()

In [None]:
# Sort each column of the distance matrix independently, in ascending order.
distances = np.sort(distances, axis=0)
distances

In [None]:
# Keep only the last column of the sorted matrix; this 1-D array is what the plot below draws.
distances = distances[:,-1]

Let's graph the optimal Epsilon values

In [None]:
# Plot the sorted distance curve and mark the point read off as the epsilon estimate.
# NOTE(review): the star and annotation coordinates (x=25, y≈0.225) are hard-coded,
# presumably read from one particular run — re-check them if the data changes.
sns.set_style('whitegrid')
sns.set_context('poster')
sns.set_palette('Dark2')
plt.figure(figsize=(20,8))
plt.plot(distances)
plt.scatter(x=25, y=0.225, color=(0.2, 0.4, 0.6, 0.6), marker='*')
plt.title('Epsilon Value', size=20)
# Arrow between the text position and the marked point (empty label, arrow only).
plt.annotate('',
            xy=(25, 0.224),
            xytext=(30, 0.21),
            xycoords='data',
            arrowprops=dict(arrowstyle='<-', connectionstyle='arc3', color='red', lw=2))
# Text label for the marked point.
plt.annotate('Optimal Epsilon Value',
             xy=(30, 0.205),
             va='bottom',
             ha='left',
             fontsize=20
            )

In [None]:
# Standardise both coordinate columns before clustering.
# NOTE(review): the epsilon plot above was computed on the unscaled coordinates, while
# DBSCAN below runs on the scaled ones — confirm the chosen eps applies to scaled data.
cluster_data = StandardScaler().fit_transform(cluster_data)

In [None]:
# Fit DBSCAN on the scaled coordinates and keep the per-point cluster labels.
db = DBSCAN(eps=0.35, min_samples=6).fit(cluster_data)
labels = db.labels_
labels

In [None]:
# Boolean mask that is True at the indices DBSCAN reports as core samples.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
# Attach the cluster label of every venue to the frame.
labels = db.labels_
nwm_df["Cluster_DB"]=labels

# Number of distinct labels, without and with the -1 label.
realClusterNum=len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))

In [None]:
# Show which cluster label each restaurant received.
nwm_df[['Neighborhood', 'Venue Name', 'Cluster_DB']]

In [None]:
# Distinct cluster labels produced by DBSCAN.
set(labels)

In [None]:
# Re-store the labels as plain Python ints and derive the cluster count as
# the largest label plus one.
int_labels = nwm_df['Cluster_DB'].astype(int)
nwm_df['Cluster_DB'] = int_labels.tolist()
k_clusters = nwm_df['Cluster_DB'].max() + 1
k_clusters

In [None]:
# Number of restaurants per cluster label, largest cluster first.
(
    nwm_df
    .groupby('Cluster_DB')['Venue Name']
    .count()
    .sort_values(ascending=False)
    .to_frame()
)

Let's visualize the results

In [None]:
# One tab20 colour per cluster label 0 .. k_clusters-1. The palette size is
# taken from k_clusters directly; the former scratch arrays `x` / `ys` were
# only used for their length and overwrote the longitude array named `ys`.
colors_array = cm.tab20(np.linspace(0, 1, k_clusters))
rainbow = [col.rgb2hex(i) for i in colors_array]
# Points labelled -1 get their own colour. Indexing the palette with -1 used
# to give them the colour of the last cluster, and raised IndexError when the
# palette was empty.
noise_color = '#000000'
db_map = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lon, neigh, cluster in zip(nwm_df['Venue Latitude'], nwm_df['Venue Longitude'],
                                    nwm_df['Neighborhood'], nwm_df['Cluster_DB']):
    label = folium.Popup(str(neigh) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color = 'purple',
        fill=True,
        fill_color=rainbow[cluster] if cluster >= 0 else noise_color,
        fill_opacity=0.7).add_to(db_map)
db_map

Let's improve the looks

In [None]:
# One tab20 colour per cluster label 0 .. k_clusters-1. The palette size is
# taken from k_clusters directly; the former scratch arrays `x` / `ys` were
# only used for their length.
colors_array = cm.tab20(np.linspace(0, 1, k_clusters))
rainbow = [col.rgb2hex(i) for i in colors_array]
# Points labelled -1 get their own colour. Indexing the palette with -1 used
# to give them the colour of the last cluster, and raised IndexError when the
# palette was empty.
noise_color = '#000000'
# Choropleth layer: shades each `geo_toronto` feature by the 'Population'
# value whose 'Neighbourhood Number' matches the feature's AREA_SHORT_CODE.
business_improvement_map.choropleth(
    geo_data=geo_toronto,
    data = new_df,
    columns=['Neighbourhood Number','Population'],
    key_on='feature.properties.AREA_SHORT_CODE',
    line_weight = 0.2,
    fill_color='YlOrBr',
    fill_opacity= 0.3,
    line_opacity= 0.2,
    legend_name='Population of African origins',
    name = 'Choropleth Map of Neighborhoods 140')
# One marker per restaurant, filled with the colour of its cluster.
for lat, lon, neigh, cluster in zip(nwm_df['Venue Latitude'], nwm_df['Venue Longitude'],
                                    nwm_df['Neighborhood'], nwm_df['Cluster_DB']):
    label = folium.Popup(str(neigh) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='purple',
        fill=True,
        fill_color=rainbow[cluster] if cluster >= 0 else noise_color,
        fill_opacity=0.7).add_to(business_improvement_map)
business_improvement_map

We can notice 3 locations that can be considered candidates to have a new successful Moroccan restaurant:
* First Canadian Place, Underground city
* Alderwood, Long Branch
* Maryvale, Wexford