# Capstone Project 
## Comparing Toronto and New York City
##### Let's start by installing and importing the libraries required along the project

In [1]:
# Importing and installing the neccesary modules
!conda install -c conda-forge folium=0.5.0 --yes
!conda install -c conda-forge geopy --yes 

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  52.09 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  28.72 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  28.79 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  35.60 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |###################

In [2]:
# Importing and installing the neccesary modules
import pandas as pd
import numpy as np
import requests
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
import json
from pandas.io.json import json_normalize
pd.options.mode.chained_assignment = None
print('Libraries imported.')

Libraries imported.


In [3]:
# The code was removed by Watson Studio for sharing.

With the next cell of code, we could easily get the nearby venues of the dataframe we pass to the function

In [4]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### New York City Data

Let's start to download the data for New York City!

In [5]:
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
neighborhoods_data = newyork_data['features']
column_names = ['Borough', 'Neighborhood', 'City', 'Latitude', 'Longitude']
neighborhoods = pd.DataFrame(columns=column_names)
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'City': 'New York City',
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [6]:
# Coordinates of the NYC
NYClatitude=40.730862
NYClongitude=-73.987156
neighborhoods

Unnamed: 0,Borough,Neighborhood,City,Latitude,Longitude
0,Bronx,Wakefield,New York City,40.894705,-73.847201
1,Bronx,Co-op City,New York City,40.874294,-73.829939
2,Bronx,Eastchester,New York City,40.887556,-73.827806
3,Bronx,Fieldston,New York City,40.895437,-73.905643
4,Bronx,Riverdale,New York City,40.890834,-73.912585
5,Bronx,Kingsbridge,New York City,40.881687,-73.902818
6,Manhattan,Marble Hill,New York City,40.876551,-73.910660
7,Bronx,Woodlawn,New York City,40.898273,-73.867315
8,Bronx,Norwood,New York City,40.877224,-73.879391
9,Bronx,Williamsbridge,New York City,40.881039,-73.857446


In [7]:
# New York Map
map_newyork = folium.Map(location=[NYClatitude, NYClongitude], zoom_start=10)
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

#### Toronto Data

In [8]:
text=pd.read_html('http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',flavor='bs4') # Extracting the data with Beautiful Soup using read_html
df=text[0] # Extracting the first table of the file
df.columns=df.iloc[0] # Reseting the columns' name so the first row becomes the names
df.drop(0,axis=0,inplace=True) # Removing the first row so we don't have duplicates
df.rename(columns={'Postcode':'PostalCode','Neighbourhood':'Neighborhood'},inplace=True) # Changing the name of Postcode to PostalCode
df=df[df.Borough != 'Not assigned'] # Dropping the rows whose Borough is not defined
df.reset_index(drop=True,inplace=True) # Reseting the index
for i in df.index:     # Loop to assign the value of the neighbourhood if not assigned to the same Borough.
    if df.Neighborhood[i] == 'Not assigned':
        df.Neighborhood[i] = df.Borough[i]
df=df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(lambda Neighbourhood: ', '.join(Neighbourhood)).to_frame() # Grouping the results by Borough and joining the results 
df.reset_index(inplace=True) # Reseting the index again
latlng=pd.read_csv('https://cocl.us/Geospatial_data') # Downloading the data from a csv file
df['City']='Toronto'
df['Latitude']=0.0      # To define the new columns. 0.0 to determine that we are using floats.
df['Longitude']=0.0     # To define the new columns. 0.0 to determine that we are using floats.
# Comparing both dataframes and copy data
for i in df.index:
    for j in latlng.index:
        if df['PostalCode'][i]==latlng['Postal Code'][j]:
            df['Latitude'][i]=latlng['Latitude'][j]
            df['Longitude'][i]=latlng['Longitude'][j]
df.drop('PostalCode',axis=1,inplace=True)

In [9]:
# Coordinates of Toronto
TORlatitude=43.6532
TORlongitude=-79.3832
df

Unnamed: 0,Borough,Neighborhood,City,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",Toronto,43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",Toronto,43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",Toronto,43.763573,-79.188711
3,Scarborough,Woburn,Toronto,43.770992,-79.216917
4,Scarborough,Cedarbrae,Toronto,43.773136,-79.239476
5,Scarborough,Scarborough Village,Toronto,43.744734,-79.239476
6,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",Toronto,43.727929,-79.262029
7,Scarborough,"Clairlea, Golden Mile, Oakridge",Toronto,43.711112,-79.284577
8,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",Toronto,43.716316,-79.239476
9,Scarborough,"Birch Cliff, Cliffside West",Toronto,43.692657,-79.264848


In [10]:
map_toronto = folium.Map(location=[TORlatitude, TORlongitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{} - {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Toronto and New York City

From now on, both dataframes will be merged to continue the project with only one dataframe, featuring Borough, Neighborhood, City, Latitude and Longitude of each neighborhood.
In the case of the Toronto map, they will be organized according to the postal code associated, so the study will be held for a group of neighborhoods altogether

In [11]:
neighs=df.append(neighborhoods)
neighs.reset_index(inplace=True)
neighs.drop('index',axis=1,inplace=True)
neighs

Unnamed: 0,Borough,Neighborhood,City,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",Toronto,43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",Toronto,43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",Toronto,43.763573,-79.188711
3,Scarborough,Woburn,Toronto,43.770992,-79.216917
4,Scarborough,Cedarbrae,Toronto,43.773136,-79.239476
5,Scarborough,Scarborough Village,Toronto,43.744734,-79.239476
6,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",Toronto,43.727929,-79.262029
7,Scarborough,"Clairlea, Golden Mile, Oakridge",Toronto,43.711112,-79.284577
8,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",Toronto,43.716316,-79.239476
9,Scarborough,"Birch Cliff, Cliffside West",Toronto,43.692657,-79.264848


In [12]:
neighs_venues=getNearbyVenues(names=neighs['Neighborhood'],latitudes=neighs['Latitude'],longitudes=neighs['Longitude'])

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West, Steeles West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The D

### One hot encoding

In [14]:
neighs_onehot=pd.get_dummies(neighs_venues[['Venue Category']], prefix="", prefix_sep="")
neighs_onehot['Neighborhood']=neighs_venues['Neighborhood']
fixed_columns = [neighs_onehot.columns[-1]]+list(neighs_onehot.columns[:-1])
neighs_onehot=neighs_onehot[fixed_columns]
neighs_group=neighs_onehot.groupby('Neighborhood').mean().reset_index()

In [15]:
neighs_group

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Warehouse Store,Waste Facility,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.010000,0.000000,0.000000,0.010000
1,Agincourt,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Alderwood, Long Branch",0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
5,Allerton,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
6,Annadale,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
7,Arden Heights,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
8,Arlington,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
9,Arrochar,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000


Now it is time to cluster each neighborhood with its similars

In [16]:
kclusters = 20
neighs_clustering=neighs_group.drop('Neighborhood',1)
kmeans=KMeans(n_clusters=kclusters, random_state=0).fit(neighs_clustering)
kmeans.labels_[0:10]

array([16, 16,  8,  6,  7,  6,  7,  7, 19, 10], dtype=int32)

Now, it could be grouped the whole dataframe obtaining BOROUGH, NEIGHBORHOOD, LAT, LONG, AND CLUSTER LABEL

In [17]:
neighs_preinter=neighs_group.copy()
neighs_preinter['Cluster Labels']=kmeans.labels_
neighs_inter=neighs_preinter[['Neighborhood','Cluster Labels']]
neighs_inter2=neighs.copy()
neighs_inter2.sort_values('Neighborhood',inplace=True)
neighs_inter2.reset_index(inplace=True)
neighs_inter2.drop('index',axis=1,inplace=True)
neighs_final=neighs_inter2.merge(neighs_inter, on='Neighborhood',how='outer')
neighs_final.dropna(axis=0,inplace=True)
neighs_final['Cluster Labels']=neighs_final['Cluster Labels'].astype('int64')
neighs_final

Unnamed: 0,Borough,Neighborhood,City,Latitude,Longitude,Cluster Labels
0,Downtown Toronto,"Adelaide, King, Richmond",Toronto,43.650571,-79.384568,16
1,Scarborough,Agincourt,Toronto,43.794200,-79.262029,16
2,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",Toronto,43.815252,-79.284577,8
3,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",Toronto,43.739416,-79.588437,6
4,Etobicoke,"Alderwood, Long Branch",Toronto,43.602414,-79.543484,7
5,Bronx,Allerton,New York City,40.865788,-73.859319,6
6,Staten Island,Annadale,New York City,40.538114,-74.178549,7
7,Staten Island,Arden Heights,New York City,40.549286,-74.185887,7
8,Staten Island,Arlington,New York City,40.635325,-74.165104,19
9,Staten Island,Arrochar,New York City,40.596313,-74.067124,10


## Map with the Clusters

In [18]:
maplat=(NYClatitude+TORlatitude)/2
maplong=(NYClongitude+TORlongitude)/2
map_clusters = folium.Map(location=[maplat, maplong], zoom_start=7.25)
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors = []
for lat, lon, poi, cluster in zip(neighs_final['Latitude'], neighs_final['Longitude'], neighs_final['Neighborhood'], neighs_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=1,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Final part, selecting the similar neighborhoods to yours
A defined function will help to find every Neighborhood and its associates in the other city, so, if you select a neighborhood from Toronto, the map will open in New York and viceversa.
If the search is not very accurate, the number of clusters could be increased or decreased (depending on if we have plenty of neighborhoods or no neighborhoods at all) and find as many neighborhoods as one desires.

In [19]:
def myNeighborhood(inp):
    neighborhoodtofind=input('Type your neighborhood: ')
    value=neighs_final[neighs_final['Neighborhood'].str.contains(neighborhoodtofind)]
    num=value.iloc[0]['Cluster Labels']
    cit=value.iloc[0]['City']
    if cit == 'New York City':
        finlat=TORlatitude
        finlong=TORlongitude
    else:
        finlat=NYClatitude
        finlong=NYClongitude
    similarneighs=neighs_final[neighs_final['Cluster Labels']==num]
    final_map = folium.Map(location=[finlat, finlong], zoom_start=11)
    for lat, lon, poi, bor in zip(similarneighs['Latitude'], similarneighs['Longitude'], similarneighs['Neighborhood'], similarneighs['Borough']):
        label = folium.Popup(str(poi)+str(bor), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=3,
            popup=label,
            color='black',
            fill=True,
            fill_color='white',
            fill_opacity=0.7).add_to(final_map)
    final_map
    return(final_map)

###### THREE EXAMPLES OF THE CODE

In [20]:
myNeighborhood(0)

Type your neighborhood: Queen's Park


In [21]:
myNeighborhood(1)

Type your neighborhood: Noho


In [22]:
myNeighborhood(2)

Type your neighborhood: Willowbrook
