# Coursera - IBM Data Science Professional Certification - Week4


### Import Modules to run analysis

In [1]:
import requests
from bs4 import BeautifulSoup as bs

import json # library to handle JSON files

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import geocoder

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library

import warnings
warnings.filterwarnings("ignore")

## A description of the problem and a discussion of the background.

In San Diego, California, a client has hired your firm to help them identify in which neighbourhood they should open the 2nd location of their very successful French Restaurant.  

The client has indicated they are happy with the success they have found in the neighbourhood of North Park and that they would like to open their 2nd location in the neighbourhood that is the most similar to North Park.  There are over 100 neighbourhoods in San Diego, so choosing which neighbourhood is the most similar won't be easy. 

## A description of the data

The following Wikipedia page contains a list of all of the more than 100 neighbourhoods in San Diego, with links to the Wikipedia pages for just about all of them.  

https://en.wikipedia.org/wiki/List_of_communities_and_neighborhoods_of_San_Diego

This list of neighborhoods will be used to create a table of latitude and longitude coordinates for each neighborhood, either by scraping the coordinates from the neighborhood's own Wikipedia page or by using the Nominatim geocoder from the geopy package in Python.  

These latitude and longitude coordinates will be used to obtain the Foursquare API data about the venue types in each neighborhood.  This Foursquare API data about the venues and venue categories will be used to identify which neighborhoods are the most similar to North Park.

An example of how the data will be used is outlined below, by grabbing the relevant data for the neighborhood of North Park.

### Functions to complete the analysis

In [2]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Collect the Foursquare venues found around each neighborhood.

    For every (name, latitude, longitude) triple the Foursquare
    ``venues/explore`` endpoint is queried and one output row is produced per
    venue in the first result group.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences describing the neighborhoods; they are consumed
        together with ``zip``.
    radius : int, default 500
        Search radius sent to Foursquare as the ``radius`` query parameter.
    limit : int, optional
        Maximum number of venues requested per neighborhood.  When omitted the
        notebook-level ``LIMIT`` constant is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``Neighborhoods``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        ``Venue Category`` is ``None`` for a venue that lists no category.
        The frame is empty (but still has these columns) when no venue is
        returned for any neighborhood.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers a request with an HTTP error status.
    """
    columns = ['Neighborhoods',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    # Resolve the notebook-level default at call time so the function can be
    # defined before the cell that sets LIMIT has run.
    if limit is None:
        limit = LIMIT

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting
        # the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params,
                                timeout=30)
        # Surface HTTP failures here rather than as a KeyError on the payload.
        response.raise_for_status()

        groups = response.json()['response']['groups']
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # A venue may come back without any category; record None instead
            # of failing on categories[0].
            category = categories[0]['name'] if categories else None
            rows.append((name,
                         lat,
                         lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         category))

    # Passing the column names to the constructor keeps the result well formed
    # even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [4]:
# function to get the lat/long of each neighborhood
def get_lat_long(address, user_agent="san_diego_neighborhood_explorer"):
    """Geocode ``address`` with Nominatim and return ``(latitude, longitude)``.

    Parameters
    ----------
    address : str
        Free-form address or place name, e.g. ``"North Park, San Diego, CA"``.
    user_agent : str, optional
        Identifier passed to the ``Nominatim`` geocoder.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the geocoder's match.

    Raises
    ------
    ValueError
        If the geocoder finds no match for ``address``.
    """
    # Nominatim expects every client to identify itself; recent geopy releases
    # refuse to build the geocoder without a custom user agent.
    geolocator = Nominatim(user_agent=user_agent)
    location = geolocator.geocode(address)
    if location is None:
        # geocode() returns None when nothing matches; say so explicitly
        # instead of failing later with an AttributeError on None.
        raise ValueError("Could not geocode address: {!r}".format(address))
    return location.latitude, location.longitude

In [14]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category recorded for a venue row.

    The category data is read from ``row['categories']`` and, when that key is
    absent, from ``row['venue.categories']``.

    Parameters
    ----------
    row : mapping or pandas.Series
        One venue record.

    Returns
    -------
    str or None
        The ``'name'`` of the first category entry; ``None`` when the value is
        missing or empty.  A value that is already a plain string is returned
        unchanged, so applying the function twice to the same column is
        harmless.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key should trigger the fallback; any other error is a
        # genuine problem and must not be swallowed.
        categories_list = row['venue.categories']

    if isinstance(categories_list, str):
        # already reduced to a category name by an earlier pass
        return categories_list

    # None and NaN (a float) have no length; treat them like an empty list.
    if (categories_list is None
            or not hasattr(categories_list, '__len__')
            or len(categories_list) == 0):
        return None
    return categories_list[0]['name']

### Foursquare credentials

In [3]:
import os  # local import: only this cell needs it, to read the credentials

# Foursquare credentials are read from the environment so that real keys are
# never stored in the notebook; the placeholders are used when the variables
# are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'your_client_id_here') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'your_client_secret_here') # your Foursquare Secret
VERSION = '20180604' # sent to Foursquare as the `v` query parameter
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Cell output is saved with the notebook, so show only a mask of the secret.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

Your credentails:
CLIENT_ID: your_client_id_here
CLIENT_SECRET:your_client_secret_here


### We are looking to find the neighborhood that is the most similar to North Park according to the venue information contained in the Foursquare API venue data.
1.  Start by getting the latitude and longitude of North Park to be used as input into the Foursquare API

In [5]:
# Geocode the reference neighborhood; its coordinates drive the Foursquare query
address = "North Park, San Diego, CA"
(neighborhood_latitude, neighborhood_longitude) = get_lat_long(address)

#### Gather the category information about each venue in North Park, within 500 meters of the neighborhood's lat/long

In [6]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
explore_url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
url = explore_url_template.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

# Display the URL with the credentials masked: cell output is saved with the
# notebook, so showing `url` itself would leak the client id and secret.
explore_url_template.format(
    '<CLIENT_ID>',
    '<CLIENT_SECRET>',
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

'https://api.foursquare.com/v2/venues/explore?&client_id=your_client_id_here&client_secret=your_client_secret_here&v=20180604&ll=32.7408842,-117.1305877&radius=500&limit=100'

### Convert the JSON results of the North Park venue category data to a DataFrame

In [22]:
# Bound the wait and surface HTTP failures explicitly; otherwise an error
# response would presumably show up only as a KeyError when the payload is
# indexed below.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

# the venues are the items of the first result group
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON
print(nearby_venues.shape)
nearby_venues.head()

(24, 28)


Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.delivery.id,venue.delivery.provider.icon.name,venue.delivery.provider.icon.prefix,venue.delivery.provider.icon.sizes,venue.delivery.provider.name,venue.delivery.url,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,venue.location.distance,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.neighborhood,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups,venue.venuePage.id
0,0,"[{'reasonName': 'globalInteractionReason', 'ty...",e-0-56391bfacd105fac6c403c63-0,"[{'primary': True, 'id': '4bf58dd8d48988d1ce94...",,,,,,,56391bfacd105fac6c403c63,3382 30th St,US,San Diego,United States,,117,"[3382 30th St, San Diego, CA 92104, United Sta...","[{'label': 'display', 'lng': -117.129334340072...",32.740871,-117.129334,,92104,CA,Pete's Seafood & Sandwich,0,[],
1,0,"[{'reasonName': 'globalInteractionReason', 'ty...",e-0-536add8e11d21c7853ef0883-1,"[{'primary': True, 'id': '50327c8591d4c4b30a58...",,,,,,,536add8e11d21c7853ef0883,3000 Upas St,US,San Diego,United States,at 30th St,91,"[3000 Upas St (at 30th St), San Diego, CA 9210...","[{'label': 'display', 'lng': -117.129948735237...",32.741511,-117.129949,"North Park, San Diego, CA",92104,CA,Modern Times Flavordome,0,[],
2,0,"[{'reasonName': 'globalInteractionReason', 'ty...",e-0-577d9c47498e861ea1dbf937-2,"[{'primary': True, 'id': '4bf58dd8d48988d16c94...",,,,,,,577d9c47498e861ea1dbf937,3501 30th St,US,San Diego,United States,Myrtle Ave,165,"[3501 30th St (Myrtle Ave), San Diego, CA 9210...","[{'label': 'display', 'lng': -117.129959960439...",32.742278,-117.12996,,92104,CA,Dunedin North Park,0,[],
3,0,"[{'reasonName': 'globalInteractionReason', 'ty...",e-0-480262c5f964a520f14e1fe3-3,"[{'primary': True, 'id': '4bf58dd8d48988d1ca94...",,,,,,,480262c5f964a520f14e1fe3,3448 30th St,US,San Diego,United States,at Myrtle Ave,130,"[3448 30th St (at Myrtle Ave), San Diego, CA 9...","[{'label': 'display', 'lng': -117.130176723003...",32.742005,-117.130177,,92104,CA,Lefty's Chicago Pizzeria,0,[],
4,0,"[{'reasonName': 'globalInteractionReason', 'ty...",e-0-4b8348aef964a520b00031e3-4,"[{'primary': True, 'id': '4bf58dd8d48988d10c94...",,,,,,,4b8348aef964a520b00031e3,3408 30th St,US,San Diego,United States,Upas,66,"[3408 30th St (Upas), San Diego, CA 92104, Uni...","[{'label': 'display', 'lng': -117.130247645030...",32.741409,-117.130248,,92104,CA,The Smoking Goat,0,[],


### Grab only the relevant data

In [12]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
nearby_venues.shape

(24, 4)

In [15]:
# Reduce each venue's category records to the name of its first category
venue_categories = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = venue_categories

In [16]:
# Keep only the part of each column label after its last dot
# (e.g. 'venue.location.lat' becomes 'lat')
nearby_venues.columns = [label.rpartition(".")[2] for label in nearby_venues.columns]

### Take a look at the first few rows of cleaned up North Park venue data 

In [17]:
# preview the first five cleaned rows
nearby_venues.head(n=5)

Unnamed: 0,name,categories,lat,lng
0,Pete's Seafood & Sandwich,Seafood Restaurant,32.740871,-117.129334
1,Modern Times Flavordome,Brewery,32.741511,-117.129949
2,Dunedin North Park,Burger Joint,32.742278,-117.12996
3,Lefty's Chicago Pizzeria,Pizza Place,32.742005,-117.130177
4,The Smoking Goat,French Restaurant,32.741409,-117.130248


### Take a quick look at the category breakdown for North Park

In [18]:
# number of venues per category, most frequent first
category_counts = nearby_venues['categories'].value_counts()
category_counts

Café                       3
Italian Restaurant         2
Thai Restaurant            2
French Restaurant          1
Seafood Restaurant         1
Breakfast Spot             1
Flower Shop                1
Taco Place                 1
Coffee Shop                1
Noodle House               1
Burger Joint               1
Brewery                    1
Pizza Place                1
Liquor Store               1
Park                       1
Antique Shop               1
Fast Food Restaurant       1
New American Restaurant    1
Lounge                     1
Pet Store                  1
Name: categories, dtype: int64

### One-hot encode the venue categories for North Park

In [21]:
# one hot encoding
northpark_onehot = pd.get_dummies(nearby_venues[['categories']], prefix="", prefix_sep="")

# add neighborhood column to dataframe
northpark_onehot['Neighborhood'] = 'North Park'

# move neighborhood column to the first column
fixed_columns = [northpark_onehot.columns[-1]] + list(northpark_onehot.columns[:-1])
northpark_onehot = northpark_onehot[fixed_columns]

print(northpark_onehot.shape)
northpark_onehot.head()

(24, 21)


Unnamed: 0,Neighborhood,Antique Shop,Breakfast Spot,Brewery,Burger Joint,Café,Coffee Shop,Fast Food Restaurant,Flower Shop,French Restaurant,Italian Restaurant,Liquor Store,Lounge,New American Restaurant,Noodle House,Park,Pet Store,Pizza Place,Seafood Restaurant,Taco Place,Thai Restaurant
0,North Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,North Park,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,North Park,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,North Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,North Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Get the mean score for each venue type

In [23]:
# Averaging the one-hot rows gives each category's share of the neighborhood's venues
northpark_grouped = northpark_onehot.groupby('Neighborhood').mean()
northpark_grouped

Unnamed: 0_level_0,Antique Shop,Breakfast Spot,Brewery,Burger Joint,Café,Coffee Shop,Fast Food Restaurant,Flower Shop,French Restaurant,Italian Restaurant,Liquor Store,Lounge,New American Restaurant,Noodle House,Park,Pet Store,Pizza Place,Seafood Restaurant,Taco Place,Thai Restaurant
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
North Park,0.041667,0.041667,0.041667,0.041667,0.125,0.041667,0.041667,0.041667,0.041667,0.083333,0.041667,0.041667,0.041667,0.041667,0.041667,0.041667,0.041667,0.041667,0.041667,0.083333


### This process will be repeated  for all 100+ neighborhoods in San Diego
- the data will be aggregated into a single DataFrame and grouped by neighborhood using the mean value for each venue category in that neighborhood.
- The neighborhoods will then be clustered using Kmeans to locate those neighborhoods which are most similar to North Park.