# Coursera Capstone Project Report: Python data analysis section

In [1]:
# Import dependencies
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

sns.set_style('ticks')

## Part 1: get the dataset with New York neighborhoods

In [2]:
# Get New York dataset from IBM.
# Downloaded with requests rather than by shelling out to wget, so the cell also runs where
# wget is not installed and a failed download raises instead of being silenced by -q.
newyork_data_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json'
newyork_download = requests.get(newyork_data_url, timeout=60)
newyork_download.raise_for_status()
with open('newyork_data.json', 'wb') as json_file:
    json_file.write(newyork_download.content)

In [3]:
# Load json file.
# JSON is UTF-8 encoded; state it explicitly so the result does not depend on the platform's
# default text encoding.
with open('newyork_data.json', encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

In [4]:
# The relevant data is in the 'features' key: a list with one entry per neighborhood.
# Each entry carries 'properties' (borough, name) and 'geometry' (coordinates), which is
# what the dataframe-building loop reads.
neighborhoods_data = newyork_data['features']

In [5]:
# Transform into a pandas dataframe. First create an empty dataframe that has only the
# column names; the rows are added from the json data in the next cell.
columns = ['borough', 'neighborhood', 'latitude', 'longitude']
df = pd.DataFrame(columns=columns)

In [6]:
# Add data from the json file.
# Rows are collected in a list and the dataframe is built once: DataFrame.append was removed
# in pandas 2.0, and it copied the whole frame on every iteration.
neighborhood_records = []
for data in neighborhoods_data:
    properties = data['properties']
    # Coordinates are stored as [longitude, latitude]
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_records.append({'borough': properties['borough'],
                                 'neighborhood': properties['name'],
                                 'latitude': neighborhood_latlon[1],
                                 'longitude': neighborhood_latlon[0]})

# Column names are spelled out so the column order does not depend on any other cell
df = pd.DataFrame(neighborhood_records,
                  columns=['borough', 'neighborhood', 'latitude', 'longitude'])

In [7]:
# Check the dataframe
df.head()

Unnamed: 0,borough,neighborhood,latitude,longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [8]:
# Query shape
df.shape

(306, 4)

In [9]:
# Check how many boroughs there are in New York
df['borough'].value_counts()

Queens           81
Brooklyn         70
Staten Island    63
Bronx            52
Manhattan        40
Name: borough, dtype: int64

In [10]:
# Get the coordinates of NYC
address = 'New York City, NY'
geolocator = Nominatim(user_agent='ny_explorer')
# A longer timeout than geopy's default makes the public Nominatim service less likely to time out
location = geolocator.geocode(address, timeout=10)
# geocode returns None when nothing matches; fail with a clear message instead of an
# AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

In [11]:
# Check
(latitude, longitude)

(40.7127281, -74.0060152)

### Visualization of NYC neighborhoods

In [12]:
# Base map centred on the geocoded NYC coordinates; one circle marker per neighborhood
map_nyc = folium.Map(location=[latitude, longitude], zoom_start=10)

# Each marker opens a "<neighborhood>, <borough>" popup when clicked
neighborhood_rows = zip(df['latitude'], df['longitude'], df['borough'], df['neighborhood'])
for lat, lng, borough, neighborhood in neighborhood_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_nyc)

In [13]:
# Display map
map_nyc

## Part 2: generate data on New York Venues

In [14]:
# Neighborhoods will be clustered using K-means according to venues fetched from foursquare.
# For foursquare, setting a radius of 500 m and a limit of 100.
#
# SECURITY: API credentials must not be stored in the notebook. They are read from environment
# variables (a missing variable raises KeyError naming it). The keys that used to be hardcoded
# in this cell have been exposed and should be revoked and regenerated.
client_id = os.environ['FOURSQUARE_CLIENT_ID']
client_secret = os.environ['FOURSQUARE_CLIENT_SECRET']
# Optional: the venue request in getnearbyvenues only sends client_id and client_secret
access_token = os.environ.get('FOURSQUARE_ACCESS_TOKEN')
version = '20180605'
limit = 100
radius = 500

In [15]:
# Pull the three neighborhood columns out of df so they can be iterated together
names, latitudes, longitudes = df['neighborhood'], df['latitude'], df['longitude']

In [16]:
# Function that loops over each neighborhood, sends a foursquare request, and gets venues
def getnearbyvenues(names, latitudes, longitudes):
    """Fetch the Foursquare venues around each neighborhood and return them as one dataframe.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood names and their coordinates; they are iterated together.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'neighborhood', 'neighborhood latitude',
        'neighborhood longitude', 'venue', 'venue latitude', 'venue longitude' and
        'venue category'. The frame is empty (but has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.

    Notes
    -----
    Reads the notebook-level settings client_id, client_secret, version, radius and limit.
    Venues that come back without any category are skipped, because the analysis that
    follows is based on the venue category.
    """
    venue_columns = ['neighborhood', 'neighborhood latitude',
                     'neighborhood longitude', 'venue',
                     'venue latitude', 'venue longitude', 'venue category']
    venue_rows = []

    # Looping through each neighborhood
    for name, lat, lng in zip(names, latitudes, longitudes):

        # API request; requests builds and escapes the query string from params
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={'client_id': client_id,
                    'client_secret': client_secret,
                    'll': '{},{}'.format(lat, lng),
                    'v': version,
                    'radius': radius,
                    'limit': limit},
            timeout=30)
        # Surface HTTP errors (bad credentials, quota exceeded) instead of failing later on a
        # missing key
        response.raise_for_status()

        groups = response.json()['response']['groups']
        results = groups[0]['items'] if groups else []

        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                continue
            venue_rows.append((name,
                               lat,
                               lng,
                               venue['name'],
                               venue['location']['lat'],
                               venue['location']['lng'],
                               categories[0]['name']))

    # Passing the column names to the constructor keeps the schema when no venue was collected
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [None]:
# Fetch the venues around every neighborhood. getnearbyvenues sends one Foursquare request
# per neighborhood, so this cell needs network access and valid API credentials.
nyc_venues = getnearbyvenues(names, latitudes, longitudes)

In [None]:
# Quick view of the data and shape of the dataframe
nyc_venues

In [None]:
# Check for repeats. One would expect that big brand names are repeated many times over across the city
nyc_venues['venue'].value_counts()

In [None]:
# Count the rows per neighborhood, then show only the 'venue' column of that table
venues_by_neighborhood = nyc_venues.groupby('neighborhood')
grouptab = venues_by_neighborhood.count()
grouptab.loc[:, ['venue']]

In [None]:
# The shape of this dataframe gives us the total number of neighborhoods
grouptab.shape

In [None]:
# One-hot encode the venue category: one indicator column per category, one row per venue
nyc_onehot = pd.get_dummies(nyc_venues[['venue category']], prefix='', prefix_sep='')

# Attach the neighborhood of each venue as a new (last) column ...
nyc_onehot['neighborhood'] = nyc_venues['neighborhood']

# ... and move that last column to the front
last_column = nyc_onehot.columns[-1]
other_columns = list(nyc_onehot.columns[:-1])
nyc_onehot = nyc_onehot[[last_column] + other_columns]

In [None]:
nyc_onehot.head()

In [None]:
# Average the indicator columns within each neighborhood, then turn the neighborhood
# index back into an ordinary column
category_means = nyc_onehot.groupby('neighborhood').mean()
nyc_onehot_grp = category_means.reset_index()

In [None]:
nyc_onehot_grp.head()

In [None]:
# Rank the venue categories of one neighborhood
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the num_top_venues largest values in row.

    The first entry of row is skipped (it holds the neighborhood name); the remaining
    entries are sorted in descending order and the labels of the leading ones are
    returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
# Dataframe with top 10 venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create the column names according to the number of top venues: 1st, 2nd, 3rd, then 4th, 5th, ...
# The suffix is chosen explicitly instead of relying on a bare except around an IndexError.
columns = ['neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} most common venue'.format(rank, suffix))

# Rank the categories of every neighborhood, then build the dataframe in one step
top_venues = [return_most_common_venues(nyc_onehot_grp.iloc[ind, :], num_top_venues)
              for ind in range(nyc_onehot_grp.shape[0])]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:],
                                           index=nyc_onehot_grp.index)
neighborhoods_venues_sorted.insert(0, 'neighborhood', nyc_onehot_grp['neighborhood'])

neighborhoods_venues_sorted.head()

In [None]:
# Visualize the shape of the above dataframe
neighborhoods_venues_sorted.shape

## Part 3: fitting a K-means clustering algorithm

In [None]:
# A K-means model will be trained below, using k = 5

# Select number of clusters for neighborhoods
kclusters = 5

# Prepare a new dataframe for model fitting: only the category columns, without the name.
# The axis is given by keyword because the positional form drop(label, 1) was removed in pandas 2.0.
nyc_grp_clust = nyc_onehot_grp.drop(columns='neighborhood')

In [None]:
nyc_grp_clust.head()

In [None]:
# Fit the model.
# n_init is pinned to 10 (the long-standing default): newer scikit-learn releases changed the
# default to 'auto', so leaving it implicit makes the clusters depend on the installed version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(nyc_grp_clust)

In [None]:
# Check the first ten labels
kmeans.labels_[0:10]

In [None]:
# Check the number of labels
kmeans.labels_.size

In [None]:
# Paste those cluster labels into the top-venues table.
# insert() raises if the column already exists, so overwrite it when this cell is re-run.
if 'cluster labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['cluster labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'cluster labels', kmeans.labels_)

# Left-join onto the neighborhood table; join returns a new dataframe, so df is not modified.
# Neighborhoods without a match get NaN in the joined columns.
df_nyc_merged = df.join(neighborhoods_venues_sorted.set_index('neighborhood'), on='neighborhood')

In [None]:
# Keep only the rows that have no missing value in any column
df_nyc_merged_dropna = df_nyc_merged.dropna(axis=0, how='any')

In [None]:
# Cast the cluster labels to integers.
# astype with a column mapping returns a new dataframe, which avoids assigning into the result
# of dropna() (that assignment can trigger pandas' SettingWithCopyWarning).
df_nyc_merged_dropna = df_nyc_merged_dropna.astype({'cluster labels': int})

In [None]:
# Check the updated dataframe 
df_nyc_merged_dropna.head()

In [None]:
# Check how many neighborhoods in each cluster
df_nyc_merged_dropna['cluster labels'].value_counts()

## Visualization of clusters and analysis

In [None]:
# Visualize clusters in the map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Markers
for lat, lon, poi, cluster in zip(df_nyc_merged_dropna['latitude'],
                                  df_nyc_merged_dropna['longitude'],
                                  df_nyc_merged_dropna['neighborhood'],
                                  df_nyc_merged_dropna['cluster labels']):
    # Cluster labels start at 0, so they index the colour list directly (indexing with
    # cluster-1 made label 0 wrap around to the last colour)
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

In [None]:
# Render map
map_clusters

In [None]:
# Separate each cluster into its own dataframe.
# Boolean indexing selects the rows directly; where(...).dropna() first blanked every other
# row with NaN, which also turned the integer cluster labels into floats.
df_cluster_0, df_cluster_1, df_cluster_2, df_cluster_3, df_cluster_4 = (
    df_nyc_merged_dropna.loc[df_nyc_merged_dropna['cluster labels'] == cluster_label]
    for cluster_label in range(5)
)

In [None]:
# Visualize each
df_cluster_0

In [None]:
df_cluster_0.shape

In [None]:
df_cluster_1

In [None]:
df_cluster_2

In [None]:
df_cluster_3

In [None]:
df_cluster_4

In [None]:
# Clusters 0 and 2 are much larger than clusters 1, 3, and 4, so the analysis will focus on contrasting
# clusters 0 and 2. Clusters 1, 3, and 4 will be treated as outlier neighborhoods, as they comprise only 5 neighborhoods.

In [None]:
# Collect the two clusters being compared (0 and 2) in a list so the same function can be
# applied to both
dflist = [df_cluster_0, df_cluster_2]

In [None]:
# Define a function that takes a dataframe and gets the mode for each of the top 10 most common venue columns
def getvenues(df):
    """Return the most frequent value of each 'Nth most common venue' column of df.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '1st most common venue' through '10th most common venue'.

    Returns
    -------
    list
        Ten values, one per column, in rank order. When a column has several tied modes,
        the first one reported by Series.mode() is used, so the result always has one entry
        per column. An empty list is returned if any column has no mode (for example when
        df has no rows).
    """
    ordinals = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']
    venue_modes = [df['{} most common venue'.format(ordinal)].mode() for ordinal in ordinals]

    if any(modes.empty for modes in venue_modes):
        return []

    return [modes.iloc[0] for modes in venue_modes]

In [None]:
# Apply getvenues to each dataframe in dflist and unpack the two resulting lists
d0, d2 = (getvenues(cluster_frame) for cluster_frame in dflist)

In [None]:
(len(d0), len(d2))

In [None]:
# Put the two lists of modes side by side, one column per cluster
mode_columns = {'cluster 0': d0, 'cluster 2': d2}
popular_venues_0_2 = pd.DataFrame(data=mode_columns)

In [None]:
popular_venues_0_2

In [None]:
# Hand-counted tallies taken from the table of modes above, grouped into three broad
# categories. Cluster 0 is called "Cluster A" and cluster 2 "Cluster B".
category_labels = ['Food', 'Shops', 'Transportation']
counts = {
    'labels': category_labels,
    'cluster A': [2, 5, 3],
    'cluster B': [8, 2, 0],
}

In [None]:
# Turn the counts dictionary into a dataframe with one column per key
categories = pd.DataFrame.from_dict(counts)

In [None]:
# Visualize the dataframe
categories

In [None]:
# Generate a bar chart
%matplotlib inline
import matplotlib.pyplot as plt
categories.plot(x='labels', 
                y=['cluster A', 'cluster B'], 
                kind='bar', figsize = (12,8));
plt.xticks(rotation=45);
plt.xlabel('Categories')
plt.ylabel('Share of the top 10 most common venues')
plt.legend(['cluster A: 16 neighborhoods', 'cluster B: 284 neighborhoods']);
plt.tight_layout();