In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np

import json # library to handle JSON files
!pip install geopy
!pip install folium
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# import k-means from clustering stage
from sklearn.cluster import KMeans

# import MinMaxScaler for min-max normalization
from sklearn.preprocessing import MinMaxScaler

import folium # map rendering library
from folium.plugins import HeatMap # for HeatMap function
import seaborn as sns # 

from PIL import Image # for importing image
from matplotlib.pyplot import imshow # for ploting image on the jupyter environment

print('Libraries imported.')



usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: # uncomment this line if you haven't completed the Foursquare API lab


Libraries imported.


### Exploring Zadar neighborhoods

First, taking into consideration that there are no geocoordinates of the Zadar neighborhoods, they needed to be obtained through alternate sources. So, I manually found and extracted latitude and longitude values for each neighborhood in the city, creating an xlsx file named Zadar_neighborhoods. There are in total 26 neighborhoods.

In [2]:
zadar = pd.read_excel('Zadar_neighborhoods.xlsx')

In [3]:
print("This file has {} observations/neighborhoods and {} columns.".format(zadar.shape[0],zadar.shape[1]))
zadar.head()

This file has 25 observations/neighborhoods and 3 columns.


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Brodarica,44.123611,15.226111
1,Voštarnica,44.116944,15.235833
2,Peninsula,44.114444,15.225556
3,Plovanija,44.126111,15.248333
4,Špada,44.128889,15.24


In [4]:
CLIENT_ID = '00NKY30JGU43KLGX1UOXYHKP1Z3GMOVZ4LBBIOKKQ0KGULZ4' # your Foursquare ID
CLIENT_SECRET = '1VGIJXEMS4WGPEFXQ2EZYER5UZU3AGBU4KTGGR1S10LVREZ4' # your Foursquare Secret
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 00NKY30JGU43KLGX1UOXYHKP1Z3GMOVZ4LBBIOKKQ0KGULZ4
CLIENT_SECRET:1VGIJXEMS4WGPEFXQ2EZYER5UZU3AGBU4KTGGR1S10LVREZ4


In [5]:
address = 'Zadar'

geolocator = Nominatim(user_agent="zadar_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Zadar, Croatia are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Zadar, Croatia are 44.1186078, 15.232136.


##### Creating a Folium map of Zadar

In [6]:
# create map of Zadar using latitude and longitude values
zadar_map = folium.Map(location=[latitude, longitude], zoom_start=12.00)

# add markers to map
for lat, lng, neighborhood in zip(zadar['Latitude'], zadar['Longitude'], zadar['Neighborhood']): #zip for multiple objects iterations'
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng], # iterates over latitude and longitude
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='red',
        fill_opacity=0.7,
        parse_html=False).add_to(zadar_map) # same as append, but to follium maps.
    
zadar_map

In [7]:
def getNearbyVenues(names, latitudes, longitudes, radius=350, LIMIT = 500):
    
    venues_list=[] # initiates venues_list
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['icon']['prefix'].split("/")[5],
            v['venue']['categories'][0]['name']) for v in results])
        
      

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Main Venue Category',           
                  'Venue Category']
    
    return(nearby_venues)

In [8]:
zadar_venues = getNearbyVenues(names = zadar['Neighborhood'],
                                   latitudes = zadar['Latitude'],
                                   longitudes = zadar['Longitude']
                                  )

#the function getNearbyVenues print the names of the neighborhoods.

Brodarica
Voštarnica
Peninsula
Plovanija
Špada
Skročini
Bokanjac
Bili Brig
Crvene kuće
Bulevar
Stanovi
Arbanasi
Jazine
Borik
Puntamika
Mocire
Petrići
Belafuža
Maslina
Smiljevac
Ričina
Sinjoretovo
Gazenica
Višnjik
Diklo


In [9]:
print(zadar_venues.shape)
zadar_venues.head(20)

(144, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Main Venue Category,Venue Category
0,Brodarica,44.123611,15.226111,Slatka tajna,44.122089,15.227802,food,Donut Shop
1,Brodarica,44.123611,15.226111,Maraska Park,44.122748,15.224458,parks_outdoors,Garden
2,Brodarica,44.123611,15.226111,Palacinka bar,44.122658,15.224491,food,Restaurant
3,Brodarica,44.123611,15.226111,Djiga,44.122731,15.224572,parks_outdoors,Harbor / Marina
4,Brodarica,44.123611,15.226111,Marex,44.122187,15.223801,education,Cafeteria
5,Brodarica,44.123611,15.226111,Sfinga Park,44.126215,15.227031,parks_outdoors,Park
6,Brodarica,44.123611,15.226111,Restaurant Lungo Mare,44.125226,15.225334,food,Mediterranean Restaurant
7,Voštarnica,44.116944,15.235833,Fast food Papica,44.115216,15.232901,food,Fast Food Restaurant
8,Voštarnica,44.116944,15.235833,Mala Posta,44.116898,15.232496,travel,Bus Station
9,Voštarnica,44.116944,15.235833,da vinci,44.114765,15.23313,food,Café


In [15]:
#!pip install openpyxl 
# for exporting to excel



In [10]:
#exporting to excel file

zadar_venues.to_excel('zadar_venues.xlsx',index = None, header=True)

Getting venues for each neighborhood

In [11]:
zadar_count = zadar_venues.groupby(['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']).count().reset_index().iloc[:,0:4].rename(columns = {"Venue": "Counts"})
zadar_count.head(26)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Counts
0,Arbanasi,44.102222,15.241389,4
1,Belafuža,44.132222,15.234722,3
2,Bili Brig,44.119722,15.260833,2
3,Bokanjac,44.144167,15.245,1
4,Borik,44.133333,15.216389,8
5,Brodarica,44.123611,15.226111,7
6,Bulevar,44.110556,15.246667,1
7,Crvene kuće,44.112222,15.261111,1
8,Diklo,44.138333,15.218333,6
9,Gazenica,44.097222,15.273333,4


In [12]:
# Getting a mean number of venues per neighborhood
print('Mean number of venues: ' + str(round(zadar_count['Counts'].mean())))

Mean number of venues: 6


In [13]:
# unique venue categories

print('There are {} uniques categories.'.format(len(zadar_venues['Venue Category'].unique())))

There are 63 uniques categories.


### Analyzing neighborhoods

Analyzing each neighborhoods' main venue category

In [14]:
# one hot encoding
zadar_onehot_main = pd.get_dummies(zadar_venues[['Main Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
zadar_onehot_main[['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']] = zadar_venues[['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']]

# move neighborhood column to the first column
fixed_columns_1 = zadar_onehot_main.columns[[zadar_onehot_main.columns.get_loc("Neighborhood"),
                                           zadar_onehot_main.columns.get_loc('Neighborhood Latitude'),
                                           zadar_onehot_main.columns.get_loc('Neighborhood Longitude')]].tolist() 

fixed_columns_2 = zadar_onehot_main.columns[:-4].tolist() 

zadar_onehot_main = zadar_onehot_main[fixed_columns_1 + fixed_columns_2]

zadar_onehot_main.head(25)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,arts_entertainment,building,education,food,nightlife,parks_outdoors,shops
0,Brodarica,44.123611,15.226111,0,0,0,1,0,0,0
1,Brodarica,44.123611,15.226111,0,0,0,0,0,1,0
2,Brodarica,44.123611,15.226111,0,0,0,1,0,0,0
3,Brodarica,44.123611,15.226111,0,0,0,0,0,1,0
4,Brodarica,44.123611,15.226111,0,0,1,0,0,0,0
5,Brodarica,44.123611,15.226111,0,0,0,0,0,1,0
6,Brodarica,44.123611,15.226111,0,0,0,1,0,0,0
7,Voštarnica,44.116944,15.235833,0,0,0,1,0,0,0
8,Voštarnica,44.116944,15.235833,0,0,0,0,0,0,0
9,Voštarnica,44.116944,15.235833,0,0,0,1,0,0,0


Grouping by neighborhood - showing percentages

In [15]:
zadar_grouped_mean = zadar_onehot_main.groupby(['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']).mean().reset_index()
zadar_grouped_mean

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,arts_entertainment,building,education,food,nightlife,parks_outdoors,shops
0,Arbanasi,44.102222,15.241389,0.0,0.0,0.0,0.25,0.0,0.25,0.0
1,Belafuža,44.132222,15.234722,0.0,0.333333,0.0,0.0,0.333333,0.0,0.333333
2,Bili Brig,44.119722,15.260833,0.0,0.0,0.0,0.5,0.0,0.0,0.5
3,Bokanjac,44.144167,15.245,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Borik,44.133333,15.216389,0.125,0.0,0.0,0.625,0.0,0.0,0.125
5,Brodarica,44.123611,15.226111,0.0,0.0,0.142857,0.428571,0.0,0.428571,0.0
6,Bulevar,44.110556,15.246667,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,Crvene kuće,44.112222,15.261111,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,Diklo,44.138333,15.218333,0.166667,0.0,0.0,0.333333,0.166667,0.0,0.166667
9,Gazenica,44.097222,15.273333,0.0,0.0,0.0,0.25,0.0,0.0,0.75


In [16]:
#storing neighborhoods grouped by venues to excel file - percentages

zadar_grouped_mean.to_excel('zadar_grouped_venues.xlsx')

Grouping by neighborhood - showing numbers

In [17]:
zadar_grouped_numbers = zadar_onehot_main.groupby(['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']).sum().reset_index()
zadar_grouped_numbers.to_excel('zadar_grouped_num_main.xlsx')
zadar_grouped_numbers

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,arts_entertainment,building,education,food,nightlife,parks_outdoors,shops
0,Arbanasi,44.102222,15.241389,0,0,0,1,0,1,0
1,Belafuža,44.132222,15.234722,0,1,0,0,1,0,1
2,Bili Brig,44.119722,15.260833,0,0,0,1,0,0,1
3,Bokanjac,44.144167,15.245,0,0,0,0,0,0,1
4,Borik,44.133333,15.216389,1,0,0,5,0,0,1
5,Brodarica,44.123611,15.226111,0,0,1,3,0,3,0
6,Bulevar,44.110556,15.246667,0,0,0,0,0,1,0
7,Crvene kuće,44.112222,15.261111,0,0,0,0,1,0,0
8,Diklo,44.138333,15.218333,1,0,0,2,1,0,1
9,Gazenica,44.097222,15.273333,0,0,0,1,0,0,3


In [18]:
zadar_heat_map = folium.Map(location=[latitude, longitude], zoom_start=12.00)
zadar_heat_map.add_child(HeatMap(data=zadar_grouped_numbers[['Neighborhood Latitude','Neighborhood Longitude', 'food']].groupby(['Neighborhood Latitude','Neighborhood Longitude']).sum().reset_index().values.tolist(), radius=30, max_zoom=13, min_opacity = 0.4, max_val = 24,gradient={.7: 'green', .94: 'orange', 1: 'red'}))

In [19]:
num_top_venues = 5

for hood in zadar_grouped_mean['Neighborhood']:
    print("----"+hood+"----")
    temp = zadar_grouped_mean[zadar_grouped_mean['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Arbanasi----
                    venue   freq
0   Neighborhood Latitude  44.10
1  Neighborhood Longitude  15.24
2                    food   0.25
3          parks_outdoors   0.25
4      arts_entertainment   0.00


----Belafuža----
                    venue   freq
0   Neighborhood Latitude  44.13
1  Neighborhood Longitude  15.23
2                building   0.33
3               nightlife   0.33
4                   shops   0.33


----Bili Brig----
                    venue   freq
0   Neighborhood Latitude  44.12
1  Neighborhood Longitude  15.26
2                    food   0.50
3                   shops   0.50
4      arts_entertainment   0.00


----Bokanjac----
                    venue   freq
0   Neighborhood Latitude  44.14
1  Neighborhood Longitude  15.24
2                   shops   1.00
3      arts_entertainment   0.00
4                building   0.00


----Borik----
                    venue   freq
0   Neighborhood Latitude  44.13
1  Neighborhood Longitude  15.22
2                 

### Clustering

Selecting only the neighborhoods with more than 3 venues

In [29]:
zadar_grouped_numbers_10 = zadar_grouped_numbers[zadar_grouped_numbers["food"] >= 2]
zadar_food = zadar_grouped_numbers_10[["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude","food"]]

In [30]:
zadar_food

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,food
4,Borik,44.133333,15.216389,5
5,Brodarica,44.123611,15.226111,3
8,Diklo,44.138333,15.218333,2
10,Jazine,44.111111,15.234444,2
13,Peninsula,44.114444,15.225556,24
16,Puntamika,44.131389,15.206389,2
17,Ričina,44.105,15.252778,2
20,Smiljevac,44.109722,15.2525,3
23,Voštarnica,44.116944,15.235833,2


In [31]:
kclusters = 4

kl_clustering = zadar_food.drop(["Neighborhood"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(kl_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 3, 0, 0, 1, 0, 0, 3, 0])

In [32]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
zadar_merged = zadar_food.copy()

# add clustering labels
zadar_merged["Cluster Labels"] = kmeans.labels_

In [38]:
zadar_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
zadar_merged.head(9)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,food,Cluster Labels
4,Borik,44.133333,15.216389,5,2
5,Brodarica,44.123611,15.226111,3,3
8,Diklo,44.138333,15.218333,2,0
10,Jazine,44.111111,15.234444,2,0
13,Peninsula,44.114444,15.225556,24,1
16,Puntamika,44.131389,15.206389,2,0
17,Ričina,44.105,15.252778,2,0
20,Smiljevac,44.109722,15.2525,3,3
23,Voštarnica,44.116944,15.235833,2,0


In [34]:
zadar_neighborhoods = zadar_food.iloc[:,0].values
zadar_neighborhoods

array(['Borik', 'Brodarica', 'Diklo', 'Jazine', 'Peninsula', 'Puntamika',
       'Ričina', 'Smiljevac', 'Voštarnica'], dtype=object)

In [35]:
neighborhoodList = []
df_zadar_neighs = pd.DataFrame({"Neighborhood": zadar_neighborhoods})
df_zadar_neighs

Unnamed: 0,Neighborhood
0,Borik
1,Brodarica
2,Diklo
3,Jazine
4,Peninsula
5,Puntamika
6,Ričina
7,Smiljevac
8,Voštarnica


In [36]:
zadar_merged = zadar_merged.join(df_zadar_neighs.set_index("Neighborhood"), on="Neighborhood")

print(zadar_merged.shape)
zadar_merged.head() # check the last columns!

(9, 5)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,food,Cluster Labels
4,Borik,44.133333,15.216389,5,2
5,Brodarica,44.123611,15.226111,3,3
8,Diklo,44.138333,15.218333,2,0
10,Jazine,44.111111,15.234444,2,0
13,Peninsula,44.114444,15.225556,24,1


In [37]:
# create clusters map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(zadar_merged['Neighborhood Latitude'], zadar_merged['Neighborhood Longitude'], zadar_merged['Neighborhood'], zadar_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=25,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Conclusion: 4 separate clusters were created, as optimal number of clusters. The most restaurants are situated in the Peninsula, so in that sense it is not reccommended to open a restaurant there. However, the tourist circulation is the highest in that region, so further analysis of tourist movements should be observed to carry out this analysis in satisfying way. Since there is only 3 food restaurants in the neighborhood of Smiljevac, and 3 in Brodarica neighborhood, in between of them there aren't many restaurants (Jazine-Smiljevac-Stanovi-Višnjik quadrant). In that sense, since tourist mostly, when exiting the city core spend time (or have their apartments rented) there, it is logical that the potential restaurant is in that area.