# Capstane Project for the Battle of Neighboorhoods

## <span style="color:blue"> With limited time in hand, we compare the top tourist spots in European cities and advise on which city to go to and what route to take </span>

### Importing and installing important libraries

In [1]:
#Importing and installing important libraries
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import tkinter as tk


#Installing further tools
! pip install beautifulsoup4
! pip install lxml
! pip install html5lib
! pip install conda
! pip install xlrd
! pip install geopy

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/3b/c8/a55eb6ea11cd7e5ac4bacdf92bac4693b90d3ba79268be16527555e186f0/beautifulsoup4-4.8.1-py3-none-any.whl (101kB)
[K     |████████████████████████████████| 102kB 27.6MB/s ta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.8.1 soupsieve-1.9.5
Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/ec/be/5ab8abdd8663c0386ec2dd595a5bc0e23330a0549b8a91e32f38c20845b6/lxml-4.4.1-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 33.3MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.4.1
Collecting conda
[?25l  Downloading https://files.pythonhoste

In [2]:
#import requests
#Getting website from the url through get
website_url = requests.get("https://en.wikipedia.org/wiki/List_of_urban_areas_in_the_European_Union").text

#Importing beautifulsoup to scrape data for the most populus European Cities
from bs4 import BeautifulSoup
soup = BeautifulSoup(website_url,"lxml")

My_table = soup.find("table",{"class":"wikitable"}) #Searches for only one table in the whole link
Rows = My_table.find("tbody").findAll("tr"); #Searches for only one table tbody tag in My_table
#Use find when there is only one entity. In out wikipedia page, there is only one entity that is 
#why we have used find and not findall

#Making objects to store arrays for different data types
Ra=[]
Ua=[]
St=[]
Pop=[]
Den=[]


for row in Rows:
  Tds = row.findAll("td")
  if len(Tds) > 0:
    Ra.append(Tds[0].text.strip())
    Ua.append(Tds[1].find("a").get("title"))
    St.append(Tds[3].find("a").get("title"))
    Pop.append(Tds[4].text.strip())
    Den.append(Tds[7].text.strip())

In [3]:
# Transferring arrays into dataframes

df=pd.DataFrame(Ra,columns=['Rank'])
df['City']=Ua
df['Country']=St
df['Population']=Pop
df['Density per km^2']=Den
print(df.dtypes)
print(df.index)
df = df.replace(r'^\s*$', np.nan, regex=True) #Replacing empty spaces with NaN values

Rank                object
City                object
Country             object
Population          object
Density per km^2    object
dtype: object
RangeIndex(start=0, stop=92, step=1)


In [10]:
# There are some empy cells therefore we beed to remove cities with incomplete data
srt=df.dropna()
srt=srt.reset_index(drop=True)
srt

Unnamed: 0,Rank,City,Country,Population,Density per km^2
0,1,Paris,France,10950000,3800
1,2,London,United Kingdom,10470000,5900
2,3,Ruhr,Germany,6670000,2800
3,4,Madrid,Spain,6610000,4600
4,5,Milan,Italy,5280000,2800
...,...,...,...,...,...
81,85,Aachen,Germany,545000,1500
82,87,Bologna,Italy,530000,3400
83,89,Grenoble,France,515000,985
84,90,Saarbrücken,Germany,510000,2200


In [11]:
#Converting objects in the dataframe to integers
srt['Rank']=srt['Rank'].astype(int)
srt['Population']=srt['Population'].apply(lambda x: int(x.split()[0].replace(',', '')))
srt['Density per km^2']=srt['Density per km^2'].apply(lambda x: int(x.split()[0].replace(',', '')))
print(srt.dtypes)
print(srt.index)
srt

Rank                 int64
City                object
Country             object
Population           int64
Density per km^2     int64
dtype: object
RangeIndex(start=0, stop=86, step=1)


Unnamed: 0,Rank,City,Country,Population,Density per km^2
0,1,Paris,France,10950000,3800
1,2,London,United Kingdom,10470000,5900
2,3,Ruhr,Germany,6670000,2800
3,4,Madrid,Spain,6610000,4600
4,5,Milan,Italy,5280000,2800
...,...,...,...,...,...
81,85,Aachen,Germany,545000,1500
82,87,Bologna,Italy,530000,3400
83,89,Grenoble,France,515000,985
84,90,Saarbrücken,Germany,510000,2200


In [7]:
#Now that we have the data for the cities. We are going to find the latitude and longtitude of each
#We use the library/tool geopy to find the latitude and longitude of each city
#Cities with no latitude and longitude are ignroed
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="Earth")
lat_list=[] #Creating empty list for latitude
long_list=[] #Creating empty lists for longitude
results=[]
ignored_city=[]
Cite=[]
Ct=srt['City']

for city in Ct:
      results=geolocator.geocode(city)
      if results is not None:
       lat_list.append(results.latitude)
       long_list.append(results.longitude)
       Cite.append(city) 
      else:
       ignored_city.append(city)

In [8]:
# Transferring arrays into dataframes


dt_cord=pd.DataFrame(Cite,columns=['City'])
dt_cord['Latitude']=lat_list
dt_cord['Longitude']=long_list
print(dt_cord.dtypes)
print(dt_cord.index)
dt_cord

City          object
Latitude     float64
Longitude    float64
dtype: object
RangeIndex(start=0, stop=83, step=1)


Unnamed: 0,City,Latitude,Longitude
0,Paris,48.856610,2.351499
1,London,51.507322,-0.127647
2,Ruhr,51.517518,7.143918
3,Madrid,40.416705,-3.703582
4,Milan,45.466800,9.190500
...,...,...,...
78,Aachen,50.776351,6.083862
79,Bologna,44.493671,11.343035
80,Grenoble,45.187560,5.735782
81,Saarbrücken,49.234362,6.996379


#### 3 cities did not have latitude and longitude data and thus were not included in the used data

### Combining the Cities and Latitude Longitude data into a single dataframe

In [12]:
gd_data= pd.merge(srt,dt_cord, how='inner', on='City')
gd_data

Unnamed: 0,Rank,City,Country,Population,Density per km^2,Latitude,Longitude
0,1,Paris,France,10950000,3800,48.856610,2.351499
1,2,London,United Kingdom,10470000,5900,51.507322,-0.127647
2,3,Ruhr,Germany,6670000,2800,51.517518,7.143918
3,4,Madrid,Spain,6610000,4600,40.416705,-3.703582
4,5,Milan,Italy,5280000,2800,45.466800,9.190500
...,...,...,...,...,...,...,...
78,85,Aachen,Germany,545000,1500,50.776351,6.083862
79,87,Bologna,Italy,530000,3400,44.493671,11.343035
80,89,Grenoble,France,515000,985,45.187560,5.735782
81,90,Saarbrücken,Germany,510000,2200,49.234362,6.996379


### Using folium to create the map of cities with available latituted and longitude

#### Importing Libraries and other tools for clustering and analysis

In [13]:

# library to handle JSON files
import json
!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: --yes
Libraries imported.


In [14]:
#Creating Initial map of europe
Eu_lat= 54.5260
Eu_long= 15.2551

Eur_map= folium.Map(location=[Eu_lat, Eu_long], zoom_start=4)
Eur_map
#Creating markers and maps for different european cities
locations= gd_data[['Latitude', 'Longitude']]
locationlist=locations.values.tolist()
for point in range(0, len(locationlist)):
    folium.Marker(locationlist[point], popup=gd_data['City'][point]).add_to(Eur_map)
Eur_map

### After checking the map further. 4 more cities were dropped as their latitude and logitude were incorrect


In [15]:
mr_drop=gd_data
mr_drop = mr_drop.set_index("City")
mr_drop = mr_drop.drop({"Athens","Liverpool Urban Area","Las Palmas", "Greater Bristol"}, axis=0)
mr_drop.reset_index(level=0, inplace=True)
mr_drop

Unnamed: 0,City,Rank,Country,Population,Density per km^2,Latitude,Longitude
0,Paris,1,France,10950000,3800,48.856610,2.351499
1,London,2,United Kingdom,10470000,5900,51.507322,-0.127647
2,Ruhr,3,Germany,6670000,2800,51.517518,7.143918
3,Madrid,4,Spain,6610000,4600,40.416705,-3.703582
4,Milan,5,Italy,5280000,2800,45.466800,9.190500
...,...,...,...,...,...,...,...
74,Aachen,85,Germany,545000,1500,50.776351,6.083862
75,Bologna,87,Italy,530000,3400,44.493671,11.343035
76,Grenoble,89,France,515000,985,45.187560,5.735782
77,Saarbrücken,90,Germany,510000,2200,49.234362,6.996379


# Using Foursqaure API to get location data for Airports

In [16]:
CLIENT_ID = 'MCRY1N35J0BPBAJR1NC250JOMNWW0WX2QQ2CUE3MWRLZK4ZC' # your Foursquare ID
CLIENT_SECRET = 'DJQOLN02XZA252QEXEK2D3TOH2GYS4LD5XG0JETP4V15ORSO' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MCRY1N35J0BPBAJR1NC250JOMNWW0WX2QQ2CUE3MWRLZK4ZC
CLIENT_SECRET:DJQOLN02XZA252QEXEK2D3TOH2GYS4LD5XG0JETP4V15ORSO


#### Generating URL's with different latitude and longitude for each city

In [17]:
nhbe_lat = mr_drop.Latitude
nhbe_long= mr_drop.Longitude
b_url=[]
i = 0
while i < len(nhbe_lat):
    LIMIT = 500 # limit of number of venues returned by Foursquae API
    radius= 5000 # Define radius
    b_url.append('https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&intent=browse&categoryId=4bf58dd8d48988d1eb931735'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    nhbe_lat[i], 
    nhbe_long[i], 
    radius, 
    LIMIT))
    i += 1

bu=pd.DataFrame(b_url)
bu.columns=["url"]
bu

Unnamed: 0,url
0,https://api.foursquare.com/v2/venues/explore?&...
1,https://api.foursquare.com/v2/venues/explore?&...
2,https://api.foursquare.com/v2/venues/explore?&...
3,https://api.foursquare.com/v2/venues/explore?&...
4,https://api.foursquare.com/v2/venues/explore?&...
...,...
74,https://api.foursquare.com/v2/venues/explore?&...
75,https://api.foursquare.com/v2/venues/explore?&...
76,https://api.foursquare.com/v2/venues/explore?&...
77,https://api.foursquare.com/v2/venues/explore?&...


In [18]:
surl=pd.merge(mr_drop,bu, right_index=True, left_index=True)
surl

Unnamed: 0,City,Rank,Country,Population,Density per km^2,Latitude,Longitude,url
0,Paris,1,France,10950000,3800,48.856610,2.351499,https://api.foursquare.com/v2/venues/explore?&...
1,London,2,United Kingdom,10470000,5900,51.507322,-0.127647,https://api.foursquare.com/v2/venues/explore?&...
2,Ruhr,3,Germany,6670000,2800,51.517518,7.143918,https://api.foursquare.com/v2/venues/explore?&...
3,Madrid,4,Spain,6610000,4600,40.416705,-3.703582,https://api.foursquare.com/v2/venues/explore?&...
4,Milan,5,Italy,5280000,2800,45.466800,9.190500,https://api.foursquare.com/v2/venues/explore?&...
...,...,...,...,...,...,...,...,...
74,Aachen,85,Germany,545000,1500,50.776351,6.083862,https://api.foursquare.com/v2/venues/explore?&...
75,Bologna,87,Italy,530000,3400,44.493671,11.343035,https://api.foursquare.com/v2/venues/explore?&...
76,Grenoble,89,France,515000,985,45.187560,5.735782,https://api.foursquare.com/v2/venues/explore?&...
77,Saarbrücken,90,Germany,510000,2200,49.234362,6.996379,https://api.foursquare.com/v2/venues/explore?&...


In [19]:
burl=surl["url"]
results=[]
ap_nb=[] ### Number of cities that have airports
CAA=[]
no_airp=[]   ## Indeces for cities with airports that are not close
dist=[]  ## Distance to the airport
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

filtered_columns = ['venue.name', 'venue.categories']
    #This loop finds the list of cities that have an airport within 5 Kilometers of the city raidus
for i in range(0, len(burl)):
               results= requests.get(burl[i]).json()
               venues = results['response']['groups'][0]['items']
               nearby_venues = json_normalize(venues)
               if nearby_venues.empty ==False:   
                ap_nb.append(nearby_venues)
                dist.append(results['response']['groups'][0]['items'][0]['venue']['location']['distance'])
               else:
                    no_airp.append(i)

In [37]:
so_airp=pd.DataFrame(no_airp)
for i in so_airp:
    ind=so_airp[i]
    fin_cou=surl.drop(index=ind)
fin_cou
ind_cou=fin_cou.reset_index(drop=True)
ind_cou
ind_cou["Distance"]=dist
pc_cou=ind_cou

In [38]:
pc_cou

Unnamed: 0,City,Rank,Country,Population,Density per km^2,Latitude,Longitude,url,Distance
0,London,2,United Kingdom,10470000,5900,51.507322,-0.127647,https://api.foursquare.com/v2/venues/explore?&...,608
1,Madrid,4,Spain,6610000,4600,40.416705,-3.703582,https://api.foursquare.com/v2/venues/explore?&...,2148
2,Milan,5,Italy,5280000,2800,45.4668,9.1905,https://api.foursquare.com/v2/venues/explore?&...,2845
3,Barcelona,6,Spain,5260000,4300,41.382894,2.177432,https://api.foursquare.com/v2/venues/explore?&...,841
4,Rome,8,Italy,3950000,3400,41.894802,12.485338,https://api.foursquare.com/v2/venues/explore?&...,1602
5,Lisbon,11,Portugal,2700000,2800,38.707751,-9.136592,https://api.foursquare.com/v2/venues/explore?&...,1292
6,Greater Manchester Urban Area,13,United Kingdom,2685000,4200,53.385561,-2.340603,https://api.foursquare.com/v2/venues/explore?&...,4371
7,Rotterdam,14,Netherlands,2670000,2700,51.922893,4.463179,https://api.foursquare.com/v2/venues/explore?&...,1659
8,Budapest,16,Hungary,2500000,1900,47.498382,19.040471,https://api.foursquare.com/v2/venues/explore?&...,987
9,Prague,17,Czech Republic,2300000,4600,50.087465,14.421254,https://api.foursquare.com/v2/venues/explore?&...,1327


In [35]:
#Creating Initial map of europe
Eu_lat= 54.5260
Eu_long= 15.2551

Eur_map_n= folium.Map(location=[Eu_lat, Eu_long], zoom_start=4)
Eur_map_n

#Creating markers and maps for different european cities
locations_n= ind_cou[['Latitude', 'Longitude']]
locationlist=locations_n.values.tolist()
for point in range(0, len(locationlist)):
    folium.Marker(locationlist[point], popup=ind_cou['City'][point]).add_to(Eur_map_n)
Eur_map_n

In [39]:
# set number of clusters
kclusters = 3
df = pd.DataFrame(pc_cou,columns=['Distance','Density per km^2'])

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df)

centroids = kmeans.cluster_centers_

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:6] 

pc_cou.insert(0, 'Cluster Labels', kmeans.labels_)

# create map
map_clusters = folium.Map(location=[Eu_lat, Eu_long], zoom_start=3)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, cluster in zip(pc_cou['Latitude'], pc_cou['Longitude'], pc_cou['Cluster Labels']):
    label = folium.Popup( ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examining Clusters

In [24]:
pc_cou.loc[pc_cou['Cluster Labels'] == 0, pc_cou.columns[[1] + list(range(5, pc_cou.shape[1]))]]

Unnamed: 0,City,Density per km^2,Latitude,Longitude,url,Distance
0,London,5900,51.507322,-0.127647,https://api.foursquare.com/v2/venues/explore?&...,608
1,Madrid,4600,40.416705,-3.703582,https://api.foursquare.com/v2/venues/explore?&...,2148
3,Barcelona,4300,41.382894,2.177432,https://api.foursquare.com/v2/venues/explore?&...,841
9,Prague,4600,50.087465,14.421254,https://api.foursquare.com/v2/venues/explore?&...,1327
13,Bucharest,6500,44.436141,26.10272,https://api.foursquare.com/v2/venues/explore?&...,3151
17,Valencia,5700,39.469901,-0.375951,https://api.foursquare.com/v2/venues/explore?&...,223
23,Sevilla,5600,37.38863,-5.99534,https://api.foursquare.com/v2/venues/explore?&...,1846
24,Gdańsk,5000,54.347629,18.645232,https://api.foursquare.com/v2/venues/explore?&...,1140
28,Thessaloniki,4300,40.640317,22.935272,https://api.foursquare.com/v2/venues/explore?&...,2243
29,Bilbao,5800,43.263005,-2.934992,https://api.foursquare.com/v2/venues/explore?&...,4166


In [27]:
pc_cou.loc[pc_cou['Cluster Labels'] == 1, pc_cou.columns[[1] + list(range(5, pc_cou.shape[1]))]]

Unnamed: 0,City,Density per km^2,Latitude,Longitude,url,Distance
2,Milan,2800,45.4668,9.1905,https://api.foursquare.com/v2/venues/explore?&...,2845
6,Greater Manchester Urban Area,4200,53.385561,-2.340603,https://api.foursquare.com/v2/venues/explore?&...,4371
10,Warsaw,3200,52.233717,21.071411,https://api.foursquare.com/v2/venues/explore?&...,4363
12,Brussels,2600,50.846557,4.351697,https://api.foursquare.com/v2/venues/explore?&...,4173
15,Vienna,3900,48.208354,16.372504,https://api.foursquare.com/v2/venues/explore?&...,3699
18,Stockholm,4300,59.325117,18.071093,https://api.foursquare.com/v2/venues/explore?&...,3347
26,Bergamo,3300,45.694495,9.669873,https://api.foursquare.com/v2/venues/explore?&...,3894
31,Catania,2900,37.502235,15.08738,https://api.foursquare.com/v2/venues/explore?&...,3785
34,Nuremberg,3000,49.453872,11.077298,https://api.foursquare.com/v2/venues/explore?&...,4513
35,Bremen,2400,53.07582,8.807165,https://api.foursquare.com/v2/venues/explore?&...,2988


In [28]:
pc_cou.loc[pc_cou['Cluster Labels'] == 2, pc_cou.columns[[1] + list(range(5, pc_cou.shape[1]))]]

Unnamed: 0,City,Density per km^2,Latitude,Longitude,url,Distance
4,Rome,3400,41.894802,12.485338,https://api.foursquare.com/v2/venues/explore?&...,1602
5,Lisbon,2800,38.707751,-9.136592,https://api.foursquare.com/v2/venues/explore?&...,1292
7,Rotterdam,2700,51.922893,4.463179,https://api.foursquare.com/v2/venues/explore?&...,1659
8,Budapest,1900,47.498382,19.040471,https://api.foursquare.com/v2/venues/explore?&...,987
11,Cologne/Bonn Region,2300,50.867793,7.138906,https://api.foursquare.com/v2/venues/explore?&...,1807
14,Frankfurt,3000,50.110644,8.682092,https://api.foursquare.com/v2/venues/explore?&...,630
16,Amsterdam,3200,52.37454,4.897976,https://api.foursquare.com/v2/venues/explore?&...,758
19,Porto,1900,41.149451,-8.610788,https://api.foursquare.com/v2/venues/explore?&...,963
20,Stuttgart,2900,48.778449,9.180013,https://api.foursquare.com/v2/venues/explore?&...,716
21,Copenhagen,2700,55.686724,12.570072,https://api.foursquare.com/v2/venues/explore?&...,1843


### ***Code for self-use and Experiment***

In [None]:
neighborhood_latitude = 48.856610 # neighborhood latitude value
neighborhood_longitude = 2.351499 # neighborhood longitude value

# type your answer here
LIMIT = 500 # limit of number of venues returned by Foursquae API
radius= 25000 # Define radius

#creating URL

url=  'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&intent=browse&categoryId=4bf58dd8d48988d1eb931735'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

In [None]:
results = requests.get(url).json()
#results

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

In [335]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

0 venues were returned by Foursquare.
