In [1]:
import os

import folium
import numpy as np
import pandas as pd
import requests
from IPython.core.display import HTML
from IPython.display import Image
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

# The Data
- The following dataframe was downloaded from the Population Dynamics section of the United Nations Department of Economic and Social Affairs website (https://population.un.org/wup/Download/).
- This dataset lists every urban agglomeration in the world with a population of over 300,000, giving its population from 1950 to 2015 and the UN's projections of its population up to 2035.

In [2]:
# Read the workbook, skipping the first 16 rows of the sheet, and discard the
# 'Index' column in the same expression.
df = pd.read_excel('Cities.xls', skiprows=range(16)).drop(columns='Index')
print(df.shape)
df.head()

(1860, 25)


Unnamed: 0,Country Code,Country or area,City Code,Urban Agglomeration,Note,Latitude,Longitude,1950,1955,1960,...,1990,1995,2000,2005,2010,2015,2020,2025,2030,2035
0,4,Afghanistan,20001,Herat,,34.34817,62.19967,82.468,85.751,89.166,...,183.465,207.19,233.991,275.678,358.691,466.703,605.575,752.91,897.041,1057.573
1,4,Afghanistan,20002,Kabul,,34.528887,69.17246,170.784,220.749,285.352,...,1549.32,1928.694,2401.109,2905.178,3289.005,3723.543,4221.532,4877.024,5737.138,6760.5
2,4,Afghanistan,20003,Kandahar,,31.61332,65.71013,82.199,89.785,98.074,...,233.243,263.395,297.456,336.746,383.498,436.741,498.002,577.128,679.278,800.461
3,4,Afghanistan,20004,Mazar-e Sharif,,36.70904,67.11087,30.0,37.139,45.979,...,135.153,152.629,172.372,206.403,283.532,389.483,532.689,681.531,816.04,962.262
4,8,Albania,20005,Tiranë (Tirana),,41.3275,19.81889,84.513,106.932,134.761,...,247.27,287.95,335.336,371.803,408.697,449.298,493.712,535.702,565.301,581.626


##### Sorting the values according to the 2020 projections of the populations

In [3]:
# Order the rows by the 2020 column, largest first, then renumber the index
# from 0 so that row labels follow the new ranking.
df = df.sort_values(by=2020, ascending=False).reset_index(drop=True)
df.head()

Unnamed: 0,Country Code,Country or area,City Code,Urban Agglomeration,Note,Latitude,Longitude,1950,1955,1960,...,1990,1995,2000,2005,2010,2015,2020,2025,2030,2035
0,392,Japan,21671,Tokyo,70.0,35.6895,139.69171,11274.641,13712.679,16678.821,...,32530.003,33586.573,34449.908,35621.544,36859.626,37256.109,37393.129,37036.204,36573.799,36014.03
1,356,India,21228,Delhi,60.0,28.66667,77.21667,1369.369,1781.624,2282.962,...,9384.209,12138.233,15691.899,18691.33,21987.895,25865.875,30290.936,34665.569,38938.697,43345.059
2,156,China,20656,Shanghai,39.0,31.22222,121.45806,4288.091,5712.858,6865.312,...,8605.812,11072.063,14246.541,17055.788,20314.309,23482.181,27058.479,30482.14,32869.265,34341.242
3,76,Brazil,20287,São Paulo,,-23.5475,-46.63611,2334.038,3043.828,3969.759,...,14775.84,15913.473,17014.078,18288.134,19659.808,20883.046,22043.028,22990.007,23824.223,24490.136
4,484,Mexico,21853,Ciudad de México (Mexico City),74.0,19.427318,-99.141869,3365.081,4293.878,5479.184,...,15642.318,17017.469,18457.027,19276.065,20136.681,21339.781,21782.378,22752.414,24110.599,25414.624


##### Keeping only the necessary columns from the above dataset and reordering them for easier interpretation

In [4]:
# Select the columns by label, already in display order, and rename the 2020
# column to 'Population'. Selecting by label rather than by position means a
# change in the spreadsheet's column layout raises a KeyError instead of
# silently picking the wrong columns.
df_2020 = (
    df[['Urban Agglomeration', 'Country or area', 'Latitude', 'Longitude', 2020, 'Country Code']]
    .rename(columns={2020: 'Population'})
)
df_2020.head()

Unnamed: 0,Urban Agglomeration,Country or area,Latitude,Longitude,Population,Country Code
0,Tokyo,Japan,35.6895,139.69171,37393.129,392
1,Delhi,India,28.66667,77.21667,30290.936,356
2,Shanghai,China,31.22222,121.45806,27058.479,156
3,São Paulo,Brazil,-23.5475,-46.63611,22043.028,76
4,Ciudad de México (Mexico City),Mexico,19.427318,-99.141869,21782.378,484


# Cleaning the Data
Checking the number of null (not assigned) values in each column of the data frame.

In [5]:
# Print each column name followed by the number of null entries it contains.
for column, n_null in df_2020.isnull().sum().items():
    print(column, n_null)

Urban Agglomeration 0
Country or area 0
Latitude 0
Longitude 0
Population 0
Country Code 0


##### Selecting only the 200 most populous cities of the World.

In [6]:
# Keep the rows labelled 0..199; the frame was sorted by population and
# re-indexed earlier, so these labels are the 200 largest entries.
top_200_labels = list(range(200))
df_2020 = df_2020.loc[top_200_labels]
df_2020.tail()

Unnamed: 0,Urban Agglomeration,Country or area,Latitude,Longitude,Population,Country Code
195,Zibo,China,36.79056,118.06333,2639.735,156
196,Bamako,Mali,12.65,-8.0,2617.686,466
197,Birmingham (West Midlands),United Kingdom,52.4814,-1.8998,2607.437,826
198,Thiruvananthapuram,India,8.50694,76.95694,2584.752,356
199,Vancouver,Canada,49.24966,-123.11934,2581.079,124


##### Exploring this reduced data set to see the country-wise breakdown

In [7]:
# Number of selected cities per country, most frequent first.
df_2020.loc[:, 'Country or area'].value_counts()

China                       44
India                       20
United States of America    19
Brazil                      11
Japan                        6
                            ..
Morocco                      1
Kenya                        1
Zambia                       1
Ecuador                      1
Greece                       1
Name: Country or area, Length: 69, dtype: int64

# Getting the Venues
- Getting the restaurant data for each of the 200 most populous cities of the world using the Foursquare API

In [8]:
# Foursquare API credentials are read from the environment so that secrets are
# never stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel. Any key pair that was
# previously committed in this cell should be treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests are expected to fail.')

VERSION = '20180604'  # sent as the API `v` parameter
# Sent as `limit`. NOTE(review): the search endpoint may cap the number of
# results per call below this value - confirm against the API documentation.
LIMIT = 1000
search_query = 'Restaurants'  # sent as `query`
radius = 50000  # sent as `radius`; presumably metres - confirm against the API docs

In [9]:
def get_restaurants(names, latitudes, longitudes, timeout=30):
    """Query the Foursquare venue search once per city and collect venue categories.

    The three iterables are consumed in parallel with ``zip``, so they should
    have equal length; extra entries in a longer one are ignored.

    Parameters
    ----------
    names : iterable
        City labels; each is copied into the 'City' column for every venue
        returned for that city.
    latitudes, longitudes : iterable
        Coordinates sent as the ``ll`` parameter of the request.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with columns 'City' and 'Restaurants',
        where 'Restaurants' holds the venue's ``categories`` value.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    cities = []
    categories = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Passing the query as `params` lets requests URL-encode every value
        # instead of splicing raw text into the URL.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'll': '{},{}'.format(lat, lng),
            'v': VERSION,
            'query': search_query,
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(
            'https://api.foursquare.com/v2/venues/search',
            params=params,
            timeout=timeout,
        )
        # Fail with the HTTP status rather than an opaque KeyError further down.
        response.raise_for_status()
        venues = response.json().get('response', {}).get('venues', [])
        for venue in venues:
            categories.append(venue.get('categories', []))
            cities.append(name)
    return pd.DataFrame({'City': cities, 'Restaurants': categories})

In [10]:
# Fetch the venue categories for every selected city (one API call per city).
df_r = get_restaurants(
    names=df_2020['Urban Agglomeration'],
    latitudes=df_2020['Latitude'],
    longitudes=df_2020['Longitude'],
)

In [11]:
# Print the (rows, columns) size of the venue table, then preview its first rows.
n_rows, n_cols = df_r.shape
print((n_rows, n_cols))
df_r.head()

(2788, 2)


Unnamed: 0,City,Restaurants
0,Tokyo,"[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R..."
1,Tokyo,"[{'id': '4bf58dd8d48988d120951735', 'name': 'F..."
2,Tokyo,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I..."
3,Tokyo,"[{'id': '56aa371be4b08b9a8d573517', 'name': 'B..."
4,Tokyo,"[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R..."


##### Grouping the Restaurants based on the category they belong to.

In [12]:
# Keep only venues that have at least one category and record the name of the
# first category as the venue's 'Restaurant type'.
# This is done with a boolean mask instead of dropping rows inside a
# label-based loop: the loop version fails with a KeyError when the cell is run
# a second time, because the dropped labels are no longer in the index.
has_category = df_r['Restaurants'].map(len) > 0
df_r = df_r.loc[has_category].copy()
df_r['Restaurant type'] = df_r['Restaurants'].map(lambda cats: cats[0]['name'])
print(len(df_r))
df_r.head()

2599


Unnamed: 0,City,Restaurants,Restaurant type
0,Tokyo,"[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R...",Restaurant
1,Tokyo,"[{'id': '4bf58dd8d48988d120951735', 'name': 'F...",Food Court
2,Tokyo,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",Italian Restaurant
3,Tokyo,"[{'id': '56aa371be4b08b9a8d573517', 'name': 'B...",Business Center
4,Tokyo,"[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R...",Restaurant


In [13]:
# One indicator column per restaurant type, with the venue's city appended as
# the last column (aligned on the row index).
df_dummies = pd.get_dummies(df_r['Restaurant type']).assign(City=df_r['City'])
print(df_dummies.shape)
df_dummies.head()

(2599, 188)


Unnamed: 0,Advertising Agency,Afghan Restaurant,African Restaurant,American Restaurant,Anhui Restaurant,Apres Ski Bar,Arcade,Arepa Restaurant,Argentinian Restaurant,Art Gallery,...,Trade School,Turkish Home Cooking Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse,Warehouse Store,Wings Joint,Yemeni Restaurant,City
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Tokyo
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Tokyo
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Tokyo
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Tokyo
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Tokyo


##### This Dataframe tells us the different kinds of Restaurants and how many of them there are in each of the cities.

In [14]:
# Add up the indicator columns within each city: one row per city, one column
# per restaurant type, each cell holding a count.
df_grouped = df_dummies.groupby('City').agg('sum')
print(df_grouped.shape)
df_grouped.head()

(160, 187)


Unnamed: 0_level_0,Advertising Agency,Afghan Restaurant,African Restaurant,American Restaurant,Anhui Restaurant,Apres Ski Bar,Arcade,Arepa Restaurant,Argentinian Restaurant,Art Gallery,...,Theme Restaurant,Trade School,Turkish Home Cooking Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse,Warehouse Store,Wings Joint,Yemeni Restaurant
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abidjan,0,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Abuja,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Addis Ababa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ahmadabad,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Al Kuwayt (Kuwait City),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Clustering the Data
- The KMeans algorithm is used to cluster this data
- A total of 5 clusters were chosen.

In [15]:
# A fixed random_state makes the cluster assignment reproducible between runs;
# n_init is pinned explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default.
kc = KMeans(n_clusters=5, n_init=10, random_state=0)

# Fit on the restaurant-type counts only. Dropping any existing 'clusters'
# column keeps the cell re-runnable: otherwise a second run would feed the
# previous labels back in as a feature.
features = df_grouped.drop(columns='clusters', errors='ignore')
kc.fit(features.values)
df_grouped['clusters'] = kc.labels_

# Look up each city's label by name. Cities absent from df_grouped (no venues
# were collected for them) get NaN, hence the float dtype.
df_2020['Clusters'] = (
    df_2020['Urban Agglomeration'].map(df_grouped['clusters']).astype('float64')
)
df_2020.head()

Unnamed: 0,Urban Agglomeration,Country or area,Latitude,Longitude,Population,Country Code,Clusters
0,Tokyo,Japan,35.6895,139.69171,37393.129,392,0.0
1,Delhi,India,28.66667,77.21667,30290.936,356,2.0
2,Shanghai,China,31.22222,121.45806,27058.479,156,2.0
3,São Paulo,Brazil,-23.5475,-46.63611,22043.028,76,2.0
4,Ciudad de México (Mexico City),Mexico,19.427318,-99.141869,21782.378,484,2.0


##### Plotting the data on a map of the world for a better representation of the clusters

In [17]:
cities_map = folium.Map(zoom_start=8)
# Marker colour per cluster label; indexed by the integer cluster id.
colours = ['red','blue','green','yellow','pink','orange','gray','white','purple']

# Cities without a cluster label (NaN) are not drawn.
clustered = df_2020.dropna(subset=['Clusters'])
for city, lat, lng, cluster in zip(
    clustered['Urban Agglomeration'],
    clustered['Latitude'],
    clustered['Longitude'],
    clustered['Clusters'],
):
    colour = colours[int(cluster)]
    folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=city,  # was misspelled 'poup', so no popup was attached
        fill=True,
        color=colour,
        fill_color=colour,
        fill_opacity=1.0,
    ).add_to(cities_map)
cities_map