In [1]:
import numpy as np
import pandas as pd
import geocoder
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests
import json
from pandas import json_normalize # tranform JSON file into a pandas dataframe



In [2]:
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

In [3]:
res_sector_list_noida = pd.read_csv('list_of_residential_sectors_noida.txt')
res_sector_list_noida

Unnamed: 0,Sectors
0,Sector 11
1,Sector 12
2,Sector 14
3,Sector 15
4,Sector 15a
...,...
84,Sector 150
85,Sector 151
86,Sector 158
87,Sector 162


In [4]:
# create an empty dataframe
columns = ['Sector', 'Latitude', 'Longitude']
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,Sector,Latitude,Longitude


In [5]:
location_dict = {}

for value in res_sector_list_noida['Sectors']:
    address = f"{value}, Noida, Uttar Pradesh"
    location_dict[value] = {}
    

    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    
    location_dict[value]['Latitude'] = latitude
    location_dict[value]['Longitude'] = longitude
    
#print(f"The geograpical coordinate are {latitude}, {longitude}")

In [6]:
location_dict

{'Sector 11': {'Latitude': 28.598112450000002, 'Longitude': 77.33381361756622},
 'Sector 12': {'Latitude': 28.59511835, 'Longitude': 77.33848622983783},
 'Sector 14': {'Latitude': 28.58723095, 'Longitude': 77.30822173851368},
 'Sector 15': {'Latitude': 28.58279785, 'Longitude': 77.31022209445986},
 'Sector 15a': {'Latitude': 28.577996900000002,
  'Longitude': 77.30850903904091},
 'Sector 17': {'Latitude': 28.573746900000003, 'Longitude': 77.32003759056985},
 'Sector 19': {'Latitude': 28.5783966, 'Longitude': 77.3247692190847},
 'Sector 20': {'Latitude': 28.4741497, 'Longitude': 77.5244982},
 'Sector 21': {'Latitude': 28.586822750000003, 'Longitude': 77.3361621904921},
 'Sector 22': {'Latitude': 28.5952028, 'Longitude': 77.34727924425076},
 'Sector 23': {'Latitude': 28.592354399999998, 'Longitude': 77.3538133162243},
 'Sector 25': {'Latitude': 28.5830083, 'Longitude': 77.33981905600054},
 'Sector 26': {'Latitude': 28.579465550000002, 'Longitude': 77.33512251572104},
 'Sector 27': {'Lati

In [7]:
# populate dataframe with data from dictionary
for ind, sector in enumerate(location_dict):
    lat = location_dict[sector]['Latitude']
    long = location_dict[sector]['Longitude']
    
    df = df.append({"Sector": sector, "Latitude": lat, "Longitude": long}, ignore_index='True')
    
df

Unnamed: 0,Sector,Latitude,Longitude
0,Sector 11,28.598112,77.333814
1,Sector 12,28.595118,77.338486
2,Sector 14,28.587231,77.308222
3,Sector 15,28.582798,77.310222
4,Sector 15a,28.577997,77.308509
...,...,...,...
84,Sector 150,28.622575,77.374315
85,Sector 151,28.622575,77.374315
86,Sector 158,28.622575,77.374315
87,Sector 162,28.622575,77.374315


In [8]:
# export dataframe to file for future reference

df.to_csv('noida_res_sectors_geolocator_data.txt')

In [57]:
CLIENT_ID = 'YVE3NFYJ5LSHB0W2TEHOE4UNEOMCLDQMJOYCNT31IEXSUJF0' # your Foursquare ID
CLIENT_SECRET = 'A4OB5B2XE5WYYSTM35YV1QCNIFD5OXP3VUF03QQGM5GMSNAB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [58]:
df = pd.read_csv('noida_res_sectors_geolocator_data.txt')
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
df

Unnamed: 0,Sector,Latitude,Longitude
0,Sector 11,28.598112,77.333814
1,Sector 12,28.595118,77.338486
2,Sector 14,28.587231,77.308222
3,Sector 15,28.582798,77.310222
4,Sector 15a,28.577997,77.308509
...,...,...,...
84,Sector 150,28.622575,77.374315
85,Sector 151,28.622575,77.374315
86,Sector 158,28.622575,77.374315
87,Sector 162,28.622575,77.374315


In [59]:
#Create a function to explore each residential sector in Noida

def getNearbyVenues(names, latitudes, longitudes, radius=2000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = f"https://api.foursquare.com/v2/venues/explore?client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&ll={lat},{lng}&v={VERSION}&radius={radius}&limit={LIMIT}"
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)
    

df = df.iloc[0:1, :]
df
#df.drop(df.index[0])


In [60]:

noida_venues = getNearbyVenues(names=df['Sector'], latitudes=df['Latitude'], longitudes=df['Longitude'])


Sector 11
Sector 12
Sector 14
Sector 15
Sector 15a
Sector 17
Sector 19
Sector 20
Sector 21
Sector 22
Sector 23
Sector 25
Sector 26
Sector 27
Sector 28
Sector 29
Sector 30
Sector 31
Sector 33
Sector 34
Sector 35
Sector 36
Sector 37
Sector 39
Sector 40
Sector 41
Sector 42
Sector 43
Sector 44
Sector 45
Sector 46
Sector 47
Sector 48
Sector 49
Sector 50
Sector 51
Sector 52
Sector 53
Sector 55
Sector 56
Sector 61
Sector 62
Sector 66
Sector 70
Sector 71
Sector 72
Sector 73
Sector 74
Sector 75
Sector 76
Sector 77
Sector 78
Sector 79
Sector 82
Sector 92
Sector 93
Sector 99
Sector 100
Sector 104
Sector 105
Sector 107
Sector 108
Sector 110
Sector 112
Sector 113
Sector 115
Sector 116
Sector 117
Sector 118
Sector 119
Sector 120
Sector 121
Sector 122
Sector 128
Sector 130
Sector 131
Sector 132
Sector 133
Sector 134
Sector 135
Sector 137
Sector 141
Sector 142
Sector 143
Sector 150
Sector 151
Sector 158
Sector 162
Sector 168


In [61]:
noida_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Sector 11,28.598112,77.333814,Berco's Garden Restaurant,28.594078,77.338786,Chinese Restaurant
1,Sector 11,28.598112,77.333814,Domino's Pizza,28.601,77.323,Pizza Place
2,Sector 11,28.598112,77.333814,Haldiram's,28.586575,77.340732,Indian Restaurant
3,Sector 11,28.598112,77.333814,Jal Vayu Vihar Shopping Complex,28.584681,77.337839,Market
4,Sector 11,28.598112,77.333814,Subway,28.583095,77.329523,Sandwich Place


In [62]:
# export dataframe to file for future reference

noida_venues.to_csv('noida_res_sectors_foresquare_data_2000m.txt')

In [3]:
noida_venues = pd.read_csv('noida_res_sectors_foresquare_data.txt')
noida_venues.drop(columns='Unnamed: 0', axis=1, inplace=True)
noida_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Sector 11,28.598112,77.333814,Fresh Food Factory - Accha Khao Accha Khilao,28.580971,77.317292,Indian Restaurant
1,Sector 11,28.598112,77.333814,Lakshmi Coffee House,28.57014,77.332734,South Indian Restaurant
2,Sector 11,28.598112,77.333814,DLF Mall Of India,28.567503,77.321362,Shopping Mall
3,Sector 11,28.598112,77.333814,PVR Superplex,28.574257,77.353622,Multiplex
4,Sector 11,28.598112,77.333814,Berco's Garden Restaurant,28.594078,77.338786,Chinese Restaurant


In [63]:
# Check number of venues returned for each neighbourhood

noida_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Sector 100,4,4,4,4,4,4
Sector 104,34,34,34,34,34,34
Sector 105,7,7,7,7,7,7
Sector 107,8,8,8,8,8,8
Sector 108,13,13,13,13,13,13
...,...,...,...,...,...,...
Sector 79,34,34,34,34,34,34
Sector 82,10,10,10,10,10,10
Sector 92,10,10,10,10,10,10
Sector 93,13,13,13,13,13,13


In [64]:
# Number of unique venue categories

len(noida_venues['Venue Category'].unique())

74

In [65]:
# one hot encoding
noida_onehot = pd.get_dummies(noida_venues['Venue Category'])

# add neighborhood column to the dataframe
noida_onehot['Neighborhood'] = noida_venues['Neighborhood']

# move neighborhood column to the first column
first_column = noida_onehot.pop('Neighborhood')
noida_onehot.insert(0, 'Neighborhood', first_column)

noida_onehot.head()

Unnamed: 0,Neighborhood,ATM,American Restaurant,Arcade,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bank,Bar,...,Snack Place,South Indian Restaurant,Spa,Sporting Goods Shop,Stadium,Tea Room,Thai Restaurant,Theme Park,Train Station,Vegetarian / Vegan Restaurant
0,Sector 11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Sector 11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Sector 11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Sector 11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Sector 11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
noida_grouped = noida_onehot.groupby('Neighborhood').mean().reset_index()
noida_grouped

Unnamed: 0,Neighborhood,ATM,American Restaurant,Arcade,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bank,Bar,...,Snack Place,South Indian Restaurant,Spa,Sporting Goods Shop,Stadium,Tea Room,Thai Restaurant,Theme Park,Train Station,Vegetarian / Vegan Restaurant
0,Sector 100,0.0,0.0,0.000000,0.0,0.0,0.000000,0.250000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
1,Sector 104,0.0,0.0,0.000000,0.0,0.0,0.029412,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
2,Sector 105,0.0,0.0,0.000000,0.0,0.0,0.000000,0.142857,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
3,Sector 107,0.0,0.0,0.000000,0.0,0.0,0.000000,0.125000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0
4,Sector 108,0.0,0.0,0.000000,0.0,0.0,0.000000,0.076923,0.0,0.0,...,0.076923,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,Sector 79,0.0,0.0,0.000000,0.0,0.0,0.029412,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
85,Sector 82,0.0,0.0,0.000000,0.0,0.0,0.000000,0.100000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
86,Sector 92,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
87,Sector 93,0.0,0.0,0.000000,0.0,0.0,0.000000,0.076923,0.0,0.0,...,0.076923,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0


In [70]:
k = 6

noida_grouped_clustering = noida_grouped.drop('Neighborhood', axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(noida_grouped_clustering)

KMeans(n_clusters=6, random_state=0)

In [68]:
# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 1, 2, 2, 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 1, 0, 4, 1, 1, 2,
       2, 1, 1, 1, 4, 1, 1, 1, 4, 1, 1, 1, 4, 1, 4, 4, 4, 1, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3,
       3, 3, 4, 2, 4, 4, 2, 1, 2, 4, 2, 1, 1, 4, 1, 1, 1, 1, 1, 2, 2, 2,
       3])

In [71]:
np.unique(kmeans.labels_, return_counts=True)

(array([0, 1, 2, 3, 4, 5]), array([ 1, 32,  9, 14, 32,  1], dtype=int64))

In [72]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [73]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = noida_grouped['Neighborhood']

for ind in np.arange(noida_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(noida_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Sector 100,Golf Course,Gym / Fitness Center,Bakery,Market,Dry Cleaner,Coffee Shop,Convenience Store,Department Store,Dessert Shop,Diner
1,Sector 104,Fast Food Restaurant,Indian Restaurant,Café,Food,Coffee Shop,Gym,Hotel,Pizza Place,Sandwich Place,Chinese Restaurant
2,Sector 105,Pizza Place,Pharmacy,Bakery,Market,Diner,Café,Flea Market,Fast Food Restaurant,Food,Electronics Store
3,Sector 107,Pizza Place,Thai Restaurant,Gym / Fitness Center,Bakery,Market,Café,Diner,Vegetarian / Vegan Restaurant,Dessert Shop,Clothing Store
4,Sector 108,Pizza Place,Café,Golf Course,Snack Place,Pharmacy,Plaza,Market,Coffee Shop,Diner,Ice Cream Shop


In [74]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [75]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,3,Sector 100,Golf Course,Gym / Fitness Center,Bakery,Market,Dry Cleaner,Coffee Shop,Convenience Store,Department Store,Dessert Shop,Diner
1,1,Sector 104,Fast Food Restaurant,Indian Restaurant,Café,Food,Coffee Shop,Gym,Hotel,Pizza Place,Sandwich Place,Chinese Restaurant
2,2,Sector 105,Pizza Place,Pharmacy,Bakery,Market,Diner,Café,Flea Market,Fast Food Restaurant,Food,Electronics Store
3,2,Sector 107,Pizza Place,Thai Restaurant,Gym / Fitness Center,Bakery,Market,Café,Diner,Vegetarian / Vegan Restaurant,Dessert Shop,Clothing Store
4,2,Sector 108,Pizza Place,Café,Golf Course,Snack Place,Pharmacy,Plaza,Market,Coffee Shop,Diner,Ice Cream Shop


In [76]:
noida_merged = df
noida_merged.rename(columns = {'Sector': 'Neighborhood'}, inplace=True)
noida_merged.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Sector 11,28.598112,77.333814
1,Sector 12,28.595118,77.338486
2,Sector 14,28.587231,77.308222
3,Sector 15,28.582798,77.310222
4,Sector 15a,28.577997,77.308509


In [77]:
noida_merged = noida_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
noida_merged.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Sector 11,28.598112,77.333814,4,Stadium,Market,Indian Restaurant,Multiplex,Sandwich Place,Butcher,Chinese Restaurant,Electronics Store,Pizza Place,Coffee Shop
1,Sector 12,28.595118,77.338486,4,Stadium,Indian Restaurant,Vegetarian / Vegan Restaurant,Sandwich Place,Butcher,Chinese Restaurant,Pizza Place,Market,Coffee Shop,Restaurant
2,Sector 14,28.587231,77.308222,4,Indian Restaurant,Hotel,Department Store,Pizza Place,Fast Food Restaurant,Plaza,Bakery,Restaurant,Gym,Market
3,Sector 15,28.582798,77.310222,4,Indian Restaurant,Sandwich Place,Chinese Restaurant,Café,Hotel,Restaurant,Arcade,Light Rail Station,Bakery,Department Store
4,Sector 15a,28.577997,77.308509,4,Indian Restaurant,Café,Chinese Restaurant,Restaurant,Fast Food Restaurant,Hotel,Coffee Shop,Sandwich Place,Plaza,Fried Chicken Joint


In [51]:
    address = 'Noida, Uttar Pradesh'
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude

In [79]:
# create map

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(noida_merged['Latitude'], noida_merged['Longitude'], noida_merged['Neighborhood'], noida_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters