In [1]:
import numpy as np
import pandas as pd
import folium
import json
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def one_hot(all_venues_df):
    #(1) One-hot encode the 'category' column and add 'postcode' porperties column to new df
    onehot_df = pd.get_dummies(all_venues_df[['category']], prefix="", prefix_sep="")
    onehot_df.insert(0, 'pincode', all_venues_df.loc[:, 'pincode'])
    onehot_df.insert(1, 'pc_lat', all_venues_df.loc[:, 'pc_lat'])
    onehot_df.insert(2, 'pc_lng', all_venues_df.loc[:, 'pc_lng'])
    onehot_df.insert(3, 'city', all_venues_df.loc[:, 'city'])
    return onehot_df
    
def get_features(onehot_df):
    #(2) Group-by 'postcode' and aggregate using 'mean'
    #This is the feature vector df containing features of every 'postcode'
    features_df = onehot_df.groupby('pincode').mean().reset_index()
    print("Onehot_DF:", onehot_df.shape)
    display(onehot_df.head())
    print("Features_df:", features_df.shape)
    display(features_df.head())
    return features_df

all_venues_df1 = pd.read_csv('Kolkata_all_venues.csv')
all_venues_df2 = pd.read_csv('Chennai_all_venues.csv')
all_venues_df1['city'] = 'Kolkata'
all_venues_df2['city'] = 'Chennai'


all_venues_df = pd.concat([all_venues_df1, all_venues_df2], ignore_index=True)
onehot_df = one_hot(all_venues_df)

onehot_df1 = onehot_df.loc[onehot_df['city'] == 'Kolkata']
onehot_df2 = onehot_df.loc[onehot_df['city'] == 'Chennai']

features_df1 = get_features(onehot_df1)
features_df2 = get_features(onehot_df2)

print("Features_df1:", features_df1.shape)
print("Features_df2:", features_df2.shape)


Onehot_DF: (56, 105)


Unnamed: 0,pincode,pc_lat,pc_lng,city,ATM,African Restaurant,Airport Lounge,Amphitheater,Arcade,Asian Restaurant,...,Stadium,Tea Room,Thai Restaurant,Theater,Train,Train Station,Vegetarian / Vegan Restaurant,Video Store,Warehouse Store,Women's Store
0,700001.0,22.5947,88.3645,Kolkata,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,700001.0,22.5947,88.3645,Kolkata,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,700001.0,22.5947,88.3645,Kolkata,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,700001.0,22.5947,88.3645,Kolkata,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,700007.0,22.5667,88.35,Kolkata,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Features_df: (17, 104)


Unnamed: 0,pincode,pc_lat,pc_lng,ATM,African Restaurant,Airport Lounge,Amphitheater,Arcade,Asian Restaurant,BBQ Joint,...,Stadium,Tea Room,Thai Restaurant,Theater,Train,Train Station,Vegetarian / Vegan Restaurant,Video Store,Warehouse Store,Women's Store
0,700001.0,22.5947,88.3645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,700007.0,22.5667,88.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,700015.0,22.55,88.3833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,700019.0,22.529,88.368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,700022.0,22.55,88.3333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Onehot_DF: (309, 105)


Unnamed: 0,pincode,pc_lat,pc_lng,city,ATM,African Restaurant,Airport Lounge,Amphitheater,Arcade,Asian Restaurant,...,Stadium,Tea Room,Thai Restaurant,Theater,Train,Train Station,Vegetarian / Vegan Restaurant,Video Store,Warehouse Store,Women's Store
56,600001.0,13.093,80.2882,Chennai,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
57,600001.0,13.093,80.2882,Chennai,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
58,600001.0,13.093,80.2882,Chennai,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,600001.0,13.093,80.2882,Chennai,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,600003.0,13.0819,80.2781,Chennai,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Features_df: (42, 104)


Unnamed: 0,pincode,pc_lat,pc_lng,ATM,African Restaurant,Airport Lounge,Amphitheater,Arcade,Asian Restaurant,BBQ Joint,...,Stadium,Tea Room,Thai Restaurant,Theater,Train,Train Station,Vegetarian / Vegan Restaurant,Video Store,Warehouse Store,Women's Store
0,600001.0,13.093,80.2882,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0
1,600003.0,13.0819,80.2781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.3,0.1,0.0,0.0,0.0
2,600004.0,13.0292,80.2708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
3,600005.0,13.0572,80.2778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.1,0.2,0.0,0.0,0.0
4,600006.0,13.071,80.2738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0


Features_df1: (17, 104)
Features_df2: (42, 104)


In [3]:
def get_clusters(features_df, k):
    
    #(1) Cluster using only the features and not the 'pincode' column
    features_clustering = features_df.drop(['pincode', 'pc_lat', 'pc_lng'], axis=1)
    display(features_clustering.head())
    kmeans = KMeans(n_clusters=k, random_state=0).fit(features_clustering)
    
    print(kmeans.labels_)
    #(2) Add cluster column to features_df
    cluster_df = pd.DataFrame({
        'pincode': features_df.loc[:, 'pincode'],
        'pc_lat': features_df.loc[:, 'pc_lat'],
        'pc_lng': features_df.loc[:, 'pc_lng'],
        'cluster': kmeans.labels_
    })
    
    return cluster_df

k = 5
cluster_df2 = get_clusters(features_df2, k)
display(cluster_df2.head())
# display(cluster_df.tail())

Unnamed: 0,ATM,African Restaurant,Airport Lounge,Amphitheater,Arcade,Asian Restaurant,BBQ Joint,Bakery,Bar,Beach,...,Stadium,Tea Room,Thai Restaurant,Theater,Train,Train Station,Vegetarian / Vegan Restaurant,Video Store,Warehouse Store,Women's Store
0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.3,0.1,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.1,0.2,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0


[3 3 0 0 0 3 0 0 0 0 2 0 3 0 3 0 0 3 3 0 3 0 3 0 3 3 4 3 3 0 1 0 3 3 2 3 0
 3 3 3 3 3]


Unnamed: 0,pincode,pc_lat,pc_lng,cluster
0,600001.0,13.093,80.2882,3
1,600003.0,13.0819,80.2781,3
2,600004.0,13.0292,80.2708,0
3,600005.0,13.0572,80.2778,0
4,600006.0,13.071,80.2738,0


In [4]:
features_df2.columns

Index(['pincode', 'pc_lat', 'pc_lng', 'ATM', 'African Restaurant',
       'Airport Lounge', 'Amphitheater', 'Arcade', 'Asian Restaurant',
       'BBQ Joint',
       ...
       'Stadium', 'Tea Room', 'Thai Restaurant', 'Theater', 'Train',
       'Train Station', 'Vegetarian / Vegan Restaurant', 'Video Store',
       'Warehouse Store', 'Women's Store'],
      dtype='object', length=104)

In [5]:
#Classify
X_train = features_df2.drop(columns=['pincode', 'pc_lat', 'pc_lng'])
Y_train = cluster_df2['cluster']

knn_clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn_clf.fit(X_train, Y_train)

print(knn_clf.score(X_train, Y_train))


1.0


In [6]:
X_test = features_df1.drop(columns=['pincode', 'pc_lat', 'pc_lng'])
Y_test = knn_clf.predict(X_test)
print(Y_test)
cluster_df1 = pd.DataFrame({
        'pincode': features_df1.loc[:, 'pincode'],
        'pc_lat': features_df1.loc[:, 'pc_lat'],
        'pc_lng': features_df1.loc[:, 'pc_lng'],
        'cluster': Y_test
    })
cluster_df1

[3 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


Unnamed: 0,pincode,pc_lat,pc_lng,cluster
0,700001.0,22.5947,88.3645,3
1,700007.0,22.5667,88.35,0
2,700015.0,22.55,88.3833,0
3,700019.0,22.529,88.368,3
4,700022.0,22.55,88.3333,3
5,700027.0,22.532,88.3232,3
6,700031.0,22.5543,88.3132,3
7,700033.0,22.5041,88.3598,3
8,700040.0,22.5333,88.3917,3
9,700047.0,22.4667,88.3833,3


In [7]:
#Displaying map of Toronto with ALL CLUSTERS marked
lat = all_venues_df.loc[0, 'pc_lat']
lng = all_venues_df.loc[0, 'pc_lng']
map1 = folium.Map(location=[lat, lng], zoom_start=12)

for pc, lat, lon, cl in zip(cluster_df['pincode'], cluster_df['pc_lat'], cluster_df['pc_lng'], cluster_df['cluster']):
    text = '{}, {}'.format(pc, cl)
    label = folium.Popup(text, parse_html=True)
    colour = ['red', 'blue', 'green', 'yellow', 'black']
    
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=colour[cl],
        fill=True,
        fill_color=colour[cl],
        fill_opacity=0.7).add_to(map1)
    
map1

NameError: name 'cluster_df' is not defined