In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns

from pyproj import Proj, transform

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances

import bokeh
import bokeh.plotting as plotting
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.tile_providers import CARTODBPOSITRON
plotting.output_notebook()


sns.set_style('whitegrid')

# Problem definition

In [2]:
#### Cluster the BIXI stations based on geo location

# Load the data

In [3]:
#input
df = pd.read_csv('raw-data/bixi-stations/Stations_2018.csv', encoding='latin_1')
# df['DATE'] = pd.to_datetime(df['DATE'])
print(df.columns)
# print(df['CATEGORIE'].value_counts())
df.head()

Index(['code', 'name', 'latitude', 'longitude'], dtype='object')


Unnamed: 0,code,name,latitude,longitude
0,7030,de Bordeaux / Marie-Anne,45.533409,-73.570657
1,6141,de Bordeaux / Rachel,45.53227,-73.56828
2,6100,Mackay / de Maisonneuve,45.49659,-73.57851
3,6064,MÃ©tro Peel (de Maisonneuve / Stanley),45.50038,-73.57507
4,6730,35e avenue / Beaubien,45.570081,-73.573047


# Feature Engineering 

In [4]:
# adapt X and Y to the visualization
df['X'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['longitude'], x['latitude'])[1], axis=1)
df['Y'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['longitude'], x['latitude'])[0], axis=1)

X_columns = ['X', 'Y']
df = df[X_columns]

In [5]:
df.head()

Unnamed: 0,X,Y
0,5705890.0,-8189848.0
1,5705709.0,-8189583.0
2,5700041.0,-8190722.0
3,5700643.0,-8190339.0
4,5711720.0,-8190114.0


# Model Training

In [6]:
k = 30
model = KMeans(n_clusters=k).fit(df.values)

print(set(model.labels_))
print(collections.Counter(model.labels_))

df['cluster'] = model.labels_

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}
Counter({1: 41, 7: 37, 22: 34, 19: 34, 20: 31, 8: 28, 2: 27, 18: 27, 23: 25, 26: 25, 12: 20, 0: 19, 24: 18, 15: 17, 16: 16, 10: 15, 13: 14, 3: 14, 21: 14, 14: 14, 9: 13, 5: 11, 28: 11, 4: 10, 11: 9, 6: 8, 29: 6, 17: 6, 27: 5, 25: 3})


In [7]:
# model = DBSCAN(eps=1000.1, min_samples=5)
# model.fit(df[['X', 'Y']])

# cluster_labels = model.labels_
# n_clusters = len(set(cluster_labels))
# print(n_clusters)
# print(collections.Counter(cluster_labels))

# df['cluster'] = cluster_labels

In [8]:
p = figure(y_range=(5641788.0, 5751788.0), x_range=(-8152883, -8252883))
p.add_tile(CARTODBPOSITRON)

latitude  = list(df[df['cluster']>-1]['X'].values)
longitude = list(df[df['cluster']>-1]['Y'].values)

colormap = list(bokeh.palettes.viridis(k))
colors = [colormap[x] for x in df[df['cluster']>-1]['cluster']]
source = ColumnDataSource(data=dict(longitude=longitude, latitude=latitude))
p.circle(x=longitude, y=latitude, color=colors, fill_alpha=0.2, size=5)
show(p)

# Model Evaluation

In [9]:
# Inter-Cluster
centroids = []
for cluster in sorted(set(model.labels_)):
    centroids.append(df[df['cluster']==cluster][X_columns].mean().values)
distances = []
for c1 in centroids:
    for c2 in centroids:
        distances.append(euclidean_distances(c1.reshape(-1, 1), c2.reshape(-1, 1))[0][0])
print('Inter Cluster distance', np.mean(distances))

# Intra-Cluster
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(-1, 1), v.values.reshape(-1, 1))[0][0])
print('Intra Cluster distance', np.mean(distances))

# Inertia
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(1, -1), v.values.reshape(1, -1), squared=True)[0][0])
print('Inertia', np.sum(distances))

Inter Cluster distance 6103.952605109998
Intra Cluster distance 580.3788846428608
Inertia 507243915.90625
