In [27]:
%matplotlib inline
import pandas as pd
import numpy as np
import collections

from pyproj import Proj, transform
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

import bokeh
from bokeh.plotting import figure, show, ColumnDataSource, output_notebook
from bokeh.tile_providers import get_provider, Vendors

In [28]:
#Read the csv data file.
stationsData = pd.read_csv('Bixi_data/Stations/Stations_2018.csv', encoding='latin_1')
print(stationsData.columns)

stationsData.apply(lambda x: pd.to_numeric(x['longitude'], errors='ignore'), axis = 1)
#If axis = 1 --> the lambda function is applied on all the rows. If axis = 0 --> the lambda function is applied on all the columns.

print(stationsData.head())
stationsData.dtypes

Index(['code', 'name', 'latitude', 'longitude'], dtype='object')
   code                                    name   latitude  longitude
0  7030                de Bordeaux / Marie-Anne  45.533409 -73.570657
1  6141                    de Bordeaux / Rachel  45.532270 -73.568280
2  6100                 Mackay / de Maisonneuve  45.496590 -73.578510
3  6064  MÃ©tro Peel (de Maisonneuve / Stanley)  45.500380 -73.575070
4  6730                   35e avenue / Beaubien  45.570081 -73.573047


code           int64
name          object
latitude     float64
longitude    float64
dtype: object

In [29]:
# Use Proj to transform the latitude and longitude coordinates (EPSG 3857) into x,y geographic map coordinates system (EPSG 4326).
inputProj = Proj(init='epsg:3857')
outputProj = Proj(init='epsg:4326')

#Make a DataFrame with the new (x,y) coordinates.
stationsData['X'] = stationsData.apply(lambda x: transform(outputProj, inputProj, x['longitude'], x['latitude'])[1], axis=1)
stationsData['Y'] = stationsData.apply(lambda x: transform(outputProj, inputProj, x['longitude'], x['latitude'])[0], axis=1)

X_columns = ['X', 'Y']
stationsData = stationsData[X_columns]
stationsData.head()

print(stationsData)

  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  stationsData['X'] = stationsData.apply(lambda x: transform(outputProj, inputProj, x['longitude'], x['latitude'])[1], axis=1)
  stationsData['Y'] = stationsData.apply(lambda x: transform(outputProj, inputProj, x['longitude'], x['latitude'])[0], axis=1)


                X             Y
0    5.705890e+06 -8.189848e+06
1    5.705709e+06 -8.189583e+06
2    5.700041e+06 -8.190722e+06
3    5.700643e+06 -8.190339e+06
4    5.711720e+06 -8.190114e+06
..            ...           ...
547  5.701161e+06 -8.195151e+06
548  5.700387e+06 -8.195215e+06
549  5.699881e+06 -8.187962e+06
550  5.697874e+06 -8.190859e+06
551  5.697485e+06 -8.190646e+06

[552 rows x 2 columns]


In [30]:
# Using sklearn to do the clustering algorithm on the data with Kmeans. This is unsupervised learning and will return 10 clusters.

k= 10
model = KMeans(n_clusters=k).fit(stationsData.values)
model

print(set(model.labels_))
print(collections.Counter(model.labels_))

stationsData['Cluster'] = model.labels_
print(stationsData['Cluster'].head())



{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
Counter({9: 103, 3: 98, 0: 75, 1: 61, 2: 55, 4: 49, 5: 37, 6: 31, 8: 27, 7: 16})
0    9
1    9
2    2
3    3
4    7
Name: Cluster, dtype: int32


In [35]:
k = 10

plot = figure(y_range=(5641788.0, 5751788.0), x_range=(-8152883, -8252883), x_axis_type="mercator", y_axis_type="mercator")
plot.add_tile(get_provider(Vendors.CARTODBPOSITRON_RETINA))

latitude  = list(stationsData[stationsData['Cluster']>-1]['X'].values)
longitude = list(stationsData[stationsData['Cluster']>-1]['Y'].values)
colormap = list(bokeh.palettes.plasma(k))
colors = [colormap[x] for x in stationsData[stationsData['Cluster']>-1]['Cluster']]

plot.circle(x=longitude, y=latitude, color=colors, fill_alpha=0.2, size=7)
output_notebook()
show(plot)



In [36]:
# Intercluster distance:
# For the clusters in the dataframe, stationsData, find the mean of the latitude and longitude data.
# Calculate the intrercluster distance, which is the distance between the centroids of the clusters.
# The array is reshaped to make the array have one column.

centroids = []
for cluster in sorted(set(model.labels_)):
    centroids.append(stationsData[stationsData['Cluster']==cluster][X_columns].mean().values)

distances = []
for c1 in centroids:
    for c2 in centroids:
        distances.append(euclidean_distances(c1.reshape(-1, 1), c2.reshape(-1, 1))[0][0])

# Find and display the average of the intercluster distances
print('Intercluster distance = ', np.mean(distances))


# Intracluster distance:
# Calculate the distance between the data and the centroid within a cluster.

distances = []
for cluster in sorted(set(model.labels_)):
    filter = stationsData[stationsData['Cluster']==cluster]
    centroid = filter[X_columns].mean().values
    for k, v in filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(-1, 1), v.values.reshape(-1, 1))[0][0])

# Find and display the average of the intracluster distances
print('Intracluster distance = ', np.mean(distances))

Intercluster distance =  6154.881676138089
Intracluster distance =  1016.6520544233316
