# Data clustering based on stations' behaviour
One way to cluster stations is by their behaviour, i.e. how intensively they are used over time. We will look at how each station's `bikes_diff` changes during the day. Stations with a similar daily usage curve will be assigned to the same cluster. The idea behind this is that stations in different parts of the city (e.g. near universities) may have very similar daily usage curves, whereas stations in close proximity to each other may be used in completely different ways.

In [1]:
import pandas as pd

## Load data

In [2]:
# Read the station dataset; the file is comma-separated, which is the
# delimiter pandas.read_csv uses by default.
dataset_path = './datasets/dataset.csv'
data = pd.read_csv(dataset_path)
data.head()

  data = pd.read_csv('./datasets/dataset.csv', sep=',')


Unnamed: 0,gid,available,free,total,w_temp,w_sky,w_precip,w_humid,w_wind_speed,w_clouds,...,dayofweek,day_category,day_motive,lat,long,datetime,occupancy,vacancy,bikes_diff,free_diff
0,901622,18,2,20,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.458781,-0.358658,2022-12-01 02:00:00,0.9,0.1,0.0,0.0
1,901673,9,29,38,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.472282,-0.343809,2022-12-01 02:00:00,0.236842,0.763158,0.0,0.0
2,901677,15,8,23,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.473068,-0.343132,2022-12-01 02:00:00,0.652174,0.347826,0.0,0.0
3,901679,3,18,21,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.471344,-0.340484,2022-12-01 02:00:00,0.142857,0.857143,0.0,0.0
4,901627,15,4,20,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.456512,-0.355418,2022-12-01 02:00:00,0.75,0.2,0.0,0.0


## How bikes_diff changes during the day

In [5]:
# Convert the 'datetime' column to a real datetime dtype, then print the
# frame's column/dtype summary.
data['datetime'] = data['datetime'].pipe(pd.to_datetime)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10891319 entries, 0 to 10891318
Data columns (total 21 columns):
 #   Column        Dtype         
---  ------        -----         
 0   gid           int64         
 1   available     int64         
 2   free          int64         
 3   total         int64         
 4   w_temp        float64       
 5   w_sky         object        
 6   w_precip      float64       
 7   w_humid       float64       
 8   w_wind_speed  float64       
 9   w_clouds      float64       
 10  w_visib       float64       
 11  dayofweek     object        
 12  day_category  object        
 13  day_motive    object        
 14  lat           float64       
 15  long          float64       
 16  datetime      datetime64[ns]
 17  occupancy     float64       
 18  vacancy       float64       
 19  bikes_diff    float64       
 20  free_diff     float64       
dtypes: datetime64[ns](1), float64(12), int64(4), object(4)
memory usage: 1.7+ GB


In [7]:
import numpy as np  # not needed by this cell any more; kept in case other cells rely on `np`

# Mean absolute `bikes_diff` per station (gid) and hour of day.
# The absolute value is taken once on the whole column and the built-in
# grouped mean is used, instead of calling a Python lambda for every
# (station, hour) group. The result keeps the same shape as before: a Series
# named 'bikes_diff' with a (gid, datetime) MultiIndex, where the 'datetime'
# level holds the hour extracted via `.dt.hour`.
hourly_avg_abs_bikes_diff_per_stations = (
    data['bikes_diff']
    .abs()
    .groupby([data['gid'], data['datetime'].dt.hour])
    .mean()
)

hourly_avg_abs_bikes_diff_per_stations.head()

gid     datetime
901581  0           0.955280
        1           0.235792
        2           0.155623
        3           0.126735
        4           0.129268
Name: bikes_diff, dtype: float64

In [8]:
# Move the inner index level (hour) into the columns so that each row is one
# station and each column one hour, then replace NaN values with 0.
hourly_avg_abs_bikes_diff_per_stations = (
    hourly_avg_abs_bikes_diff_per_stations
    .unstack()
    .fillna(0)
)
hourly_avg_abs_bikes_diff_per_stations.head()

datetime,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
901581,0.95528,0.235792,0.155623,0.126735,0.129268,0.074818,0.119394,0.431598,0.790698,0.721101,...,0.842424,0.791135,0.719416,0.835052,0.842649,0.78845,0.707674,0.658981,0.49364,0.434994
901582,1.126708,0.359734,0.256535,0.199155,0.189024,0.156934,0.093939,0.254843,0.44186,0.590214,...,0.809091,0.772313,0.742544,0.800485,0.875456,0.972019,0.862972,0.777778,0.640218,0.540705
901583,1.511801,0.696493,0.486322,0.316043,0.182927,0.074209,0.218182,0.497579,0.591187,0.933333,...,0.950909,0.87371,0.81619,0.83869,0.877886,0.934915,0.947016,0.896238,0.855239,0.837789
901584,1.344099,0.434704,0.255927,0.156815,0.115854,0.054745,0.192727,0.439467,1.179927,1.164526,...,1.236364,1.260474,0.913573,0.850212,0.975699,1.171533,1.108404,1.051578,0.933374,0.991495
901585,0.985714,0.369407,0.177508,0.115872,0.101829,0.072384,0.089697,0.276634,0.569155,0.665443,...,0.855152,0.928355,0.68594,0.683445,0.773998,0.787713,0.806334,0.671524,0.502726,0.442892


## Data clustering based on behaviour

In [9]:
from sklearn.cluster import KMeans

k_clusters = 6
# Fixed seed so that cluster labels are the same on every run.
random_seed = 42

# Cluster on the hourly profile only. If this cell is re-run, the frame may
# already contain a 'cluster' column from a previous run (and 'gid' as a
# regular column once the index has been reset); neither may be fed back
# into KMeans as a feature.
hourly_profile = hourly_avg_abs_bikes_diff_per_stations.drop(
    columns=['gid', 'cluster'], errors='ignore'
)

# KMeans; n_init is set explicitly so the number of restarts does not depend
# on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=k_clusters, n_init=10, random_state=random_seed)
clusters = kmeans.fit_predict(hourly_profile)

# Add cluster labels to the DataFrame
hourly_avg_abs_bikes_diff_per_stations['cluster'] = clusters

# Example of how the data looks after clustering
hourly_avg_abs_bikes_diff_per_stations.head()

datetime         0         1         2         3         4         5  \
gid                                                                    
901581    0.955280  0.235792  0.155623  0.126735  0.129268  0.074818   
901582    1.126708  0.359734  0.256535  0.199155  0.189024  0.156934   
901583    1.511801  0.696493  0.486322  0.316043  0.182927  0.074209   
901584    1.344099  0.434704  0.255927  0.156815  0.115854  0.054745   
901585    0.985714  0.369407  0.177508  0.115872  0.101829  0.072384   

datetime         6         7         8         9  ...        15        16  \
gid                                               ...                       
901581    0.119394  0.431598  0.790698  0.721101  ...  0.791135  0.719416   
901582    0.093939  0.254843  0.441860  0.590214  ...  0.772313  0.742544   
901583    0.218182  0.497579  0.591187  0.933333  ...  0.873710  0.816190   
901584    0.192727  0.439467  1.179927  1.164526  ...  1.260474  0.913573   
901585    0.089697  0.276634  0.5



In [10]:
# Make 'gid' a regular column so it can serve as the merge key. The guard
# keeps the cell re-runnable: resetting the index a second time would add a
# spurious 'index' column instead.
if 'gid' not in hourly_avg_abs_bikes_diff_per_stations.columns:
    hourly_avg_abs_bikes_diff_per_stations = hourly_avg_abs_bikes_diff_per_stations.reset_index()

# Remove labels left over from a previous run of this cell; otherwise the
# merge would produce 'cluster_x' / 'cluster_y' instead of 'cluster'.
if 'cluster' in data.columns:
    data = data.drop(columns='cluster')

# Attach each station's cluster label to every one of its rows.
# validate='many_to_one' makes pandas raise if 'gid' is not unique on the
# right-hand side, which would otherwise silently duplicate rows of `data`.
data = pd.merge(
    data,
    hourly_avg_abs_bikes_diff_per_stations[['gid', 'cluster']],
    on='gid',
    how='left',
    validate='many_to_one',
)

# Display the merged DataFrame
data.head()

Unnamed: 0,gid,available,free,total,w_temp,w_sky,w_precip,w_humid,w_wind_speed,w_clouds,...,day_category,day_motive,lat,long,datetime,occupancy,vacancy,bikes_diff,free_diff,cluster
0,901622,18,2,20,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,39.458781,-0.358658,2022-12-01 02:00:00,0.9,0.1,0.0,0.0,3
1,901673,9,29,38,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,39.472282,-0.343809,2022-12-01 02:00:00,0.236842,0.763158,0.0,0.0,1
2,901677,15,8,23,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,39.473068,-0.343132,2022-12-01 02:00:00,0.652174,0.347826,0.0,0.0,3
3,901679,3,18,21,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,39.471344,-0.340484,2022-12-01 02:00:00,0.142857,0.857143,0.0,0.0,3
4,901627,15,4,20,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,39.456512,-0.355418,2022-12-01 02:00:00,0.75,0.2,0.0,0.0,3


In [18]:
import folium

# array of all the stations in the network
stations = data.gid.unique()

# mean of the 'total' column per station; used for circle radius and tooltip
stations_size = data.groupby(["gid"])["total"].mean()

# First record of every station, built once. Filtering the full frame with
# `data[data.gid == station]` several times per station rescans every row
# on each lookup. Rows keep their order of first appearance, i.e. the same
# order as `stations`.
station_info = data.loc[
    ~data["gid"].duplicated(), ["gid", "lat", "long", "cluster"]
].set_index("gid")

f = folium.Figure(width=700, height=700)
# Named `stations_map` instead of `map` so the built-in `map` is not shadowed.
stations_map = folium.Map(location=[39.4502730411, -0.333362], tiles='OpenStreetMap', zoom_start=12)

colors = ['blue', 'red', 'green', 'orange', 'purple', 'magenta', 'brown', 'cyan', 'black', 'darkred', 'gray']

# One circle per station: colour encodes the cluster, radius scales with the
# station's mean 'total'.
for row in station_info.itertuples():
    station = row.Index
    cluster = int(row.cluster)  # list index must be an int
    folium.Circle(
        location=[row.lat, row.long],
        color=colors[cluster],
        radius=5 * stations_size.loc[station],
        fill=True,
        opacity=0.8,
        fill_opacity=0.2,
        tooltip='Station {} | Docks: {} | Cluster: {}'.format(
            station, stations_size.loc[station], cluster)
    ).add_to(stations_map)

f.add_child(stations_map)

## Save the dataset

In [16]:
# Write `data` to CSV; index=False leaves the row index out of the file.
data.to_csv(path_or_buf='./datasets/data_beh_group.csv', index=False)