# Data clustering based on stations' position
One way to cluster stations is by their geographic position. The idea is that stations in the same neighbourhood will behave similarly, i.e. they will show a similar pattern of usage over time.

In [1]:
import pandas as pd

## Load data

In [2]:
# Read the dataset; the file is comma-separated, which is pandas' default delimiter.
dataset_path = './datasets/dataset.csv'
data = pd.read_csv(dataset_path)

# Show the first rows as a quick sanity check of the load.
data.head()

  data = pd.read_csv('./datasets/dataset.csv', sep=',')


Unnamed: 0,gid,available,free,total,w_temp,w_sky,w_precip,w_humid,w_wind_speed,w_clouds,...,dayofweek,day_category,day_motive,lat,long,datetime,occupancy,vacancy,bikes_diff,free_diff
0,901622,18,2,20,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.458781,-0.358658,2022-12-01 02:00:00,0.9,0.1,0.0,0.0
1,901673,9,29,38,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.472282,-0.343809,2022-12-01 02:00:00,0.236842,0.763158,0.0,0.0
2,901677,15,8,23,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.473068,-0.343132,2022-12-01 02:00:00,0.652174,0.347826,0.0,0.0
3,901679,3,18,21,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.471344,-0.340484,2022-12-01 02:00:00,0.142857,0.857143,0.0,0.0
4,901627,15,4,20,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Thursday,Working day,,39.456512,-0.355418,2022-12-01 02:00:00,0.75,0.2,0.0,0.0


# Data clustering based on location

In [3]:
# One row per distinct (gid, lat, long) combination found in the data.
location_columns = ["gid", "lat", "long"]
locations = data[location_columns].drop_duplicates()
print(locations.head())

# If there are more rows than distinct gids, at least one station
# appears with more than one coordinate pair.
distinct_station_count = len(locations["gid"].unique())
if distinct_station_count != len(locations):
    print("some stations has multiple locations")

      gid        lat      long
0  901622  39.458781 -0.358658
1  901673  39.472282 -0.343809
2  901677  39.473068 -0.343132
3  901679  39.471344 -0.340484
4  901627  39.456512 -0.355418


In [4]:
from sklearn.cluster import KMeans

# Selecting lat and long columns for clustering
X = locations[['lat', 'long']]

# Number of clusters
k_clusters = 7

# KMeans
# random_state pins the centroid initialisation so that cluster labels (and
# everything derived from them, including the saved dataset) are identical on
# every Restart & Run All. n_init is set explicitly because its default value
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=k_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Adding cluster labels
# assign() returns a new frame instead of writing a column into a frame that
# was derived from a slice of `data`, which avoids SettingWithCopyWarning and
# keeps this cell safe to re-run.
locations = locations.assign(cluster=kmeans.labels_)

print(locations)



        gid        lat      long  cluster
0    901622  39.458781 -0.358658        1
1    901673  39.472282 -0.343809        3
2    901677  39.473068 -0.343132        3
3    901679  39.471344 -0.340484        3
4    901627  39.456512 -0.355418        1
..      ...        ...       ...      ...
271  901651  39.460380 -0.333531        3
272  901649  39.461952 -0.338721        3
273  901647  39.458860 -0.336798        3
274  901657  39.484973 -0.365667        0
275  901663  39.477585 -0.366970        0

[276 rows x 4 columns]


In [5]:
import folium

# array of all the stations in the network
stations = data.gid.unique()

# extracting information about each station's total number of available racks
stations_size = data.groupby(["gid"])["total"].mean()

f = folium.Figure(width=700, height=700)
# Named `station_map` rather than `map` so the Python builtin map() is not shadowed.
station_map = folium.Map(location=[39.4502730411, -0.333362], tiles='OpenStreetMap', zoom_start=12)

colors = ['blue', 'red', 'green', 'orange', 'purple', 'darkblue', 'brown', 'cyan', 'black', 'darkred', 'gray']

# One row per station (first recorded location wins), computed once instead of
# re-filtering `locations` several times for every station inside the loop.
station_rows = locations.drop_duplicates(subset='gid')

# itertuples keeps each column's own dtype, so `cluster` stays an integer
# and can be used directly as a list index.
for row in station_rows.itertuples(index=False):
    docks = stations_size.loc[row.gid]
    folium.Circle(
        location=[row.lat, row.long],
        # Modulo keeps the lookup valid even if there are more clusters than colours.
        color=colors[row.cluster % len(colors)],
        # Circle size scales with the station's mean dock count.
        radius=5 * docks,
        fill=True,
        opacity=0.8,
        fill_opacity=0.2,
        tooltip='Station {} | Docks: {} | Cluster: {}'.format(row.gid, docks, row.cluster)
    ).add_to(station_map)

# Last expression of the cell: renders the figure containing the map.
f.add_child(station_map)


## Put beach stations into a separate cluster
Because KMeans usually does not put the beach stations into a cluster of their own, we assign them to a separate cluster manually.

In [6]:
# List of GIDs of stations near beach
beach_stations = [901750, 901749, 901747, 901745, 901744, 901743, 901856, 901751, 901748, 901746, 901734, 901738]

# The beach cluster takes the first id after the KMeans labels (0 .. n_clusters - 1).
# Deriving it from the fitted model, rather than from the mutable k_clusters
# counter, makes this cell idempotent: re-running it neither moves the beach
# stations to yet another id nor inflates the cluster count.
beach_cluster = kmeans.n_clusters

# Update cluster for stations near beach
locations.loc[locations['gid'].isin(beach_stations), 'cluster'] = beach_cluster

# Total number of clusters: the KMeans clusters plus the beach cluster.
k_clusters = beach_cluster + 1

In [7]:
f = folium.Figure(width=700, height=700)
# Named `station_map` rather than `map` so the Python builtin map() is not shadowed.
station_map = folium.Map(location=[39.4502730411, -0.333362], tiles='OpenStreetMap', zoom_start=12)

colors = ['blue', 'red', 'green', 'orange', 'purple', 'magenta', 'brown', 'cyan', 'black', 'darkred', 'gray']

# One row per station (first recorded location wins), computed once instead of
# re-filtering `locations` several times for every station inside the loop.
station_rows = locations.drop_duplicates(subset='gid')

# itertuples keeps each column's own dtype, so `cluster` stays an integer
# and can be used directly as a list index.
for row in station_rows.itertuples(index=False):
    docks = stations_size.loc[row.gid]
    folium.Circle(
        location=[row.lat, row.long],
        # Modulo keeps the lookup valid even if there are more clusters than colours.
        color=colors[row.cluster % len(colors)],
        # Circle size scales with the station's mean dock count.
        radius=5 * docks,
        fill=True,
        opacity=0.8,
        fill_opacity=0.2,
        tooltip='Station {} | Docks: {} | Cluster: {}'.format(row.gid, docks, row.cluster)
    ).add_to(station_map)

# Last expression of the cell: renders the figure containing the map.
f.add_child(station_map)

## Save the dataset

In [8]:
# Attach the cluster label to every observation.
#
# The join uses the full (gid, lat, long) key rather than gid alone:
#   * `locations` is unique on that triple, so every row of `data` matches at
#     most one location and no rows are duplicated, even if a station was
#     recorded at more than one coordinate pair;
#   * lat/long are join keys, so no lat_x/lat_y/long_x/long_y columns are
#     created and no row-wise coalescing is needed afterwards.
# Dropping any existing `cluster` column first makes the cell safe to re-run.
data = (
    data
    .drop(columns='cluster', errors='ignore')
    .merge(
        locations[['gid', 'lat', 'long', 'cluster']],
        on=['gid', 'lat', 'long'],
        how='left',
        validate='many_to_one',  # fail loudly if `locations` ever stops being unique on the key
    )
)

# Keep lat/long as the trailing columns, after `cluster`, so the saved file
# keeps the same column layout as before.
trailing_columns = ['lat', 'long']
leading_columns = [col for col in data.columns if col not in trailing_columns]
data = data[leading_columns + trailing_columns]

data.head()

Unnamed: 0,gid,available,free,total,w_temp,w_sky,w_precip,w_humid,w_wind_speed,w_clouds,...,day_category,day_motive,datetime,occupancy,vacancy,bikes_diff,free_diff,cluster,lat,long
0,901622,18,2,20,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,2022-12-01 02:00:00,0.9,0.1,0.0,0.0,1,39.458781,-0.358658
1,901673,9,29,38,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,2022-12-01 02:00:00,0.236842,0.763158,0.0,0.0,3,39.472282,-0.343809
2,901677,15,8,23,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,2022-12-01 02:00:00,0.652174,0.347826,0.0,0.0,3,39.473068,-0.343132
3,901679,3,18,21,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,2022-12-01 02:00:00,0.142857,0.857143,0.0,0.0,3,39.471344,-0.340484
4,901627,15,4,20,12.0,Cielo cubierto,0.0,66.0,6.8,88.0,...,Working day,,2022-12-01 02:00:00,0.75,0.2,0.0,0.0,1,39.456512,-0.355418


In [9]:
# Write the dataset to disk; index=False keeps the DataFrame index out of the file.
output_path = './datasets/data_pos_group.csv'
data.to_csv(output_path, index=False)