In [98]:
# Import libraries
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter("ignore")

In [29]:
# Read in the trip summary data
file_path = '../data/trip_summary_201901.csv'
trips = pd.read_csv(file_path)
trips = trips.dropna(how="any")

trips['start_station_id'].nunique()

152

In [78]:
# Create station pair identifiers and group the data

# Function: sorts the pair array and concatenates them to create a unique ID
def sort_concat(arr):
    arr_sorted = np.sort(arr)
    pair_id = str(arr_sorted[0]) + '_' + str(arr_sorted[1])
    
    return pair_id

pair_id = trips[['start_station_id', 'end_station_id']].values
pair_id_sorted = list(map(sort_concat, pair_id))

trips['station_pair_id'] = pair_id_sorted

trips.head()

Unnamed: 0,start_date,start_hour,start_station_id,start_station_name,start_station_cap,start_station_has_kiosk,end_station_id,end_station_name,end_station_cap,end_station_has_kiosk,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,sum_duration_sec,sum_duration_min,trip_count,subscriber_trip_count,customer_trip_count,station_pair_id
0,2019-01-01,0,109,17th St at Valencia St,23.0,True,141,Valencia St at Cesar Chavez St,19.0,True,37.763316,-122.421904,37.747998,-122.420219,390,6.5,1,1,0,109_141
1,2019-01-01,0,109,17th St at Valencia St,23.0,True,91,Berry St at King St,23.0,True,37.763316,-122.421904,37.771762,-122.398438,780,13.0,1,1,0,91_109
2,2019-01-01,0,118,Eureka Valley Recreation Center,19.0,True,126,Esprit Park,31.0,True,37.759177,-122.436943,37.761634,-122.390648,1306,21.766667,1,1,0,118_126
3,2019-01-01,0,119,18th St at Noe St,15.0,True,98,Valencia St at 16th St,23.0,True,37.761047,-122.432642,37.765052,-122.421866,422,7.033333,1,1,0,98_119
4,2019-01-01,0,120,Mission Dolores Park,27.0,True,98,Valencia St at 16th St,23.0,True,37.76142,-122.426435,37.765052,-122.421866,277,4.616667,1,1,0,98_120


In [83]:
# Create an affinity matrix from the data
pair_trips = trips.groupby('station_pair_id', as_index=False)['trip_count'].sum()
pair_trips['station_i'] = pair_trips['station_pair_id'].str.split("_", n = 1, expand = True)[0]
pair_trips['station_j'] = pair_trips['station_pair_id'].str.split("_", n = 1, expand = True)[1]
intercluster_volume = pair_trips
pair_trips['trip_count_inv'] = 1 / pair_trips['trip_count']

pair_trips['station_i'] = pd.to_numeric(pair_trips['station_i']).astype(np.int64)
pair_trips['station_j'] = pd.to_numeric(pair_trips['station_j']).astype(np.int64)

stations_sorted = pair_trips.sort_values(['station_i'])['station_i'].unique()
pair_trips = pair_trips.sort_values(['station_i'])

print(stations_sorted,"\n")
print(len(stations_sorted),"\n")
print(pair_trips.head(), "\n")

# Generate affinity matrix from pair_trips
upp_mat = pair_trips.set_index(['station_i','station_j'])['trip_count_inv'].unstack().values

aff_mat = np.triu(upp_mat) + np.triu(upp_mat, 1).T
aff_mat[np.isnan(aff_mat)] = 0

print(aff_mat, "\n")
print(aff_mat.shape)

[  3   4   5   6   8   9  10  11  13  14  15  16  17  19  20  21  22  23
  24  25  26  27  28  29  30  31  33  34  36  37  39  41  42  43  44  47
  49  50  52  53  54  55  56  58  59  60  61  62  63  64  66  67  70  71
  72  73  74  75  76  77  78  79  80  81  84  85  86  87  88  89  90  91
  92  93  95  96  97  98  99 100 101 102 104 105 106 107 108 109 110 112
 113 114 115 116 118 119 120 121 122 123 124 125 126 127 129 130 131 132
 133 134 136 137 138 139 140 141 142 144 145 146 147 223 284 285 321 323
 324 336 343 345 349 350 355 356 358 359 360 361 362 363 364 365 368 369
 370 371 373 375 377 380 381 383] 

152 

     station_pair_id  trip_count  station_i  station_j  trip_count_inv
5014           3_350          61          3        350        0.016393
5036            3_39          14          3         39        0.071429
5037             3_4          17          3          4        0.058824
5038            3_41          51          3         41        0.019608
5039            3_4

In [63]:
# Spectral clustering on the affinity matrix
from sklearn.cluster import SpectralClustering

beta = 1
sim_mat = np.exp(-beta * aff_mat / aff_mat.std(ddof=0))
print(sim_mat)

sc = SpectralClustering(3, affinity='precomputed', n_init=100, assign_labels='discretize')
clusters = sc.fit_predict(sim_mat)
print(len(clusters))

[[0.94702883 0.82259411 0.96916493 ... 0.71748944 0.0361539  0.73947449]
 [0.82259411 0.5147911  0.88818931 ... 0.33066258 0.0361539  0.62233205]
 [0.96916493 0.88818931 0.94944798 ... 0.78888025 0.0361539  0.69150451]
 ...
 [0.71748944 0.33066258 0.78888025 ... 0.81261474 1.         0.0361539 ]
 [0.0361539  0.0361539  0.0361539  ... 1.         0.62233205 0.0361539 ]
 [0.73947449 0.62233205 0.69150451 ... 0.0361539  0.0361539  0.0361539 ]]
152


In [64]:
clusters

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 2, 1,
       2, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 0,
       0, 0, 2, 2, 1, 0, 1, 1, 0, 0, 2, 2, 0, 2, 1, 0, 2, 0, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 1, 2, 2, 2, 1, 0, 2, 2,
       2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1,
       2, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 2],
      dtype=int64)

In [67]:
list(zip(stations_sorted, clusters))

[(3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (8, 0),
 (9, 1),
 (10, 1),
 (11, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 0),
 (30, 1),
 (31, 0),
 (33, 0),
 (34, 0),
 (36, 1),
 (37, 1),
 (39, 2),
 (41, 1),
 (42, 1),
 (43, 0),
 (44, 1),
 (47, 1),
 (49, 1),
 (50, 1),
 (52, 0),
 (53, 0),
 (54, 0),
 (55, 0),
 (56, 2),
 (58, 1),
 (59, 2),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 0),
 (64, 1),
 (66, 1),
 (67, 1),
 (70, 0),
 (71, 0),
 (72, 0),
 (73, 0),
 (74, 0),
 (75, 0),
 (76, 0),
 (77, 2),
 (78, 0),
 (79, 1),
 (80, 2),
 (81, 1),
 (84, 0),
 (85, 0),
 (86, 0),
 (87, 0),
 (88, 2),
 (89, 2),
 (90, 1),
 (91, 0),
 (92, 1),
 (93, 1),
 (95, 0),
 (96, 0),
 (97, 2),
 (98, 2),
 (99, 0),
 (100, 2),
 (101, 1),
 (102, 0),
 (104, 2),
 (105, 0),
 (106, 2),
 (107, 2),
 (108, 2),
 (109, 2),
 (110, 2),
 (112, 2),
 (113, 2),
 (114, 2),
 (115, 2),
 (116, 2),
 (118, 0),
 (119, 2),
 (120, 0),
 (121, 2),
 (122, 

In [84]:
intercluster_volume.head()

Unnamed: 0,station_pair_id,trip_count,station_i,station_j,trip_count_inv
0,100_100,8,100,100,0.125
1,100_101,7,100,101,0.142857
2,100_102,9,100,102,0.111111
3,100_104,11,100,104,0.090909
4,100_105,6,100,105,0.166667


In [103]:
# map clusters to roundtrip volume data

# select start station and trip count to new dataframe
intercluster_volume = intercluster_volume[['station_i','station_j', 'trip_count']]

#define a mapping dictionary
cluster_dict = dict(zip(stations_sorted, clusters))

# map the clusters to the starting stations
intercluster_volume['cluster_i'] = intercluster_volume['station_i'].map(cluster_dict)
intercluster_volume['cluster_j'] = intercluster_volume['station_j'].map(cluster_dict)


intercluster_volume.loc[intercluster_volume.cluster_i == intercluster_volume.cluster_j, 'intercluster_trip'] = 0 
intercluster_volume.loc[intercluster_volume.cluster_i != intercluster_volume.cluster_j, 'intercluster_trip'] = 1 

intercluster_volume['intercluster_volume'] = intercluster_volume['intercluster_trip']*intercluster_volume['trip_count']

intercluster_volume.head()

Unnamed: 0,station_i,station_j,trip_count,cluster_i,cluster_j,intercluster_trip,intercluster_volume
0,100,100,8,2,2,0.0,0.0
1,100,101,7,2,1,1.0,7.0
2,100,102,9,2,0,1.0,9.0
3,100,104,11,2,2,0.0,0.0
4,100,105,6,2,0,1.0,6.0


In [104]:
intercluster_volume['intercluster_volume'].sum()

58104.0

In [111]:
# Spectral clustering on the affinity matrix
from sklearn.cluster import SpectralClustering

i = 2

while i <= 10:

    sc = SpectralClustering(i, affinity='precomputed', n_init=100, assign_labels='discretize')
    clusters = sc.fit_predict(sim_mat)
    
    # select start station and trip count to new dataframe
    intercluster_volume = intercluster_volume[['station_i','station_j', 'trip_count']]

    #define a mapping dictionary
    cluster_dict = dict(zip(stations_sorted, clusters))

    # map the clusters to the starting stations
    intercluster_volume['cluster_i'] = intercluster_volume['station_i'].map(cluster_dict)
    intercluster_volume['cluster_j'] = intercluster_volume['station_j'].map(cluster_dict)


    intercluster_volume.loc[intercluster_volume.cluster_i == intercluster_volume.cluster_j, 'intercluster_trip'] = 0 
    intercluster_volume.loc[intercluster_volume.cluster_i != intercluster_volume.cluster_j, 'intercluster_trip'] = 1 

    intercluster_volume['intercluster_volume'] = intercluster_volume['intercluster_trip']*intercluster_volume['trip_count']

    print(intercluster_volume['intercluster_volume'].sum())

    i = i + 1

37719.0
58104.0
73586.0
78853.0
81138.0
90381.0
101995.0
102385.0
110834.0


In [112]:
station_clustered = list(zip(stations_sorted, clusters))
df_station_clustered = pd.DataFrame(station_clustered)
df_station_clustered.rename(columns={0:'station_id',
                                    1: 'cluster'},inplace=True)

start = trips[['start_station_id', 'start_station_name', 'start_station_latitude', 'start_station_longitude']].drop_duplicates().rename(columns = {'start_station_id':'station_id', \
                                                 'start_station_name':'station_name', \
                                                 'start_station_latitude':'station_latitude', 
                                                 'start_station_longitude': 'station_longitude'})

end = trips[['end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude']].drop_duplicates().rename(columns = {'end_station_id':'station_id', \
                                             'end_station_name':'station_name', \
                                             'end_station_latitude':'station_latitude', \
                                             'end_station_longitude': 'station_longitude'})

df_loc = pd.concat([start, end]).drop_duplicates()
df_all = df_loc.merge(df_station_clustered, on=["station_id"], how="right")
df_all.to_csv("df_station_clustered.csv", index=None)
df_all.head()

Unnamed: 0,station_id,station_name,station_latitude,station_longitude,cluster
0,109,17th St at Valencia St,37.763316,-122.421904,2
1,118,Eureka Valley Recreation Center,37.759177,-122.436943,6
2,119,18th St at Noe St,37.761047,-122.432642,2
3,120,Mission Dolores Park,37.76142,-122.426435,2
4,121,Mission Playground,37.75921,-122.421339,2
