In [1]:
# Import libraries
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter("ignore")

In [2]:
# Read in the trip summary data
file_path = '../data/trip_summary_201901.csv'
trips = pd.read_csv(file_path)
trips = trips.dropna(how="any")

trips['start_station_id'].nunique()

152

In [3]:
# Create station pair identifiers and group the data

# Function: sorts the pair array and concatenates them to create a unique ID
def sort_concat(arr):
    arr_sorted = np.sort(arr)
    pair_id = str(arr_sorted[0]) + '_' + str(arr_sorted[1])
    
    return pair_id

pair_id = trips[['start_station_id', 'end_station_id']].values
pair_id_sorted = list(map(sort_concat, pair_id))

trips['station_pair_id'] = pair_id_sorted

trips.head()

Unnamed: 0,start_date,start_hour,start_station_id,start_station_name,start_station_cap,start_station_has_kiosk,end_station_id,end_station_name,end_station_cap,end_station_has_kiosk,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,sum_duration_sec,sum_duration_min,trip_count,subscriber_trip_count,customer_trip_count,station_pair_id
0,2019-01-01,0,109,17th St at Valencia St,23.0,True,141,Valencia St at Cesar Chavez St,19.0,True,37.763316,-122.421904,37.747998,-122.420219,390,6.5,1,1,0,109_141
1,2019-01-01,0,109,17th St at Valencia St,23.0,True,91,Berry St at King St,23.0,True,37.763316,-122.421904,37.771762,-122.398438,780,13.0,1,1,0,91_109
2,2019-01-01,0,118,Eureka Valley Recreation Center,19.0,True,126,Esprit Park,31.0,True,37.759177,-122.436943,37.761634,-122.390648,1306,21.766667,1,1,0,118_126
3,2019-01-01,0,119,18th St at Noe St,15.0,True,98,Valencia St at 16th St,23.0,True,37.761047,-122.432642,37.765052,-122.421866,422,7.033333,1,1,0,98_119
4,2019-01-01,0,120,Mission Dolores Park,27.0,True,98,Valencia St at 16th St,23.0,True,37.76142,-122.426435,37.765052,-122.421866,277,4.616667,1,1,0,98_120


### Generate an Affinity Matrix from the Data

In [79]:
# Create a new dataframe that sums the roundtrips between two stations into one row of data
# We will be able to create a matrix on how each station interactions with other stations
pair_trips = trips.groupby('station_pair_id', as_index=False)['trip_count','sum_duration_min'].sum()
pair_trips['station_i'] = pair_trips['station_pair_id'].str.split("_", n = 1, expand = True)[0]
pair_trips['station_j'] = pair_trips['station_pair_id'].str.split("_", n = 1, expand = True)[1]
intercluster_volume = pair_trips
pair_trips['trip_count_inv'] = 1 / pair_trips['trip_count']

pair_trips['station_i'] = pd.to_numeric(pair_trips['station_i']).astype(np.int64)
pair_trips['station_j'] = pd.to_numeric(pair_trips['station_j']).astype(np.int64)

# Sort the dataframe by the starting station, so we can link up the clusters to the correct row of data later on
stations_sorted = pair_trips.sort_values(['station_i'])['station_i'].unique()
pair_trips = pair_trips.sort_values(['station_i'])

# print(stations_sorted,"\n")
# print(len(stations_sorted),"\n")
# print(pair_trips.head(), "\n")

# Generate affinity matrix from pair_trips
upp_mat = pair_trips.set_index(['station_i','station_j'])['trip_count_inv'].unstack().values

aff_mat = np.triu(upp_mat) + np.triu(upp_mat, 1).T
aff_mat[np.isnan(aff_mat)] = 0

print(aff_mat, "\n")
print(aff_mat.shape)

[[0.01639344 0.05882353 0.00943396 ... 0.1        1.         0.09090909]
 [0.05882353 0.2        0.03571429 ... 0.33333333 1.         0.14285714]
 [0.00943396 0.03571429 0.015625   ... 0.07142857 1.         0.11111111]
 ...
 [0.1        0.33333333 0.07142857 ... 0.0625     0.         1.        ]
 [1.         1.         1.         ... 0.         0.14285714 1.        ]
 [0.09090909 0.14285714 0.11111111 ... 1.         1.         1.        ]] 

(152, 152)


In [35]:
# Spectral clustering on the affinity matrix
from sklearn.cluster import SpectralClustering

beta = 1
sim_mat = np.exp(-beta * aff_mat / aff_mat.std(ddof=0))
print(sim_mat)

sc = SpectralClustering(3, affinity='precomputed', n_init=100, assign_labels='discretize')
clusters = sc.fit_predict(sim_mat)
print(len(clusters))

[[0.94702883 0.82259411 0.96916493 ... 0.71748944 0.0361539  0.73947449]
 [0.82259411 0.5147911  0.88818931 ... 0.33066258 0.0361539  0.62233205]
 [0.96916493 0.88818931 0.94944798 ... 0.78888025 0.0361539  0.69150451]
 ...
 [0.71748944 0.33066258 0.78888025 ... 0.81261474 1.         0.0361539 ]
 [0.0361539  0.0361539  0.0361539  ... 1.         0.62233205 0.0361539 ]
 [0.73947449 0.62233205 0.69150451 ... 0.0361539  0.0361539  0.0361539 ]]
152


In [77]:
# clusters

In [78]:
# list(zip(stations_sorted, clusters))

### Test performance of spectral clustering on 1-15 clusters

#### Output from this cell shows the mean volume and time for intercluster travel

In [75]:
# Spectral clustering on the affinity matrix
from sklearn.cluster import SpectralClustering

print("clusters","  ", "avg intercluster volume", " ", "avg intercluster time")

i = 2

while i <= 15:

    sc = SpectralClustering(i, affinity='precomputed', n_init=100, assign_labels='discretize')
    clusters = sc.fit_predict(sim_mat)
    
    # select start station and trip count to new dataframe
    intercluster_volume = intercluster_volume[['station_i','station_j', 'trip_count','sum_duration_min']]

    #define a mapping dictionary
    cluster_dict = dict(zip(stations_sorted, clusters))

    # map the clusters to the starting stations
    intercluster_volume['cluster_i'] = intercluster_volume['station_i'].map(cluster_dict)
    intercluster_volume['cluster_j'] = intercluster_volume['station_j'].map(cluster_dict)


    intercluster_volume.loc[intercluster_volume.cluster_i == intercluster_volume.cluster_j, 'intercluster_trip'] = 0 
    intercluster_volume.loc[intercluster_volume.cluster_i != intercluster_volume.cluster_j, 'intercluster_trip'] = 1 

    intercluster_volume['intercluster_volume'] = intercluster_volume['intercluster_trip']*intercluster_volume['trip_count']
    intercluster_volume['intercluster_time_volume'] = intercluster_volume['intercluster_trip']*intercluster_volume['sum_duration_min']

    print(i, "         ", intercluster_volume['intercluster_volume'].mean(),
          "         ", intercluster_volume['intercluster_time_volume'].mean())

    i = i + 1

clusters    avg intercluster volume   avg intercluster time
2           3.9095149253731343           61.31896075179676
3           6.046330845771144           90.13701457987833
4           8.06540215588723           110.51857207020448
5           8.634535655058043           117.68637368711991
6           8.90308872305141           122.7616172609175
7           9.67858623548922           131.6637714206743
8           10.329498341625207           137.59300027639583
9           10.879145936981757           140.80326319789927
10           11.387541459369817           150.22819755389702
11           10.799025704809287           143.00603406578196
12           10.73476368159204           141.93739289662795
13           11.359970978441128           148.1271282476505
14           12.297470978441128           158.27736491155335
15           11.080949419568823           144.79847809563256


### Generate a Dataframe that has the pertinent information for Data Visualization

In [112]:
station_clustered = list(zip(stations_sorted, clusters))
df_station_clustered = pd.DataFrame(station_clustered)
df_station_clustered.rename(columns={0:'station_id',
                                    1: 'cluster'},inplace=True)

start = trips[['start_station_id', 'start_station_name', 'start_station_latitude', 'start_station_longitude']].drop_duplicates().rename(columns = {'start_station_id':'station_id', \
                                                 'start_station_name':'station_name', \
                                                 'start_station_latitude':'station_latitude', 
                                                 'start_station_longitude': 'station_longitude'})

end = trips[['end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude']].drop_duplicates().rename(columns = {'end_station_id':'station_id', \
                                             'end_station_name':'station_name', \
                                             'end_station_latitude':'station_latitude', \
                                             'end_station_longitude': 'station_longitude'})

df_loc = pd.concat([start, end]).drop_duplicates()
df_all = df_loc.merge(df_station_clustered, on=["station_id"], how="right")
df_all.to_csv("df_station_clustered.csv", index=None)
df_all.head()

Unnamed: 0,station_id,station_name,station_latitude,station_longitude,cluster
0,109,17th St at Valencia St,37.763316,-122.421904,2
1,118,Eureka Valley Recreation Center,37.759177,-122.436943,6
2,119,18th St at Noe St,37.761047,-122.432642,2
3,120,Mission Dolores Park,37.76142,-122.426435,2
4,121,Mission Playground,37.75921,-122.421339,2


### Extra Code for single run of volume and time volume aggregation

In [73]:
# map clusters to roundtrip volume and rountrip volume time data

# select start station and trip count to new dataframe
intercluster_volume = intercluster_volume[['station_i','station_j', 'trip_count','sum_duration_min']]

#define a mapping dictionary
cluster_dict = dict(zip(stations_sorted, clusters))

# map the clusters to the starting stations
intercluster_volume['cluster_i'] = intercluster_volume['station_i'].map(cluster_dict)
intercluster_volume['cluster_j'] = intercluster_volume['station_j'].map(cluster_dict)


intercluster_volume.loc[intercluster_volume.cluster_i == intercluster_volume.cluster_j, 'intercluster_trip'] = 0 
intercluster_volume.loc[intercluster_volume.cluster_i != intercluster_volume.cluster_j, 'intercluster_trip'] = 1 

intercluster_volume['intercluster_volume'] = intercluster_volume['intercluster_trip']*intercluster_volume['trip_count']
intercluster_volume['intercluster_time_volume'] = intercluster_volume['intercluster_trip']*intercluster_volume['sum_duration_min']

intercluster_volume.head()

Unnamed: 0,station_i,station_j,trip_count,sum_duration_min,cluster_i,cluster_j,intercluster_trip,intercluster_volume,intercluster_time_volume
0,100,100,8,72.833333,8,8,0.0,0.0,0.0
1,100,101,7,18.816667,8,3,1.0,7.0,18.816667
2,100,102,9,40.783333,8,5,1.0,9.0,40.783333
3,100,104,11,106.333333,8,4,1.0,11.0,106.333333
4,100,105,6,54.516667,8,0,1.0,6.0,54.516667
