In [187]:
import pandas as pd
import networkx as net
from itertools import chain

In [194]:
# Read the five raw input tables in one pass; paths are relative to the
# notebook's working directory and are consumed in the order listed.
visitors, stations, connections, ranks, ratio = (
    pd.read_csv(csv_path)
    for csv_path in (
        'raw_data/visitor.csv',
        'raw_data/stations.csv',
        'raw_data/connections.csv',
        'raw_data/ranks.csv',
        'raw_data/ratio.csv',
    )
)

In [182]:
# Preview the first five station records.
stations.head(n=5)

Unnamed: 0,station_id,name,lat,lon
0,398,Atlantic Ave & Furman St,40.6917,-74.0
1,470,W 20 St & 8 Ave,40.7435,-74.0
2,263,Elizabeth St & Hester St,40.7173,-73.9964
3,3050,Putnam Ave & Throop Ave,40.6852,-73.9411
4,3565,36 Ave & 10 St,40.7614,-73.9411


<h2>Connections

Examining the sizes of the connected subgraphs reveals several nodes that are disconnected from the rest of the graph. Inspection shows that these are specialty locations that are used for a variety of reasons but are not open to the public. They will be removed.

In [184]:
# Preview the first five station-to-station connection records.
connections.head(n=5)

Unnamed: 0,src,src_name,dst,dst_name,total
0,2006,Central Park S & 6 Ave,2006,Central Park S & 6 Ave,114726
1,281,Grand Army Plaza & Central Park S,281,Grand Army Plaza & Central Park S,49478
2,387,Centre St & Chambers St,387,Centre St & Chambers St,40142
3,499,Broadway & W 60 St,499,Broadway & W 60 St,38776
4,514,12 Ave & W 40 St,426,West St & Chambers St,33392


In [185]:
# Build an undirected graph with one edge per (src, dst) station pair. Each edge
# stores the value of the row's 'total' column under the 'total' attribute.
# Because the graph is undirected, a later row for the same pair (in either
# direction) overwrites the 'total' written by an earlier row.
g = net.Graph()

# Zipping the three needed columns avoids building a Series per row, which is
# what makes `iterrows()` slow on large frames; the attribute is attached in the
# same call that creates the edge.
for src, dst, total in zip(connections['src'], connections['dst'], connections['total']):
    g.add_edge(src, dst, total=total)

In [186]:
# One independent subgraph per connected component of `g`.
# `net.connected_component_subgraphs` was removed in networkx 2.4; building the
# subgraphs from `connected_components` works on both older and newer releases,
# and `.copy()` keeps each component a standalone graph rather than a view.
sub = [g.subgraph(nodes).copy() for nodes in net.connected_components(g)]

# Number of stations in each component, in the order the components were found.
print([len(component) for component in sub])

[843, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]


In [189]:
# Gather the node ids of every component that holds at most two stations.
flat = [node for component in sub if len(component) <= 2 for node in component.nodes()]

# Display the station records behind those ids.
stations[stations['station_id'].isin(flat)]

Unnamed: 0,station_id,name,lat,lon
47,3036,8D OPS 01,40.518,-74.0314
250,3000,MLSWKiosk,40.7555,-73.9865
270,3239,Bressler,40.6465,-74.0166
324,3485,NYCBS Depot - RIS,40.7252,-73.9747
377,3036,8D OPS 01,40.6849,-74.0254
491,3266,Kiosk in a box Deployment,40.7086,-73.9285
557,3470,Expansion Tech Station,40.6698,-73.9949
558,3470,Gowanus Tech Station,40.6698,-73.9949
574,3488,8D QC Station 01,45.5064,-73.5695
748,3650,8D Mobile 01,45.5059,-73.5691


In [190]:
# Keep only the stations whose id is not in the isolated set `flat`.
stations = stations.loc[~stations['station_id'].isin(flat)]

<h2>Circular Stations

This next chart displays stations ordered by the percentage of circular trips. A circular trip is defined as a trip that begins and ends at the same station. It is likely that these stations are more popular with tourists and joy riders than with commuters.

Stations with 'tech' or 'depot' are not open to the public.

In [205]:
# Attach station coordinates to each visitor record, keep one row per station
# id, and order the result from the highest 'percent' to the lowest.
circle = (
    visitors
    .merge(stations, left_on='src', right_on='station_id')
    .loc[:, ['station_id', 'name_x', 'percent', 'lat', 'lon']]
    .rename(columns={'name_x': 'name', 'station_id': 'id'})
    .drop_duplicates(subset='id')  # station ids can repeat in `stations`; the first match wins
    .sort_values('percent', ascending=False)
)
circle.head(n=20)

Unnamed: 0,id,name,percent,lat,lon
184,3239,Bressler,1.0,40.6465,-74.0166
183,3488,8D QC Station 01,1.0,45.5064,-73.5695
182,3000,MLSWKiosk,1.0,40.7555,-73.9865
181,3633,333 Johnson TEST 1,1.0,0.0,0.0
180,3446,NYCBS Depot - STY - Valet Scan,1.0,0.0,0.0
226,3650,8D Mobile 01,0.984848,45.5059,-73.5691
227,3036,8D OPS 01,0.96,40.518,-74.0314
230,3266,Kiosk in a box Deployment,0.857143,40.7086,-73.9285
231,3470,Expansion Tech Station,0.777778,40.6698,-73.9949
258,3181,Soissons Landing,0.576471,40.6926,-74.0164


In [206]:
# Persist the circular-trip ranking for later use.
circle.to_csv('output/circle.csv')

<h2>Page Ranks

Page Rank identifies the stations that are most important and central to the network. It uses a simplified version of Google's Page Rank algorithm. Locations that receive bikes from many other locations are generally ranked higher.

In [207]:
# Attach station coordinates to each rank record, keep one row per station id,
# and order the result from the highest 'pagerank' to the lowest.
page = (
    ranks
    .merge(stations, left_on='id', right_on='station_id')
    .loc[:, ['id', 'name_x', 'pagerank', 'lat', 'lon']]
    .rename(columns={'name_x': 'name'})
    .drop_duplicates(subset='id')  # station ids can repeat in `stations`; the first match wins
    .sort_values('pagerank', ascending=False)
)
page.head(n=20)

Unnamed: 0,id,name,pagerank,lat,lon
0,519,Pershing Square North,11.673778,40.7524,-73.9784
9,3016,Kent Ave & N 7 St,4.534812,40.7204,-73.9617
531,517,Pershing Square South,4.134365,40.7515,-73.978
540,532,S 5 Pl & S 5 St,3.916161,40.7105,-73.9609
544,504,1 Ave & E 15 St,3.583905,40.7322,-73.9817
548,468,Broadway & W 55 St,3.434931,40.7653,-73.9819
15,527,E 33 St & 2 Ave,3.400347,40.744,-73.9761
19,2006,Central Park S & 6 Ave,3.395891,40.7659,-73.9763
20,497,E 17 St & Broadway,3.351234,40.737,-73.9901
21,387,Centre St & Chambers St,3.3056,40.7127,-74.0046


In [208]:
# Persist the PageRank ordering for later use.
page.to_csv('output/page.csv')

<h2>Ratio

This represents the ratio between arrivals and departures at a station. A station with a large ratio sees many trips end there (but few begin), while a station with a small ratio sees many trips start there (but few end).

Trips Often End (but rarely begin)

In [198]:
# Attach station coordinates to each ratio record and keep one row per station
# id. The frame is left unsorted so later cells can order it either way.
temp = (
    ratio
    .merge(stations, left_on='id', right_on='station_id')
    .loc[:, ['id', 'name_x', 'ratio', 'lat', 'lon']]
    .rename(columns={'name_x': 'name'})
    .drop_duplicates(subset='id')  # station ids can repeat in `stations`; the first match wins
)

# Twenty stations with the largest ratio.
temp.sort_values('ratio', ascending=False).head(n=20)

Unnamed: 0,id,name,ratio,lat,lon
332,3250,NYCBS Depot - PIT,11.926829,40.7169,-73.9838
333,3219,NYCBS Depot - STY,4.120743,40.7292,-73.9767
334,3240,NYCBS Depot BAL - DYR,3.745098,0.0,0.0
336,3432,Bike in Movie Night | Prospect Park Bandshell,2.874747,40.6691,-73.9946
142,3636,Expansion Warehouse 333 Johnson Ave,2.090909,40.7077,-73.9345
143,3468,NYCBS Depot - STY - Garage 4,2.043062,40.7304,-73.9748
144,3446,NYCBS Depot - STY - Valet Scan,2.0,0.0,0.0
145,3385,2 Ave & E 105 St,2.0,40.7898,-73.943
146,3543,Morningside Dr & Amsterdam Ave,2.0,40.8103,-73.9574
147,3631,Crown St & Bedford Ave,2.0,40.6666,-73.9567


In [209]:
# Persist every station, ordered from the largest ratio to the smallest.
temp.sort_values('ratio', ascending=False).to_csv('output/high_ratio.csv')

Trips Often Begin (but rarely end)

In [212]:
# Twenty stations with the smallest ratio.
temp.sort_values(by='ratio', ascending=True).head(n=20)

Unnamed: 0,id,name,ratio,lat,lon
517,3480,WS Don't Use,0.233333,0.0,0.0
516,3450,Penn Station Valet - Valet Scan,0.237725,40.7513,-73.9969
515,3197,Hs Don't Use,0.470588,40.7193,-74.0342
514,3230,Penn Station Valet,0.585521,40.7513,-73.9969
743,3302,Columbus Ave & W 103 St,0.62749,40.7969,-73.9643
739,3014,E.T. Bike-In Movie Valet Station,0.690741,40.7229,-73.9591
735,3236,W 42 St & Dyer Ave,0.696407,40.759,-73.9938
734,3539,W 116 St & Amsterdam Ave,0.716004,40.8068,-73.9607
733,399,Lafayette Ave & St James Pl,0.744359,40.6885,-73.9648
732,289,Monroe St & Classon Ave,0.750615,40.6846,-73.9588


In [210]:
# Persist every station, ordered from the smallest ratio to the largest.
temp.sort_values(by='ratio', ascending=True).to_csv('output/low_ratio.csv')

<h2>Islands

Each connection is weighted by the total number of trips taken between the two stations. We can raise the "water level" by trimming edges that have fewer trips than some threshold. Given the right water level, this will result in a number of islands that represent the most densely connected stations.

In [200]:
# Drop every connection that touches an isolated station: a row is kept only
# when BOTH its source and its destination are outside `flat`.
touches_isolated = connections['src'].isin(flat) | connections['dst'].isin(flat)
connections = connections[~touches_isolated]

# Rebuild the undirected graph from the remaining rows. Each edge stores the
# row's 'total' value; a later row for the same station pair overwrites it.
g = net.Graph()

# Zipping the needed columns avoids the per-row Series that `iterrows()` builds.
for src, dst, total in zip(connections['src'], connections['dst'], connections['total']):
    g.add_edge(src, dst, total=total)

In [202]:
# This cell's code has been adapted from Social Network Analysis for Startups
def trim_edges(graph, weight=1):
    """Return a new graph holding only the edges whose 'total' exceeds `weight`.

    Args:
        graph: graph whose edges each carry a 'total' attribute.
        weight: exclusive lower bound; an edge is kept only when its
            'total' is strictly greater than this value.

    Returns:
        A new `net.Graph` containing the surviving edges together with their
        attributes. Only edges are copied, so a node with no surviving edge
        does not appear in the result.
    """
    g2 = net.Graph()
    for f, to, edata in graph.edges(data=True):
        if edata['total'] > weight:
            # Pass the attributes as keyword arguments. networkx >= 2.0 has no
            # `attr_dict` parameter, so `attr_dict=edata` would store the whole
            # dict under one attribute named 'attr_dict' and lose 'total'.
            # Unpacking works on 1.x and 2.x alike (attribute keys must be str).
            g2.add_edge(f, to, **edata)
    return g2

def island_method(graph, iterations=5, max_weight=51626):
    """Trim `graph` at a series of evenly spaced 'total' thresholds.

    Thresholds start at the smallest edge 'total' in `graph` and advance in
    equal integer steps up to, but not including, `max_weight`.

    Args:
        graph: graph whose edges each carry a 'total' attribute.
        iterations: number of intervals the threshold range is divided into.
        max_weight: exclusive upper bound of the threshold range. The default
            is the ceiling this notebook has always used.
            NOTE(review): the origin of 51626 is not recorded here — confirm
            before reusing it on another data set.

    Returns:
        A list of `[threshold, trimmed_graph]` pairs in ascending threshold
        order, or an empty list when `graph` has no edges.
    """
    weights = [edata['total'] for _, _, edata in graph.edges(data=True)]
    if not weights:
        # Nothing to trim; min() would raise on an empty sequence.
        return []

    mn = int(min(weights))
    mx = int(max_weight)

    # A zero step makes range() raise, which happens whenever `iterations`
    # exceeds the span between mn and mx; fall back to a step of one.
    step = int((mx - mn) / iterations) or 1

    return [[threshold, trim_edges(graph, threshold)] for threshold in range(mn, mx, step)]

In each output row, the first value is the threshold (water level) that the trip total between two stations must exceed for their connection to be kept in the network. The second value represents the total number of connections that remain, and the list gives the size of each island.

I am selecting the water level that leaves 67 connections (a threshold of 15488), as it has the best balance of island number and island size.

In [204]:
# Walk the water levels from lowest to highest. For each one, report the
# threshold, the number of surviving edges and the size of every island.
for threshold, trimmed in island_method(g, 20):
    # `net.connected_component_subgraphs` was removed in networkx 2.4; building
    # the subgraphs from `connected_components` works on old and new releases.
    # The islands are computed once and reused for both the report and the pick.
    islands = [trimmed.subgraph(nodes).copy() for nodes in net.connected_components(trimmed)]
    n_edges = len(trimmed.edges())
    print(threshold, n_edges, [len(island) for island in islands])

    # The level that leaves exactly 67 edges is the one kept for further
    # analysis; remember its 22-station and 15-station islands.
    if n_edges == 67:
        for island in islands:
            if len(island) == 22:
                dense1 = island
            elif len(island) == 15:
                dense2 = island

2 156380 [840, 1]
2583 4462 [417, 3, 8, 3, 2, 1, 4, 2, 1, 6, 1, 2, 2]
5164 1192 [292, 12, 2, 7, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2]
7745 450 [164, 7, 5, 8, 2, 4, 3, 3, 2, 4, 3, 4, 1, 3, 4, 4, 1, 1, 1, 2, 1, 2]
10326 215 [11, 104, 3, 4, 7, 1, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2]
12907 111 [7, 61, 3, 3, 3, 6, 1, 2, 3, 2, 2]
15488 67 [3, 22, 15, 1, 5, 3, 3, 3, 3, 2, 1, 1, 1]
18069 34 [3, 2, 10, 4, 6, 1, 3, 2, 3, 2]
20650 19 [1, 2, 2, 3, 3, 5, 2, 1, 2]
23231 12 [1, 1, 1, 1, 2, 3, 4]
25812 8 [1, 1, 1, 1, 2, 2]
28393 6 [1, 1, 1, 1, 2]
30974 5 [1, 1, 1, 1, 2]
33555 4 [1, 1, 1, 1]
36136 4 [1, 1, 1, 1]
38717 4 [1, 1, 1, 1]
41298 2 [1, 1]
43879 2 [1, 1]
46460 2 [1, 1]
49041 2 [1, 1]
51622 1 [1]


The 22-node island and the 15-node island are the two largest groups of densely connected stations. I write them out for further analysis.

In [168]:
# Export the station records whose ids belong to the first dense island.
stations.loc[stations['station_id'].isin(dense1)].to_csv('output/core1.csv')

In [169]:
# Export the station records whose ids belong to the second dense island.
# The method call needs its leading dot; without it the continued line is a
# syntax error and the file is never written.
stations[stations['station_id'].isin(dense2)]\
    .to_csv('output/core2.csv')