In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [154]:
# Read the id columns of trips as text so they match stop_times, which already
# pins trip_id/stop_id to str. Without this the dtype of trips.trip_id is
# inferred from the data, and the merge on trip_id only works while the
# inference happens to produce strings.
trips = pd.read_csv(
    '../CSVs/trips.csv',
    low_memory=False,
    dtype={'trip_id': str, 'route_id': str},
)
trip_updates = pd.read_csv('../CSVs/trip_updates.csv', low_memory=False)
vehicle_pos = pd.read_csv('../CSVs/vehicle_positions.csv', low_memory=False)
stop_times = pd.read_csv('../CSVs/stop_times.csv', dtype={'trip_id': str, 'stop_id': str})
stop_times.head()

Unnamed: 0,trip_id,stop_id,stop_sequence,arrival_time,departure_time,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
0,Logan-22-Weekday-trip,Logan-Subway,1,08:00:00,08:00:00,,0,1,,1.0
1,Logan-22-Weekday-trip,Logan-RentalCarCenter,2,08:04:00,08:04:00,,0,0,,1.0
2,Logan-22-Weekday-trip,Logan-A,3,08:09:00,08:09:00,,0,0,,1.0
3,Logan-22-Weekday-trip,Logan-B,4,08:12:00,08:12:00,,0,0,,1.0
4,Logan-22-Weekday-trip,Logan-Subway,5,08:17:00,08:17:00,,1,0,,1.0


In [155]:
# Preview the raw trips table before it is trimmed to the columns we need.
trips.head()

Unnamed: 0,trip_id,route_id,service_id,direction_id,block_id,shape_id,trip_type,trip_headsign,trip_short_name,bikes_allowed,wheelchair_accessible
0,Logan-22-Weekday-trip,Logan-22,Logan-Weekday,0,,,,Loop,,0,1
1,Logan-22-Weekend-trip,Logan-22,Logan-Weekend,0,,,,Loop,,0,1
2,Logan-33-Weekday-trip,Logan-33,Logan-Weekday,0,,,,Loop,,0,1
3,Logan-33-Weekend-trip,Logan-33,Logan-Weekend,0,,,,Loop,,0,1
4,Logan-55-Weekday-trip,Logan-55,Logan-Weekday,0,,,,Loop,,0,1


In [156]:
# Keep only the columns that tie a trip to its route and headsign.
trips = trips[['route_id', 'trip_id', 'trip_headsign']]
trips.head()

Unnamed: 0,route_id,trip_id,trip_headsign
0,Logan-22,Logan-22-Weekday-trip,Loop
1,Logan-22,Logan-22-Weekend-trip,Loop
2,Logan-33,Logan-33-Weekday-trip,Loop
3,Logan-33,Logan-33-Weekend-trip,Loop
4,Logan-55,Logan-55-Weekday-trip,Loop


In [157]:
# Attach route_id and trip_headsign to every stop_times row. The inner join
# keeps only stop_times rows whose trip_id also appears in trips.
route_stops = pd.merge(stop_times, trips, how='inner', on='trip_id')
route_stops.head()
#route_stops['route_id'].unique()

Unnamed: 0,trip_id,stop_id,stop_sequence,arrival_time,departure_time,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,route_id,trip_headsign
0,Logan-22-Weekday-trip,Logan-Subway,1,08:00:00,08:00:00,,0,1,,1.0,Logan-22,Loop
1,Logan-22-Weekday-trip,Logan-RentalCarCenter,2,08:04:00,08:04:00,,0,0,,1.0,Logan-22,Loop
2,Logan-22-Weekday-trip,Logan-A,3,08:09:00,08:09:00,,0,0,,1.0,Logan-22,Loop
3,Logan-22-Weekday-trip,Logan-B,4,08:12:00,08:12:00,,0,0,,1.0,Logan-22,Loop
4,Logan-22-Weekday-trip,Logan-Subway,5,08:17:00,08:17:00,,1,0,,1.0,Logan-22,Loop


The cell below counts the rows of the merged stop_times table per route and shows the 15 routes with the fewest rows.

In [158]:
# value_counts() sorts from most to least frequent, so the tail holds the
# 15 routes with the fewest rows.
route_stops['route_id'].value_counts().tail(15)

351         360
8993        333
434         264
3738        231
Boat-F4     224
171         160
3233         64
9701         30
9702         22
Logan-55     22
Logan-66     22
9703         20
195          15
Logan-33     14
Logan-22     14
Name: route_id, dtype: int64

These counts show how many stop_times rows (one per scheduled stop of a trip) each route has; they are not counts of distinct trips.

In [159]:
# Only the trip, its stops (with their order) and the route are needed from here on.
route_stops = route_stops[['trip_id', 'stop_id', 'stop_sequence', 'route_id']]
route_stops.head()

Unnamed: 0,trip_id,stop_id,stop_sequence,route_id
0,Logan-22-Weekday-trip,Logan-Subway,1,Logan-22
1,Logan-22-Weekday-trip,Logan-RentalCarCenter,2,Logan-22
2,Logan-22-Weekday-trip,Logan-A,3,Logan-22
3,Logan-22-Weekday-trip,Logan-B,4,Logan-22
4,Logan-22-Weekday-trip,Logan-Subway,5,Logan-22


In [160]:
# The index of value_counts() has one entry per distinct trip_id.
total_trips = route_stops['trip_id'].value_counts().index
print('There are {} trips'.format(total_trips.size))

There are 72342 trips


In [161]:
# Show the distinct trip_ids (ordered by how many stop rows each trip has).
total_trips

Index(['35065445', '35284958', '35065449', '35065451', '35065459', '35065463',
       '35065461', '35065443', '35065455', '35065457',
       ...
       'Boat-F4-1315-Charlestown-Weekend', 'Boat-F1-1330-Rowes-SummerWeekday',
       'Boat-F1-1100-Hingham-SummerWeekday', 'Boat-F1-0845-Hingham-Weekday',
       'Boat-F4-1245-Charlestown-Weekend',
       'Boat-F1-2030-Hingham-SummerWeekday', 'CR-Weekday-Spring-17-289',
       'Boat-F1-1940-Hingham-Weekday', 'Boat-F1-1730-Long-Weekday',
       'Boat-F1-1705-Hingham-Weekday'],
      dtype='object', length=72342)

## Robust Way of Obtaining Patterns

The following cells display code that is used to obtain the unique trip patterns that exist under a particular target route. It is dubbed 'Robust', since it involves a thorough method that is sure to obtain all the patterns and label each trip with the appropriate pattern id. The steps are as follows:
<ol>
    <li>obtain a list of trips within a target route</li>
    <li>for every trip obtain a python tuple of stops</li>
    <li>add this tuple to a grand set of stops </li>
    <li>use this tuple as a key in a dictionary (only if it is not already)</li>
</ol>
Note that by using a set we take care of the case where different trips have the same tuple of stops: the tuple is stored only once.

In [162]:
# Let's focus on just a subset of route_stops:
# a single route instead of all routes.
# route_id values are strings, so the target is written as a string too.
targ_route = '448'
def get_targ_trips(targ_route, route_stops):
    """Return the rows of ``route_stops`` whose ``route_id`` equals ``targ_route``.

    The result is an independent copy with a fresh 0..n-1 index, so columns can
    be added to it without modifying ``route_stops``.
    """
    on_route = route_stops['route_id'] == targ_route
    subset = route_stops.loc[on_route].copy()
    return subset.reset_index(drop=True)
# targ_trips holds only the stop rows of the target route (re-indexed copy).
targ_trips = get_targ_trips(targ_route, route_stops)
targ_trips.head()

Unnamed: 0,trip_id,stop_id,stop_sequence,route_id
0,35058168,4807,1,448
1,35058168,4832,2,448
2,35058168,4834,3,448
3,35058168,14836,6,448
4,35058168,4835,4,448


In [163]:
# collect and display the distinct trip_ids that run on this route
route_trips = pd.unique(targ_trips['trip_id'])
route_trips

array(['35058168', '35058169', '35058204', '35058279', '35058280',
       '35070565', '35070566', '35070601', '35070676', '35070677'], dtype=object)

In [164]:


def _pattern_suffix(index):
    """Return the spreadsheet-style letter label for a 0-based index.

    0 -> 'A', 25 -> 'Z', 26 -> 'AA', 27 -> 'AB', ... so the labels never run out.
    """
    suffix = ''
    index += 1
    while index > 0:
        index, rem = divmod(index - 1, 26)
        suffix = chr(ord('A') + rem) + suffix
    return suffix


def get_trip_pts(route_trips, targ_trips, route, debug = False):
    """Find the distinct stop patterns among the trips of one route.

    Parameters
    ----------
    route_trips : iterable of trip_id values to examine.
    targ_trips : DataFrame with 'trip_id', 'stop_id' and 'stop_sequence' columns.
    route : value used as the prefix of every generated pattern name.
    debug : when True, print each trip_id followed by its tuple of stops.

    Returns
    -------
    (trip_patterns, trip_patterns_dict)
        trip_patterns is the set of distinct stop tuples (stop_ids ordered by
        stop_sequence); trip_patterns_dict maps each tuple to a pattern name
        of the form '<route><letters>', assigned in order of first appearance.
    """
    trip_patterns_dict = {}
    trip_patterns = set()

    for tr in route_trips:
        trip_rows = targ_trips[targ_trips['trip_id'] == tr]
        stops = tuple(trip_rows.sort_values('stop_sequence')['stop_id'])
        if debug:
            print('{}\n'.format(tr))
            print(stops)
        trip_patterns.add(stops)
        if stops not in trip_patterns_dict:
            # Name patterns by how many have been seen so far. The previous
            # fixed A..Y iterator raised StopIteration on the 26th pattern.
            suffix = _pattern_suffix(len(trip_patterns_dict))
            trip_patterns_dict[stops] = '{}{}'.format(route, suffix)

    return trip_patterns, trip_patterns_dict

# trp_pat is the set of distinct stop tuples on the target route;
# trp_pat_dict maps each of those tuples to its generated pattern name.
trp_pat, trp_pat_dict = get_trip_pts(route_trips, targ_trips, targ_route)
# Show only the generated pattern names.
trp_pat_dict.values()

dict_values(['448A', '448C', '448B'])

In [165]:
# make a new column for pattern id; it starts out empty and is
# filled in per trip by fill_trp_pat
targ_trips['pattern_id'] = ''

def fill_trp_pat(route_trips, targ_trips, trip_patterns_dict):
    """Label every row of ``targ_trips`` with the pattern name of its trip.

    For each trip_id in ``route_trips`` the trip's stop_ids are ordered by
    stop_sequence, the resulting tuple is looked up in ``trip_patterns_dict``
    and the name found is written to the 'pattern_id' column of all rows of
    that trip. ``targ_trips`` is modified in place; nothing is returned.
    A stop tuple missing from ``trip_patterns_dict`` raises KeyError.
    """
    for trip in route_trips:
        # boolean mask selecting the rows of this trip
        is_trip = targ_trips['trip_id'] == trip
        ordered_stops = targ_trips[is_trip].sort_values('stop_sequence')['stop_id']
        # the ordered stop tuple is the dictionary key of the pattern
        pattern_name = trip_patterns_dict[tuple(ordered_stops)]
        # write the appropriate pattern name to every row of the trip
        targ_trips.loc[is_trip, 'pattern_id'] = pattern_name
        
# Label the rows in place, then list the pattern names that were actually used.
fill_trp_pat(route_trips, targ_trips, trp_pat_dict)
targ_trips['pattern_id'].unique()

array(['448A', '448B', '448C'], dtype=object)

In [166]:
# Group the labelled rows by pattern and list the distinct trip_ids in each group.
gp = targ_trips.groupby('pattern_id')
gp['trip_id'].unique()

pattern_id
448A    [35058168, 35058169, 35070565, 35070566]
448B                        [35058204, 35070601]
448C    [35058279, 35058280, 35070676, 35070677]
Name: trip_id, dtype: object

Here we can see that we only used the pattern names that we generated as well as the trip_ids that belong to each pattern.

In [168]:
targ_routes = ['448', '434', '3738']

# Run the whole pattern pipeline once per route. The loop uses its own names
# (not route_trips / gp) so it does not rebind the route-448 objects that
# other cells of this notebook read.
for route in targ_routes:
    targ_trps = get_targ_trips(route, route_stops)
    trps_on_route = targ_trps['trip_id'].unique()
    pat_set, pat_dict = get_trip_pts(trps_on_route, targ_trps, route)
    fill_trp_pat(trps_on_route, targ_trps, pat_dict)
    pattern_groups = targ_trps.groupby('pattern_id')
    print(pattern_groups['trip_id'].unique())

pattern_id
448A    [35058168, 35058169, 35070565, 35070566]
448B                        [35058204, 35070601]
448C    [35058279, 35058280, 35070676, 35070677]
Name: trip_id, dtype: object
pattern_id
434A    [35058051, 35070448]
434B    [35058052, 35070449]
Name: trip_id, dtype: object
pattern_id
3738A    [35144705, 35144734, 35144756]
3738B    [35144716, 35144727, 35144747]
Name: trip_id, dtype: object


## Obtain Patterns with Headsign

The following cells attempt to reach the same results as the Robust method. Unfortunately it looks like we cannot rely on the headsign to give us an accurate distinction between different patterns.

In [126]:
# Take the 448A trip_ids from the labelled route-448 frame rather than from
# `gp`: the multi-route loop rebinds `gp` to another route's groups, so
# gp[...]['448A'] fails when the notebook is run top to bottom.
pat_448A = targ_trips.loc[targ_trips['pattern_id'] == '448A', 'trip_id'].unique()
trips[trips['trip_id'].isin(pat_448A)]

Unnamed: 0,route_id,trip_id,trip_headsign
28492,448,35058168,Downtown Express via Airport and Paradise Rd
28493,448,35058169,Downtown Express via Airport and Paradise Rd
30704,448,35070565,Downtown Express via Airport and Paradise Rd
30705,448,35070566,Downtown Express via Airport and Paradise Rd


It looks like all trips from 448A have the same headsign, which is good!

In [128]:
# headsigns of the trips that were labelled 448B
trips[trips['trip_id'].isin(['35058204', '35070601'])]

Unnamed: 0,route_id,trip_id,trip_headsign
28528,448,35058204,Marblehead via Paradise Rd & Central Sq (Express)
30740,448,35070601,Marblehead via Paradise Rd & Central Sq (Express)


All the trips in 448B also share the same headsign and differ from the headsign seen in 448A

In [140]:
# headsigns of the trips that were labelled 448C
trips[trips['trip_id'].isin(['35058279', '35058280', '35070676', '35070677'])]

Unnamed: 0,route_id,trip_id,trip_headsign
28603,448,35058279,Marblehead via Paradise Rd & Central Sq (Express)
28604,448,35058280,Marblehead via Paradise Rd & Central Sq (Express)
30815,448,35070676,Marblehead via Paradise Rd & Central Sq (Express)
30816,448,35070677,Marblehead via Paradise Rd & Central Sq (Express)


It looks like all the trips from 448C share the same headsign as those from 448B. Perhaps the Robust method does not yield the correct list of patterns?

In [134]:
# Invert the mapping so each pattern name points at its tuple of stops,
# then list the stops that 448B serves but 448C does not.
pat_name_dict = dict(zip(trp_pat_dict.values(), trp_pat_dict.keys()))
set(pat_name_dict['448B']).difference(pat_name_dict['448C'])

{'24717',
 '4709',
 '4717',
 '4727',
 '4728',
 '4730',
 '4731',
 '4732',
 '4735',
 '4736',
 '4738',
 '4739',
 '4741',
 '4742',
 '4743',
 '4744',
 '4745',
 '4746',
 '4747',
 '4854',
 '48541',
 '4855',
 '4856'}

In [136]:
# Spot-check one stop from the difference above: it must be absent from 448C.
'4709' in pat_name_dict['448C']

False

It turns out that there is a difference between the stops that 448B and 448C visit! Therefore we cannot rely on the headsign as an alternative way of determining patterns.