In [68]:
import requests
import pandas as pd
from time import time, mktime
import datetime
import numpy as np
import json
from math import cos, sqrt
from operator import itemgetter

In [2]:
route_id = '1'
date = '2020-05-26'

stops_url = 'https://raw.githubusercontent.com/Lambda-School-Labs/sfmta-data-analysis-ds/master/deprecated_assets/datasets/route_info.csv'
schedules_url = 'http://sfmta-ds.eba-hqpuyrup.us-east-1.elasticbeanstalk.com/get-route-info' # needs to be updated with api url when live
daily_url = 'http://sfmta-ds.eba-hqpuyrup.us-east-1.elasticbeanstalk.com/daily-general-json'

stops = pd.read_csv(stops_url)
sched_json = requests.get(schedules_url, params={'route_id': route_id,
                                                 'day': date}).json()
daily_json = requests.get(daily_url, params={'day': date}).json()

# First Look At Calculating On-Time %

## Helper Functions

In [3]:
def fcc_projection(loc1, loc2):
    """
    function to apply FCC recommended formulae
    for calculating distances on earth projected to a plane
    
    significantly faster computationally, negligible loss in accuracy
    
    Args: 
    loc1 - a tuple of lat/lon
    loc2 - a tuple of lat/lon
    """
    lat1, lat2 = loc1[0], loc2[0]
    lon1, lon2 = loc1[1], loc2[1]
    
    mean_lat = (lat1+lat2)/2
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    
    k1 = 111.13209 - 0.56605*cos(2*mean_lat) + .0012*cos(4*mean_lat)
    k2 = 111.41513*cos(mean_lat) - 0.09455*cos(3*mean_lat) + 0.00012*cos(5*mean_lat)
    
    distance = sqrt((k1*delta_lat)**2 + (k2*delta_lon)**2)
    
    return distance

In [69]:
def assign_stops(df, stops):
    """
    applies basic wrangling function
    calculates nearest stop from reported location in km
    returns dataframe with reported location, 
    nearest stop (coords and name), and distance between

    tested with single buses on single routes on a single day;
    technically route/vehicle/time agnostic
    don't foresee any issues generalizing
    
    implements FCC projection formulae for calculating distance
    
    Args:
    df - dataframe of transit data, requires 'latitude', 'longitude' columns
    stops - datafram of stops data, requires 'lat', 'lon', 'title' columns
    """
    
    # TO-DO: error handling for missing routes from either df or stops
    # Currently handling by intersecting sets during function call
    
    start = time()
    
    # creating list of lat/lon dictionaries for stops and reported bus locations
    stop_lats = stops['lat'].values
    stop_lons = stops['lon'].values

    reported_lats = df['latitude'].values
    reported_lons = df['longitude'].values

    stop_points = [{'latitude': stop_lats[x], 'longitude': stop_lons[x]} 
                 for x in range(len(stops))]

    reported_points = [{'latitude': reported_lats[x], 
                      'longitude': reported_lons[x]} 
                     for x in range(len(df))]

    # to minimize possible overlap between probable stops
    # 500 ft as km
    # upper end of previous range for minimum distance between stops according to sfmta
    # this value seems good but could use more testing
    radius = .1524

    # dict to tuples to play nice with geopy
    stop_point_tuples = [tuple(stop_points[x].values()) 
                       for x in range(len(stop_points))]

    reported_point_tuples = [tuple(reported_points[x].values()) 
                           for x in range(len(reported_points))]

    df['reported_location'] = reported_point_tuples
    
    print('Prep Complete')
    
    # generating ((lat/lon), distance) tuples for nearest stop within range
    # using FCC ellipsoidal earth projection
    distances = [{x: fcc_projection(location, x) 
                 for x in stop_point_tuples} 
                 for location in reported_point_tuples]
    
    print(f'Distances Generated => {len(distances)}')
    
    # sorting for nearest stop
    distances_sorted = [{k: v for k, v in sorted(distances[x].items(), 
                                                 key=itemgetter(1))}
                       for x in range(len(distances))]
    
    print(f'Distances Sorted => {len(distances_sorted)}')
    
    # creating list of nearest stops
    # nearest stop if nearest stop within radius, else None
    point_stops = [next(iter(distances_sorted[x].items())) 
                   if next(iter(distances_sorted[x].items()))[1] <= radius 
                   else None
                   for x in range(len(distances_sorted))]
    
    print(f'Stops Created => {len(point_stops)}')
    
    # assigning stop name from stops table based on lat/lon from previous step
    stop_tuples = list(zip(stops.lat, stops.lon))
    stop_titles = [stops.title.iloc[stop_tuples.index(stop[0])] 
                   if stop != None
                   else None 
                   for stop in point_stops]
    
    print(f'Titles Created => {len(stop_titles)}')
    
    # pulling lat/lon and distance from tuples for df
    df['nearest_stop'] = [x[0] if x != None else None for x in point_stops]
    df['distance_in_km'] = [x[1] if x != None else None for x in point_stops]

    # pulling stop names from list for df
    df['title'] = stop_titles

    # dropping columns of redundant information
    df = df.drop(columns=['age', 'rid', 'latitude', 'longitude', 'timestamp',
                          'kph', 'heading'])
    end = time()
    
    print(f'DF Complete\nTime Elapsed: {end-start} seconds\n')
    
    return df

In [70]:
def adjust_timestamp(df):
    times = df.timestamp.values
    ages = df.age.values

    df['adjusted_timestamp'] = [pd.Timestamp(times[x]) - 
                                pd.Timedelta(seconds=ages[x]) 
                                for x in range(len(df.timestamp))]
    
    return df

## Function To Calculate On-Time %

In [201]:
def find_on_time(stops, daily_json, schedules_json):
    
    route = schedules_json['route']
    intervals = schedules_json['intervals']
    stop_list = schedules_json['stops']
    inbound, outbound = pd.read_json(sched_json['inbound']), pd.read_json(sched_json['outbound'])
    
    locations = pd.DataFrame(data=daily_json).dropna()
    locations = locations[locations.rid.eq(route)]
    
    locations['direction'] = ['Inbound' if '1____I' in x 
                              else 'Outbound' if '1____O' in x
                              else None
                              for x in locations.direction.values]
    
    stops = stops[stops.route_id.eq(route)]
    stops = stops[stops.tag.isin(stop_list)]
    stops = stops.rename(columns={'dir': 'direction'})
    
    locations = adjust_timestamp(locations)
    
    locations['unix_timestamp'] = locations['adjusted_timestamp'].apply(lambda d: 
                                                                        mktime(d.timetuple()))
    
    locations = assign_stops(locations, stops)
    
    locations = locations.merge(stops[stops.route_id.eq(route)], how='left', on=['direction', 'title'])
    locations = locations.drop(columns=['route_id', 'lat', 'lon', 'stopId']).dropna()
    
    scheduled_times = []
    serviced_times = []
    for x in inbound:
        if x in locations.tag.values:
            
            stop_times = pd.to_datetime(inbound[x].dropna())
            bus_times = locations[locations.direction.eq('Inbound')
                                  & locations.tag.eq(x)
                                  & locations.adjusted_timestamp.dt.time.between(
                                      min(stop_times.dt.time), 
                                      max(stop_times.dt.time))
                                 ][['adjusted_timestamp', 'unix_timestamp', 'reported_location',
                                    'distance_in_km', 'vid', 'tag']]
            
            scheduled_times.append(stop_times.reset_index(drop=True).dt.time)
            serviced_times.append(bus_times.reset_index(drop=True))
        
        else:
            pass
        
            
    return scheduled_times, serviced_times, locations

In [202]:
def bin_from_timestamp(timestamp, bins):
    diffs = [abs(timestamp - x) for x in bins]
    return bins[diffs.index(min(diffs))]

In [206]:
scheduled_times, serviced_times, locations = find_on_time(stops, daily_json, sched_json)

Prep Complete
Distances Generated => 11399
Distances Sorted => 11399
Stops Created => 11399
Titles Created => 11399
DF Complete
Time Elapsed: 0.974747896194458 seconds



# Results

Shoot, I don't know. 

Been trying to figure out an approach for this all day; best I've done is filter bus locations so we're only worried about busses that reported:
- while active on a route
- at or near a stop
- that exists in the selected route
- and exists in the current schedule for that route
- where the busses Inbound/Outbound indicator matched the scheduled stop's Inbound/Outbound status
- and the timestamp of the report is within the scheduled service time

So theoretically I've eliminated all (or most) of the data that won't actually be useful in strictly determining whether or not a stop is being serviced on-time. I just have no clue how to get from here to there.

Currently the funcion spits out:
- a list of datetime series of scheduled stop times by stop
- a list of datarames filtered as above, also filtered by stop
- the full dataframe for the specified date, filtered as above

Intuitively it feels like there must be some simple next step, given the data, to completion. But idk ¯\\_(ツ)_/¯

In [207]:
times1[0]

0     2020-06-02 05:00:00
1     2020-06-02 05:08:00
2     2020-06-02 05:16:00
3     2020-06-02 05:24:00
4     2020-06-02 05:32:00
              ...        
137   2020-06-02 22:16:00
138   2020-06-02 22:24:00
139   2020-06-02 22:32:00
140   2020-06-02 22:40:00
141   2020-06-02 22:48:00
Name: 4277, Length: 142, dtype: datetime64[ns]

In [208]:
[pd.Timestamp(times2[0].loc[x+1, 'adjusted_timestamp'])
 - pd.Timestamp(times2[0].loc[x,  'adjusted_timestamp']) 
 for x in range(0, len(times2[0])-1)]

[Timedelta('0 days 00:01:33'),
 Timedelta('0 days 00:00:13'),
 Timedelta('0 days 00:00:50'),
 Timedelta('0 days 00:01:57'),
 Timedelta('0 days 00:00:12'),
 Timedelta('0 days 01:30:40'),
 Timedelta('0 days 05:07:45'),
 Timedelta('0 days 00:00:51'),
 Timedelta('0 days 00:01:02'),
 Timedelta('0 days 00:01:15'),
 Timedelta('0 days 00:02:48'),
 Timedelta('0 days 00:00:54'),
 Timedelta('0 days 00:01:04'),
 Timedelta('0 days 00:01:03'),
 Timedelta('0 days 00:00:55'),
 Timedelta('0 days 00:01:02'),
 Timedelta('0 days 00:03:39'),
 Timedelta('0 days 00:01:45'),
 Timedelta('0 days 00:01:03'),
 Timedelta('0 days 00:00:50'),
 Timedelta('0 days 00:01:08'),
 Timedelta('0 days 00:01:54'),
 Timedelta('0 days 00:00:53'),
 Timedelta('0 days 00:01:02'),
 Timedelta('0 days 00:01:04'),
 Timedelta('0 days 00:00:12'),
 Timedelta('0 days 00:00:50'),
 Timedelta('0 days 00:00:00'),
 Timedelta('0 days 00:00:53'),
 Timedelta('0 days 00:00:12'),
 Timedelta('0 days 00:00:25'),
 Timedelta('0 days 00:01:03'),
 Timedel

In [220]:
times1[0]

0     2020-06-02 05:00:00
1     2020-06-02 05:08:00
2     2020-06-02 05:16:00
3     2020-06-02 05:24:00
4     2020-06-02 05:32:00
              ...        
137   2020-06-02 22:16:00
138   2020-06-02 22:24:00
139   2020-06-02 22:32:00
140   2020-06-02 22:40:00
141   2020-06-02 22:48:00
Name: 4277, Length: 142, dtype: datetime64[ns]

In [221]:
testdf = pd.DataFrame(times1[0])

In [224]:
testdf[4277].apply(lambda d:
                    mktime(pd.Timestamp(d).timetuple()))

0      1.591096e+09
1      1.591096e+09
2      1.591097e+09
3      1.591097e+09
4      1.591098e+09
           ...     
137    1.591158e+09
138    1.591158e+09
139    1.591159e+09
140    1.591159e+09
141    1.591160e+09
Name: 4277, Length: 142, dtype: float64

In [225]:
delta = datetime.timedelta(minutes=8)

time_bins = [np.arange(np.min(times2[x]['unix_timestamp']),
                     np.max(times2[x]['unix_timestamp'])
                     + delta.seconds, delta.seconds) 
                     for x in range(len(times2))]

binned_dfs = [times2[x].groupby(times2[x].unix_timestamp
                           .map(lambda t: 
                                datetime.datetime.fromtimestamp(
                                    bin_from_timestamp(t, time_bins[x]))))
                  for x in range(len(time_bins))]

binned_locs = [{str(x): df.get_group(str(x))
                    for x, y in df} for df in binned_dfs]

In [226]:
binned_locs[0]['2020-05-26 06:02:37']

Unnamed: 0,adjusted_timestamp,unix_timestamp,reported_location,distance_in_km,vid,tag
0,2020-05-26 06:02:37,1590495000.0,"(37.7798, -122.493)",0.013898,5831,4277.0
1,2020-05-26 06:04:10,1590495000.0,"(37.7798, -122.493)",0.013898,5831,4277.0
2,2020-05-26 06:04:23,1590495000.0,"(37.7798, -122.493)",0.013898,5831,4277.0
3,2020-05-26 06:05:13,1590495000.0,"(37.7798, -122.493)",0.013898,5831,4277.0


In [228]:
pd.Timestamp(inbound[4277][157])-pd.Timestamp(inbound[4277][156])

Timedelta('0 days 00:08:00')

In [227]:
locations

Unnamed: 0,vid,direction,adjusted_timestamp,unix_timestamp,reported_location,nearest_stop,distance_in_km,title,tag
6,5770,Inbound,2020-05-26 03:49:03,1.590487e+09,"(37.7924, -122.421)","(37.7923599, -122.42101000000001)",0.004571,Clay St & Polk St,4026.0
12,5770,Inbound,2020-05-26 03:52:07,1.590487e+09,"(37.7939, -122.409)","(37.7938299, -122.40959)",0.065844,Clay St & Powell St,4027.0
13,5873,Outbound,2020-05-26 03:52:07,1.590487e+09,"(37.7799, -122.492)","(37.779739899999996, -122.49311000000002)",0.124432,Geary Blvd & 33rd Ave,34277.0
14,5873,Inbound,2020-05-26 03:53:09,1.590487e+09,"(37.7798, -122.493)","(37.779739899999996, -122.49311000000002)",0.013898,Geary Blvd & 33rd Ave,4277.0
17,5873,Inbound,2020-05-26 03:54:09,1.590487e+09,"(37.7798, -122.493)","(37.779739899999996, -122.49311000000002)",0.013898,Geary Blvd & 33rd Ave,4277.0
...,...,...,...,...,...,...,...,...,...
11384,5877,Outbound,2020-05-26 21:49:25,1.590551e+09,"(37.7932, -122.408)","(37.7930399, -122.40912)",0.125388,Sacramento St & Powell St,6312.0
11389,5877,Outbound,2020-05-26 21:53:33,1.590552e+09,"(37.7917, -122.42)","(37.7915199, -122.42115)",0.129021,Sacramento St & Polk St,6311.0
11390,5877,Outbound,2020-05-26 21:54:23,1.590552e+09,"(37.7914, -122.422)","(37.7915199, -122.42115)",0.095150,Sacramento St & Polk St,6311.0
11395,5877,Outbound,2020-05-26 21:59:36,1.590552e+09,"(37.7899, -122.434)","(37.7898199, -122.43399)",0.008926,Sacramento St & Fillmore St,6295.0
