# Iterating On Stop Assignment

## Loading data

In [1]:
pip install folium



In [None]:
import requests
import pandas as pd
from operator import itemgetter
from math import sqrt, cos
from time import time, mktime
import folium
from folium.plugins import TimestampedGeoJson
import datetime
import numpy as np

In [None]:
# Muting pandas' chained assignment warning for the whole notebook.
# The slice-then-assign pattern that triggers it still needs refactoring,
# but (per the original note) it doesn't affect performance

pd.set_option('mode.chained_assignment', None)

### A single day's (05/24/2020) full data pulled from the API

In [None]:
# endpoint queried below for a single day's data (returned as JSON)
url = 'http://sfmta-ds.eba-hqpuyrup.us-east-1.elasticbeanstalk.com/daily-general-json'

In [None]:
# Fetch the records for 2020-05-24.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# JSON / DataFrame failure in a later cell.
response = requests.get(url, params={'day': '2020-05-24'}, timeout=60)
response.raise_for_status()
json_data = response.json()

In [None]:
# making df
# one DataFrame built from the JSON payload, ordered by its 'timestamp' column

full_data = pd.DataFrame(data=json_data).sort_values('timestamp')

In [None]:
# sub dfs for testing
# single route, all vehicles
# single route, single vehicle (the vehicle with the most rows on the route)
#
# .copy() gives nbus its own data: assign_stop() adds columns to the frame it
# receives, which would otherwise be a chained assignment on a slice of full_data.
# (sort_values already returns a new frame, so nbus_highest needs no extra copy.)

nbus = full_data[full_data['rid']=='NBUS'].copy()
busiest_vid = nbus['vid'].value_counts().index[0]
nbus_highest = nbus[nbus['vid']==busiest_vid].sort_values('timestamp')

In [None]:
# using stops gathered by Labs 22 for expediency
# all stops (read from the route_info.csv hosted on GitHub)
# all stops in one route (rows whose 'route_id' is 'NBUS')

stops = pd.read_csv('https://raw.githubusercontent.com/Lambda-School-Labs/sfmta-data-analysis-ds/master/deprecated_assets/datasets/route_info.csv')
nbus_stops = stops[stops['route_id']=='NBUS']

## Engineering Probable Nearest Stop and Distance (For Confidence)

### Original helper function for wrangling

In [None]:
def wrangle_bus(df):
    """
    preps a dataframe of vehicle check-ins IN PLACE and returns it

    - 'timestamp' is converted to pandas Timestamps
    - 'adjusted_timestamp' is added: timestamp minus the report's 'age' (seconds)
    - 'dwell' is added: running count of consecutive check-ins without motion
      (kph <= 0), i.e. dwell time naively taken as 1min per stationary check-in

    Rows are processed in dataframe order; the dwell streak is not reset
    between vehicles, so pass a single vehicle's rows for a meaningful count.
    A missing (NaN) kph is treated as "moving" and resets the streak.

    Historically the largest bottleneck for time-cost in df prep.

    Args:
    df - dataframe with 'timestamp', 'age' and 'kph' columns (modified in place)

    Returns:
    the same dataframe object, with the columns described above
    """

    # parse each timestamp once and reuse it for both columns
    parsed = [pd.Timestamp(raw) for raw in df['timestamp'].values]

    df['adjusted_timestamp'] = [stamp - pd.Timedelta(seconds=age)
                                for stamp, age in zip(parsed, df['age'].values)]

    df['timestamp'] = parsed

    dwell_totals = []
    stationary_streak = 0

    for speed in df['kph']:
        # plain if/else: NaN fails `<= 0` and lands in the reset branch, so
        # every row yields exactly one dwell value (no length mismatch)
        if speed <= 0:
            stationary_streak += 1
        else:
            stationary_streak = 0
        dwell_totals.append(stationary_streak)

    df['dwell'] = dwell_totals

    return df

### Function to calculate nearest stop within $X$ km by projected euclidean distance

In [None]:
def fcc_projection(loc1, loc2):
    """
    function to apply FCC recommended formulae
    for calculating distances on earth projected to a plane

    significantly faster computationally, negligible loss in accuracy

    Args:
    loc1 - a tuple of lat/lon, in decimal degrees
    loc2 - a tuple of lat/lon, in decimal degrees

    Returns:
    distance between the two points in kilometers
    """
    # local import so the block works without touching the notebook's import cell
    from math import radians

    lat1, lat2 = loc1[0], loc2[0]
    lon1, lon2 = loc1[1], loc2[1]

    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1

    # The FCC formulae take the mean latitude in degrees, but math.cos works
    # in radians, so convert once before evaluating the cosine terms.
    mean_lat = radians((lat1 + lat2) / 2)

    # k1 / k2: kilometers per degree of latitude / longitude at the mean latitude
    k1 = 111.13209 - 0.56605*cos(2*mean_lat) + .0012*cos(4*mean_lat)
    k2 = 111.41513*cos(mean_lat) - 0.09455*cos(3*mean_lat) + 0.00012*cos(5*mean_lat)

    distance = sqrt((k1*delta_lat)**2 + (k2*delta_lon)**2)

    return distance

In [None]:
def assign_stop(df, stops, radius=.1524):
    """
    applies basic wrangling function
    calculates nearest stop from reported location in km
    returns dataframe with reported location,
    nearest stop (coords and name), and distance between

    tested with single buses on single routes on a single day;
    technically route/vehicle/time agnostic

    implements FCC projection formulae for calculating distance

    Side effects: df is modified in place (wrangle_bus columns plus
    'reported_location', 'nearest_stop', 'distance_in_km', 'stops');
    the returned frame is a new one without 'age', 'latitude', 'longitude'.
    Progress and elapsed time are printed.

    Args:
    df - dataframe of transit data, requires 'latitude', 'longitude' columns
         (plus the 'timestamp', 'age', 'kph' columns wrangle_bus uses)
    stops - dataframe of stops data, requires 'lat', 'lon', 'title' columns
    radius - maximum distance in km for a stop to be assigned; default is
             500 ft as km, the upper end of the previous range for minimum
             distance between stops according to sfmta
             (this value seems good but could use more testing)

    Returns:
    dataframe with one row per row of df; 'nearest_stop', 'distance_in_km'
    and 'stops' are None where no stop lies within radius (including when
    stops is empty)
    """

    # TO-DO: error handling for missing routes from either df or stops
    # Currently handling by intersecting sets during function call

    start = time()

    # wrangle_bus works in place; its return value is the same df
    wrangle_bus(df)

    # (lat, lon) tuples for stops and reported bus locations
    stop_point_tuples = list(zip(stops['lat'].values, stops['lon'].values))
    reported_point_tuples = list(zip(df['latitude'].values,
                                     df['longitude'].values))

    df['reported_location'] = reported_point_tuples

    # title of the first stop listed at each distinct location; the dict's
    # insertion order doubles as the de-duplicated list of candidate stops
    title_by_point = {}
    for point, title in zip(stop_point_tuples, stops['title'].values):
        title_by_point.setdefault(point, title)
    candidate_stops = list(title_by_point)

    print('Prep Complete')

    # distance from every reported location to every candidate stop
    # using FCC ellipsoidal earth projection
    distances = [[fcc_projection(location, stop) for stop in candidate_stops]
                 for location in reported_point_tuples]

    print(f'Distances Generated => {len(distances)}')

    # nearest (stop, distance) pair per location; min() replaces a full sort
    # and keeps the earliest-listed stop on ties. default=None covers an
    # empty stops table. (Progress message kept as-is for output parity.)
    nearest_candidates = [min(zip(candidate_stops, row),
                              key=itemgetter(1), default=None)
                          for row in distances]

    print(f'Distances Sorted => {len(nearest_candidates)}')

    # nearest stop if nearest stop within radius, else None
    point_stops = [candidate
                   if candidate is not None and candidate[1] <= radius
                   else None
                   for candidate in nearest_candidates]

    print(f'Stops Created => {len(point_stops)}')

    # assigning stop name from stops table based on lat/lon from previous step
    stop_titles = [title_by_point[stop[0]]
                   if stop is not None
                   else None
                   for stop in point_stops]

    print(f'Titles Created => {len(stop_titles)}')

    # pulling lat/lon and distance from tuples for df
    df['nearest_stop'] = [stop[0] if stop is not None else None
                          for stop in point_stops]
    df['distance_in_km'] = [stop[1] if stop is not None else None
                            for stop in point_stops]

    # pulling stop names from list for df
    df['stops'] = stop_titles

    # dropping columns of redundant information
    df = df.drop(columns=['age', 'latitude', 'longitude'])
    end = time()

    print(f'DF Complete\nTime Elapsed: {end-start} seconds\n')

    return df

### Full Day - Single Route, Single Bus

In [72]:
# single vehicle: stop assignment for the busiest NBUS vehicle against the NBUS stops
nbus_highest_wrangled = assign_stop(nbus_highest, nbus_stops)

Prep Complete
Distances Generated => 979
Distances Sorted => 979
Stops Created => 979
Titles Created => 979
DF Complete
Time Elapsed: 0.3319363594055176 seconds



In [73]:
# whole route: stop assignment for every NBUS vehicle against the NBUS stops
nbus_wrangled = assign_stop(nbus, nbus_stops)

Prep Complete
Distances Generated => 17186
Distances Sorted => 17186
Stops Created => 17186
Titles Created => 17186
DF Complete
Time Elapsed: 5.510903358459473 seconds



## Binning Avg. Bus Locations by Time (5 min)

The functions below rely on data wrangled by the functions above;\
specifically, they require the `adjusted_timestamp` and `reported_location` columns

In [None]:
def time_wrangling(dataframe, delta=5):
    """
    bins each vehicle's reported locations into time bins and averages them

    Adds a 'unix_timestamp' column to dataframe in place, then for every
    vehicle id: builds bins of `delta` minutes spanning that vehicle's
    timestamps, assigns each row to its nearest bin (bin_from_timestamp) and
    averages the 'reported_location' tuples in each bin (avg_coords).
    Progress timings are printed along the way.

    Args:
    dataframe - wrangled transit data; requires 'vid', 'adjusted_timestamp'
                and 'reported_location' columns
    delta - bin width in minutes

    Returns:
    list of (vid, {bin time as string: averaged (lat, lon)}) tuples,
    one per vehicle, in order of first appearance in dataframe
    """
    start = time()
    # total_seconds() rather than .seconds, which wraps around for bins >= 1 day
    bin_seconds = datetime.timedelta(minutes=delta).total_seconds()

    dataframe['unix_timestamp'] = dataframe['adjusted_timestamp'].apply(
        lambda stamp: mktime(stamp.timetuple()))

    # list of dataframes by vehicle id
    vehicle_ids = dataframe.vid.unique()
    by_route = [dataframe[dataframe.vid.eq(vid)].set_index('adjusted_timestamp')
                for vid in vehicle_ids]
    print(f'Separated by route: {round(time()-start, 2)} seconds')

    # list of time_bins for each df
    time_bins = [np.arange(np.min(frame['unix_timestamp']),
                           np.max(frame['unix_timestamp']) + bin_seconds,
                           bin_seconds)
                 for frame in by_route]
    print(f'Time binned: {round(time()-start, 2)} seconds')

    # grouping entries in each df by time bins
    binned_dfs = [frame.groupby(frame.unix_timestamp.map(
                      lambda stamp, bins=bins: datetime.datetime.fromtimestamp(
                          bin_from_timestamp(stamp, bins))))
                  for frame, bins in zip(by_route, time_bins)]
    print(f'DFs binned: {round(time()-start, 2)} seconds')

    # list of dicts of timestamp, binned locations for each df
    # iterating a groupby already yields each group's rows, so there is no
    # need to look the group up again by key
    binned_locs = [{str(bin_time): group['reported_location']
                    for bin_time, group in grouped}
                   for grouped in binned_dfs]
    print(f'Locations binned: {round(time()-start, 2)} seconds')

    # averaging binned locations for each timestamp
    avg_locs = [{bin_time: avg_coords(locations[:])
                 for bin_time, locations in binned.items()}
                for binned in binned_locs]
    print(f'Complete: {round(time()-start, 2)} seconds')

    # returns list of vid, timestamps: locations for all vids
    # paired with the unique vids the frames were built from; zipping with
    # the row-level vid column would label vehicles with the wrong ids
    return list(zip(vehicle_ids, avg_locs))

In [513]:
# default 5-minute bins for every vehicle in nbus_wrangled
# NOTE: time_wrangling calls bin_from_timestamp and avg_coords, which are defined
# in the cells below - on a fresh kernel those cells must be run before this one
locs = time_wrangling(nbus_wrangled)

Separated by route: 0.46 seconds
Time binned: 0.47 seconds
DFs binned: 1.3 seconds
Locations binned: 4.59 seconds
Complete: 5.23 seconds


In [None]:
def bin_from_timestamp(timestamp, bins):
    """
    returns the entry of bins closest to timestamp
    (the earliest such entry when two are equally close)

    Args:
    timestamp - numeric timestamp to place
    bins - indexable, non-empty sequence of numeric bin values
    """
    gaps = [abs(timestamp - edge) for edge in bins]
    closest = min(range(len(gaps)), key=lambda position: gaps[position])
    return bins[closest]

In [None]:
def avg_coords(coords):
    """
    averages an iterable of equal-length coordinate tuples component-wise

    Args:
    coords - iterable of tuples, e.g. (lat, lon) pairs

    Returns:
    tuple holding the mean of each component (empty tuple for no coords)
    """
    components = zip(*coords)
    return tuple(sum(values) / float(len(values)) for values in components)

## Mapping!

### Creating GeoJson for playback of timestamped locations

In [None]:
def create_geojson_features(time_locs):
    """
    builds the GeoJSON point features for one vehicle

    Args:
    time_locs - (vehicle id, {time: (lat, lon)}) pair

    Returns:
    list with one 'Feature' dict per time entry; coordinates are written
    lon-first, and the vehicle id / time are stored as strings under
    'name' / 'time'
    """
    vehicle_name = str(time_locs[0])
    locations = time_locs[1]

    def as_feature(stamp):
        # one blue circle marker for the location recorded under this time
        lat_lon = locations[stamp]
        geometry = {'type': 'Point',
                    'coordinates': [lat_lon[1], lat_lon[0]]}
        icon_style = {'fillColor': 'blue',
                      'fillOpacity': 0.3,
                      'stroke': 'true',
                      'radius': 5}
        properties = {'name': vehicle_name,
                      'time': str(stamp),
                      'style': {'color': 'blue'},
                      'icon': 'circle',
                      'iconstyle': icon_style}
        return {'type': 'Feature',
                'geometry': geometry,
                'properties': properties}

    return [as_feature(stamp) for stamp in locations]

### Defining map and geojson params for playback

In [None]:
def make_map(features, period, duration):
    """
    builds a folium map with a time-slider playback layer

    Args:
    features - list of GeoJSON feature dicts
    period - playback step in minutes (sent as ISO-8601 'PT{period}M')
    duration - minutes each point stays visible (sent as 'PT{duration}M')

    Returns:
    the folium.Map with the TimestampedGeoJson layer added

    NOTE(review): newer folium releases may no longer bundle the
    'Stamen Toner' tileset - confirm against the installed version
    """
    print('> Making map...')

    # fixed center point and zoom for the initial view
    map_center = [37.77293534218353, -122.44596170151804]
    bus_map = folium.Map(location=map_center,
                         control_scale=True,
                         zoom_start=14,
                         tiles='Stamen Toner')

    feature_collection = {'type': 'FeatureCollection',
                          'features': features}
    playback_layer = TimestampedGeoJson(
        feature_collection,
        period=f'PT{period}M',
        add_last_point=True,
        auto_play=False,
        loop=False,
        max_speed=1,
        loop_button=True,
        date_options='HH:mm:ss',
        time_slider_drag_update=True,
        duration=f'PT{duration}M'
    )
    playback_layer.add_to(bus_map)

    print('> Done.')
    return bus_map

def plot_map(time_locs, period=5, duration=5):
    """
    turns per-vehicle time/location data into a playback map

    Args:
    time_locs - iterable of (vehicle id, {time: (lat, lon)}) pairs
    period - playback step in minutes, passed to make_map
    duration - minutes each point stays visible, passed to make_map

    Returns:
    the map produced by make_map
    """
    # features for every vehicle first, then flattened into a single list
    per_vehicle = [create_geojson_features(vehicle) for vehicle in time_locs]
    features = [feature for group in per_vehicle for feature in group]
    return make_map(features, period, duration)

### It's a map!

In [523]:
# 1-minute bins, 1-minute playback step, each point displayed for 10 minutes
plot_map(time_wrangling(nbus_wrangled, delta=1), period=1, duration=10)

Separated by route: 0.21 seconds
Time binned: 0.22 seconds
DFs binned: 3.69 seconds
Locations binned: 19.54 seconds
Complete: 22.51 seconds
> Making map...
> Done.


In [524]:
# saving map
# (re-runs the 1-minute binning, then writes the playback map to an HTML file)

nbus_full_map = plot_map(time_wrangling(nbus_wrangled, delta=1), 
                         period=1, duration=10)

nbus_full_map.save('nbus_full.html')

Separated by route: 0.34 seconds
Time binned: 0.35 seconds
DFs binned: 4.27 seconds
Locations binned: 19.85 seconds
Complete: 22.78 seconds
> Making map...
> Done.


## Updates

Refactored to handle all vehicles on a route; still not ready to plot multiple routes at once. Necessary? ¯\\_(ツ)_/¯

Updated how buses are displayed, as well as the map tileset, to (hopefully) declutter and improve visibility. Increased transparency, decreased duration of display to ten minutes, and decreased bin sizes and mapping time to 1 minute. Altogether this means that we're seeing every bus's location every minute, only averaging in the (rare) circumstance that we have a location for a bus more than once within a minute, and each bus's location is visible for ten minutes.

A fun side effect of this is that it serves as sort of an approximation of a heatmap; if the circles are darkening, buses are congregating. Neat!

Haven't investigated further but I believe there is functionality for hover text, meaning we could give indication of which bus is which if desired. More importantly, I think there may be functionality to link points based on distance, meaning we could easily recreate the visual representation of bunching from Isaac's mapping mockup.

Definitely needs refactoring for execution speed; ~22 seconds to generate data for this route with delta=1 which is effectively our 'worst case' as far as amount of data for a route, ~5 seconds for delta=5, but I'm sure there are improvements to be made. Mapping is nearly instantaneous.