# Investigating Stop Assignment

## Loading data

In [None]:
import geopy
import geopy.distance
import numpy as np
import pandas as pd
import requests

### A single day's (05/24/2020) full data pulled from the API

In [None]:
# endpoint queried below with a `day` parameter to fetch one day's reports
url = 'http://sfmta-ds.eba-hqpuyrup.us-east-1.elasticbeanstalk.com/daily-general-json'

In [None]:
# explicit (connect, read) timeout so a stalled server can't hang the notebook,
# and raise on HTTP errors instead of trying to parse an error page as data
response = requests.get(url, params={'day': '2020-05-24'}, timeout=(10, 300))
response.raise_for_status()

# NOTE: `json` shadows the stdlib module name; kept because the next cell reads it
json = response.json()

In [None]:
# build a dataframe from the parsed API response

df = pd.DataFrame(json)

In [None]:
# Narrow the day to the NBUS route and keep its two most-reporting
# vehicles (index[0] = most reports, index[1] = second-most),
# each sorted chronologically as a sanity check.

nbus = df.loc[df['rid'] == 'NBUS']
nbus_highest = (
    nbus.loc[nbus['vid'] == nbus['vid'].value_counts().index[0]]
    .sort_values(by='timestamp')
)
nbus_second = (
    nbus.loc[nbus['vid'] == nbus['vid'].value_counts().index[1]]
    .sort_values(by='timestamp')
)

In [None]:
# using stops gathered by Labs 22 for expediency
# (route_info.csv is read straight from that project's repo,
# where it lives under deprecated_assets/datasets)

stops = pd.read_csv('https://raw.githubusercontent.com/Lambda-School-Labs/sfmta-data-analysis-ds/master/deprecated_assets/datasets/route_info.csv')

In [None]:
# keep only the stops that belong to the NBUS route
nbus_stops = stops.loc[stops['route_id'] == 'NBUS']

In [None]:
# preview the NBUS stops table
nbus_stops

Unnamed: 0,route_id,lat,lon,stopId,tag,title,dir
652,NBUS,37.77658,-122.39549,16695,6695,Townsend St & 4th St,Outbound
653,NBUS,37.77960,-122.38955,15235,5235,King St & 2nd St,Outbound
654,NBUS,37.78455,-122.38795,17447,7447,The Embarcadero & Brannan St,Outbound
655,NBUS,37.79108,-122.39010,14508,4508,The Embarcadero & Folsom St,Outbound
656,NBUS,37.79347,-122.39618,15669,5669,Market St & Drumm St,Outbound
...,...,...,...,...,...,...,...
716,NBUS,37.79257,-122.39702,15658,5658,Market St & Beale St,Inbound
717,NBUS,37.79056,-122.38990,14511,4511,The Embarcadero & Folsom St,Inbound
718,NBUS,37.78360,-122.38832,14531,4531,The Embarcadero & Townsend St,Inbound
719,NBUS,37.77980,-122.38994,15236,5236,King St & 2nd St,Inbound


## Engineering Probable Nearest Stop and Distance (For Confidence)

### Original helper function for wrangling

In [None]:
def wrangle_bus(df):
  """
  Prepare the location reports of a single bus for analysis.

  Modifies `df` in place (and returns it for convenience):

  - `timestamp` is converted to `pd.Timestamp` values
  - `adjusted_timestamp` is added: the reported timestamp minus the
    report's `age` in seconds
  - `dwell` is added: a naive dwell time that counts consecutive
    check-ins without motion (kph <= 0), i.e. roughly 1min per
    check-in; it resets to 0 on any other reported speed

  Parameters
  ----------
  df : pandas.DataFrame
    Reports with `timestamp`, `age` and `kph` columns. The dwell
    counter follows row order, so rows are presumably expected to be
    sorted by time already.

  Returns
  -------
  pandas.DataFrame
    The same dataframe object that was passed in.
  """
  reported = [pd.Timestamp(stamp) for stamp in df['timestamp'].values]
  df['adjusted_timestamp'] = [
      stamp - pd.Timedelta(seconds=age)
      for stamp, age in zip(reported, df['age'].values)
  ]
  df['timestamp'] = reported

  dwell_totals = []
  dwell_count = 0
  for speed in df['kph']:
    # NaN compares False against everything, so a plain else (rather
    # than `elif speed > 0`) is needed to give every row an entry;
    # a missing speed is treated as "not dwelling" and ends the streak
    if speed <= 0:
      dwell_count += 1
    else:
      dwell_count = 0
    dwell_totals.append(dwell_count)
  df['dwell'] = dwell_totals

  return df

### Function to calculate nearest stop within $X$ km by geodesic distance

This is definitely not the most efficient code; I haven't done any refactoring yet,\
so it's very slow and almost certainly has extra steps.\
There are definitely much faster, more efficient implementations of what I'm doing here.

However, based off of the eyeball test it seems to be doing a pretty good job of assigning stops\
appropriately. Dwell times are lining up closely with time at stops, and the progression of stops\
appears to be accurate in comparison to the route.

I'm using geopy (plays nice with lat/lon) to calculate the geodesic distance between reported\
locations and stops. Essentially straight-line (Euclidean) distance, but generalized to a curved surface.

The calculated distance is used to decide whether to assign a stop at a reported location or not;\
it also serves as a measure of confidence in the assigned stop.\
The smaller the distance, the more certain we are the bus is at or very near that stop.

Code below (or similar) should generalize - time/route/vehicle agnostic - but, as is, larger\
amounts of data will mean very long execution times. For the purpose of scheduled\
generation of daily reports this probably doesn't matter all that much; a user won't have to sit and\
wait while the report generates from scratch. Still, it leaves a lot to be desired. :coolcry:


In [None]:
def _nearest_stop_within(location, stop_coords, radius):
  """
  Return `(stop_coord, distance)` for the stop closest to `location`,
  or None when there are no stops or the closest one is farther than
  `radius` km away. `distance` is the geopy distance object.
  """
  if not stop_coords:
    return None

  candidates = ((stop, geopy.distance.distance(location, stop))
                for stop in stop_coords)

  # min() keeps the first of equally distant stops, i.e. stops-table order
  nearest = min(candidates, key=lambda pair: pair[1].km)

  return nearest if nearest[1].km <= radius else None


def assign_stop(df, stops, radius=.09):
  """
  applies basic wrangling function
  calculates nearest stop from reported location in km
  returns dataframe with reported location,
  nearest stop (coords and name), and distance between

  The input dataframe is left untouched; all work happens on a copy.

  Parameters
  ----------
  df : pandas.DataFrame
    Reports for a bus; needs the columns used by `wrangle_bus` plus
    `latitude`, `longitude`, `rid` and `vid`.
  stops : pandas.DataFrame
    Stops table with `lat`, `lon` and `title` columns.
  radius : float, optional
    Maximum distance in km (to play nice with geopy) for a stop to be
    assigned. The default is just less than the smallest distance
    between stops as reported by sfmta, to minimize possible overlap
    between probable stops.

  Returns
  -------
  pandas.DataFrame
    Wrangled copy of `df` with `reported_location`, `nearest_stop`,
    `distance` and `stops` columns added (None where no stop is within
    `radius`) and the `age`, `rid`, `vid`, `latitude` and `longitude`
    columns dropped.

  tested with single buses on single routes on a single day;
  technically route/vehicle/time agnostic but not tested beyond that
  """

  # original wrangling function for continuity; it works in place, so
  # hand it a copy to keep the caller's dataframe unchanged
  df = wrangle_bus(df.copy())

  # (lat, lon) tuples to play nice with geopy
  stop_coords = list(zip(stops['lat'].values, stops['lon'].values))
  reported_coords = list(zip(df['latitude'].values, df['longitude'].values))

  # coordinate -> stop name, built once; when several rows of the stops
  # table share coordinates the first row wins
  titles_by_coord = {}
  for coord, title in zip(stop_coords, stops['title'].values):
    titles_by_coord.setdefault(coord, title)
  unique_stop_coords = list(titles_by_coord)

  # ((lat, lon), distance) for the nearest stop within range, else None;
  # a dwelling bus reports the same location repeatedly, so each distinct
  # location is only measured against the stops once
  nearest_by_location = {}
  point_stops = []
  for location in reported_coords:
    if location not in nearest_by_location:
      nearest_by_location[location] = _nearest_stop_within(
          location, unique_stop_coords, radius)
    point_stops.append(nearest_by_location[location])

  df['reported_location'] = reported_coords

  # pulling lat/lon and distance from tuples for df
  df['nearest_stop'] = [match[0] if match is not None else None
                        for match in point_stops]
  df['distance'] = [match[1] if match is not None else None
                    for match in point_stops]

  # stop name for each assigned stop; no assigned stop, no name
  df['stops'] = [titles_by_coord.get(match[0]) if match is not None else None
                 for match in point_stops]

  # dropping columns of redundant information
  return df.drop(columns=['age', 'rid', 'vid', 'latitude', 'longitude'])

## Results

In [None]:
%%timeit

nbus_highest_wrangled = assign_stop(nbus_highest, nbus_stops)

1 loop, best of 3: 14.1 s per loop


In [None]:
# first 20 reports of the most-reporting NBUS vehicle,
# limited to the stop-assignment columns
nbus_highest_wrangled.loc[:, ['adjusted_timestamp', 'kph', 'direction',
                              'dwell', 'reported_location', 'stops',
                              'nearest_stop', 'distance']].head(n=20)

Unnamed: 0,adjusted_timestamp,kph,direction,dwell,reported_location,stops,nearest_stop,distance
9458,2020-05-24 04:08:55,39,,0,"(37.7595, -122.508)",,,
9516,2020-05-24 04:09:55,8,,0,"(37.7601, -122.509)",Judah & La Playa St,"(37.7603599, -122.50900990000001)",0.028859994044171066 km
9573,2020-05-24 04:10:43,0,,1,"(37.7602, -122.509)",Judah & La Playa St,"(37.7603599, -122.50900990000001)",0.017769039167965333 km
9630,2020-05-24 04:11:43,0,,2,"(37.7602, -122.509)",Judah & La Playa St,"(37.7603599, -122.50900990000001)",0.017769039167965333 km
9687,2020-05-24 04:12:42,0,,3,"(37.7602, -122.509)",Judah & La Playa St,"(37.7603599, -122.50900990000001)",0.017769039167965333 km
9746,2020-05-24 04:13:41,0,,4,"(37.7602, -122.509)",Judah & La Playa St,"(37.7603599, -122.50900990000001)",0.017769039167965333 km
9809,2020-05-24 04:14:41,0,,5,"(37.7602, -122.509)",Judah & La Playa St,"(37.7603599, -122.50900990000001)",0.017769039167965333 km
9872,2020-05-24 04:15:41,0,,6,"(37.7602, -122.509)",Judah & La Playa St,"(37.7603599, -122.50900990000001)",0.017769039167965333 km
9935,2020-05-24 04:16:41,13,NBUS_I_F00,0,"(37.7604, -122.506)",Judah St & 46th Ave,"(37.7603899, -122.50606)",0.005404586774553971 km
10002,2020-05-24 04:17:40,32,NBUS_I_F00,0,"(37.7607, -122.5)",Judah St & 40th Ave,"(37.760740000000006, -122.49935990000002)",0.056578201591308795 km


In [None]:
# last 20 reports of the most-reporting NBUS vehicle,
# limited to the stop-assignment columns
nbus_highest_wrangled.loc[:, ['adjusted_timestamp', 'kph', 'direction',
                              'dwell', 'reported_location', 'stops',
                              'nearest_stop', 'distance']].tail(n=20)

Unnamed: 0,adjusted_timestamp,kph,direction,dwell,reported_location,stops,nearest_stop,distance
206296,2020-05-24 21:13:54,0,NBUS_I_F00,1,"(37.7785, -122.415)",Market St & Hyde St,"(37.779109999999996, -122.41437990000001)",0.08699561130631518 km
206385,2020-05-24 21:15:08,0,NBUS_I_F00,2,"(37.7803, -122.413)",Market St & 7th St,"(37.7803599, -122.41261000000002)",0.034993953652685976 km
206470,2020-05-24 21:15:56,0,NBUS_I_F00,3,"(37.7803, -122.413)",Market St & 7th St,"(37.7803599, -122.41261000000002)",0.034993953652685976 km
206551,2020-05-24 21:16:57,14,NBUS_I_F00,0,"(37.7816, -122.411)",,,
206632,2020-05-24 21:17:59,8,NBUS_I_F00,0,"(37.7835, -122.409)",,,
206711,2020-05-24 21:19:00,13,NBUS_I_F00,0,"(37.7843, -122.408)",Market St & 5th St North,"(37.784079999999996, -122.4079899)",0.02443453040954525 km
206790,2020-05-24 21:20:01,32,NBUS_I_F00,0,"(37.786, -122.406)",,,
206868,2020-05-24 21:21:02,0,NBUS_I_F00,1,"(37.7873, -122.404)",Market St & 3rd St,"(37.78753, -122.4035199)",0.04939756729417067 km
206947,2020-05-24 21:21:55,26,NBUS_I_F00,0,"(37.7905, -122.4)",,,
207024,2020-05-24 21:22:56,0,NBUS_I_F00,1,"(37.7921, -122.398)",,,


In [None]:
# same treatment for the NBUS vehicle with the second-most reports
nbus_second_wrangled = assign_stop(nbus_second, nbus_stops)

nbus_second_wrangled.loc[:, ['adjusted_timestamp', 'kph', 'direction',
                             'dwell', 'reported_location', 'stops',
                             'nearest_stop', 'distance']].head(n=5)

Unnamed: 0,adjusted_timestamp,kph,direction,dwell,reported_location,stops,nearest_stop,distance
10477,2020-05-24 04:25:05,16,NBUS_O_F00,0,"(37.7838, -122.408)",Market St & 5th St North,"(37.784079999999996, -122.4079899)",0.031090602503148934 km
10560,2020-05-24 04:26:05,0,NBUS_O_F00,1,"(37.7825, -122.41)",,,
10648,2020-05-24 04:27:05,0,NBUS_O_F00,2,"(37.7808, -122.412)",Market St & 7th St,"(37.7803599, -122.41261000000002)",0.0726207272423116 km
10734,2020-05-24 04:28:06,11,NBUS_O_F00,0,"(37.7799, -122.413)",Market St & 7th St,"(37.7803599, -122.41261000000002)",0.06153056958246852 km
10820,2020-05-24 04:29:05,0,NBUS_O_F00,1,"(37.779, -122.414)",Market St & Hyde St,"(37.779109999999996, -122.41437990000001)",0.03562486777096445 km
