# Creating GeoJSON for mapping

## Loading data

In [None]:
import requests
import pandas as pd
from time import time
import numpy as np
import json

### A single day's (05/24/2020) full data pulled from api

In [None]:
url = 'http://sfmta-ds.eba-hqpuyrup.us-east-1.elasticbeanstalk.com/daily-general-json'

In [None]:
json_data = requests.get(url, params={'day': '2020-05-24'}).json()

In [None]:
# making df

full_data = pd.DataFrame(data=json_data).sort_values('timestamp')

## GeoJSON Generation

In [None]:
def create_simple_geojson(df, route=None):
    """
    Function to generate a GeoJSON with basic info for plotting

    Args:
    df = dataframe of daily data from db or api
         requires timestamp, age, rid, vid, latitude, longitude columns
    
    route = optional, str
            id of route requested
            if unspecified returns all routes in different format
    """
    start = time()

    # filtering by route if necessary
    if route:
        df = df[df.rid.eq(route)].copy()

    # for readability
    times = df.timestamp.values
    ages = df.age.values

    # adjusting timestamps by age of report
    df['adjusted_timestamp'] = [pd.Timestamp(times[x]) - 
                                pd.Timedelta(seconds=ages[x]) 
                                for x in range(len(df.timestamp))]

    print(f'Prep work done => {time()-start} seconds')

    # generating only for queried route if specified
    if route:
        # list of dataframes by vehicle id
        # verifying indexed by correct timestamp
        divided_dfs = [df[df.vid.eq(x)].set_index('adjusted_timestamp') 
                      for x in df.vid.unique()]

        print(f'Separated by Vehicle ID => {time()-start} seconds')

        # creating geojson
        geojson = {'type': 'FeatureCollection',
                   'route': route, 
                   'vehicles': {str(x.vid[0]): create_geojson_features(x) 
                                for x in divided_dfs}}

        print(f'GeoJSON created => {time()-start} seconds')
        
    else:
        # list of dataframes by route id
        divided_dfs = [df[df.rid.eq(x)].set_index('adjusted_timestamp')
                       for x in df.rid.unique()]

        print(f'Separated by Route ID => {time()-start} seconds')

        # creating geojson
        geojson = {'type': 'FeatureCollection',
                   'routes': {x.rid[0]: create_geojson_features(x) 
                             for x in divided_dfs}}

        print(f'GeoJSON created => {time()-start} seconds')

    return geojson

In [None]:
def create_geojson_features(df):
    """
    function to generate list of geojson features
    for plotting vehicle locations on timestamped map

    Expects a dataframe containing lat/lon, vid, timestamp
    returns list of basic geojson formatted features:

    {
      type: Feature
      geometry: {
        type: Point,
        coordinates:[lat, lon]
      },
      properties: {
        route_id: rid
        vehicle_id: vid
        time: timestamp
      }
    }
    """
    # initializing empty features list
    features = []

    # iterating through df to pull coords, vid, timestamp
    # and format for json
    for index, row in df.iterrows():
      feature = {
          'type': 'Feature',
          'geometry': {
              'type':'Point', 
              'coordinates':[row.latitude, row.longitude]
          },
          'properties': {
              'route_id': row.rid.__str__(),
              'vehicle_id': row.vid.__str__(),
              'time': row.timestamp.__str__(),
          }
      }
      features.append(feature) # adding point to features list
    return features

In [72]:
# generating for the cali 1 line

test_json = create_simple_geojson(full_data, '1')

Prep work done => 0.4732964038848877 seconds
Separated by Vehicle ID => 0.518723726272583 seconds
GeoJSON created => 3.169445753097534 seconds


In [None]:
# saving json file

with open('05-24-cali1.txt', 'w') as outfile:
  json.dump(test_json, outfile)

## Updates

Now generating basic GeoJSON files for Web.

Followed simplest common formatting I could find; tested with plotly and folium,
should play beautifully with mapbox but haven't personally tested.

### GeoJSON structure - Route Specified:
```
{
  "type": "FeatureCollection",
  "route": "route_specified_here",
  "vehicles": {
    "8624": [
      {
        "type": "Feature",
        "geometry": {
          "type": "Point",
          "coordinates": [
            37.7099,
            -122.404
          ]
        },
        "properties": {
          "route_id": "90",
          "vehicle_id": "8624",
          "time": "2020-05-24 00:00:13"
        }
      },
      {
        "type": "Feature",
        "geometry": {
          "type": "Point",
          "coordinates": [
            37.7099,
            -122.404
          ]
        },
        "properties": {
          "route_id": "90",
          "vehicle_id": "8624",
          "time": "2020-05-24 00:01:13"
        }
      }
    "8810": [
      {
        "type": "Feature",
        etc. etc. etc.
```

### GeoJSON Structure - Route Unspecified:
```
{
  "type": "FeatureCollection",
  "routes": {
    "1": [
      {
        "type": "Feature",
        "geometry": {
          "type": "Point",
          "coordinates": [
            37.7906,
            -122.428
          ]
        },
        "properties": {
          "route_id": "1",
          "vehicle_id": "5805",
          "time": "2020-05-24 03:40:12"
        }
      },
      {
        "type": "Feature",
        "geometry": {
          "type": "Point",
          "coordinates": [
            37.7908,
            -122.426
          ]
        },
        "properties": {
          "route_id": "1",
          "vehicle_id": "5805",
          "time": "2020-05-24 03:41:13"
        }
      }
    ]
    "2": [
      {
        "type: "Feature",
        etc. etc. etc.
```

It seems to me the general use case is by route; mapping whole transit types could be way too cluttered to be useful. Nevertheless, the functionality is there if needed.

Generating the JSON for a single route adds a negligible amount of time to the time spent generating a hypothetical daily report. Highly dependent on number of vehicles to parse but between .3 seconds and 4 seconds min and max during testing. Generating for all routes takes ~40 seconds. Given that we're generating scheduled daily reports and aren't worried about generating and serving this data live except under very specific use cases, I don't foresee this being an issue either way. 

Probably some optimization to be done there but definitely not worth taking the time until we actually have MVP out.

One (maybe more pressing) concern is JSON size; not sure what is reasonable, but largest for a single route is ~3.5mb. I imagine there is optimization to be done in formatting to cut down on that size but this is outside of my wheelhouse.

Given that we're timestamping for slider instead of just plotting all at once my understanding is that we're locked into a certain amount of bloat to ensure relationships between points and timestamps.