In [1]:
from pathlib import Path
import json

import gtfstk as gt
import pandas as pd
import numpy as np
import shapely.geometry as sg


DATA_DIR = Path('../data')
OUT_DIR = Path('../output')

In [2]:
path = DATA_DIR/'wellington_gtfs_20171016.zip'
feed = gt.read_gtfs(path, dist_units='km')
feed.assess_quality()

Unnamed: 0,indicator,value
0,num_route_short_names_duplicated,0
1,frac_route_short_names_duplicated,0
2,num_stop_time_dists_missing,340007
3,frac_stop_time_dists_missing,1
4,num_direction_ids_missing,0
5,frac_direction_ids_missing,0
6,num_trips_missing_shapes,0
7,frac_trips_missing_shapes,0
8,num_departure_times_missing,0
9,frac_departure_times_missing,0


In [3]:
# shapes_g = feed.shapes_to_geojson()
# path = Path('../wellington_shapes_20171016.geojson')
# with path.open('w') as tgt:
#     json.dump(shapes_g, tgt)

In [4]:
"""
If no shapes, use stops only.
If shapes, then add distances to stop times and to shapes
"""
trip_stats = feed.compute_trip_stats(compute_dist_from_shapes=True)
feed = feed.append_dist_to_stop_times(trip_stats)


In [53]:
def refine(xs, n):
    """
    Given a strictly increasing NumPy array ``xs`` of at least two numbers
    x_1 < x_2 < ... < x_r and an integer ``n`` >= 0, 
    insert into the list ``n`` more numbers between x_1 and x_r
    in a spread-out way.
    Return the resulting list as a NumPy array.
    """
    while n > 0:
        diffs = np.diff(xs)

        # Get indices i, j of biggest diffs d_i > d_j.
        # Use the method at https://stackoverflow.com/a/23734295 for speed.
        try:
            indices = np.argpartition(diffs, -2)[-2:]
            i, j = indices[np.argsort(diffs[indices])[::-1]]
            d_i, d_j = diffs[i], diffs[j]
            
            # Choose k => 1 least such that d_i/(k + 1) < d_j
            # with the intent of inserting k evenly spaced points 
            # between x_i and x_{i+1}
            k = int(max(1, np.ceil(d_i/d_j - 1)))

            # Shrink k if necessary so as not to exceed number of remaining points
            k = min(k, n)
        except ValueError:
            # Here xs has only two elements, hence diffs has only one element.
            # Using try-except because faster than if-else.
            i = 0
            d_i = diffs[0]
            k = n
        
        # Insert the k points, updating xs
        xs = np.concatenate([
          xs[:i + 1], 
          [xs[i] + s*d_i/(k + 1) for s in range(1, k + 1)], 
          xs[i + 1:]
          ])
        
        # Update n
        n -= k
        
    return xs

In [58]:
# Test refine() some

xs = np.array([0, 3/4, 1])
assert np.array_equal(refine(xs, 0), xs)
assert np.array_equal(refine(xs, 1), np.array([0, 3/8, 3/4, 1]))
assert np.array_equal(refine(xs, 2), np.array([0, 1/4, 1/2, 3/4, 1]))
assert np.array_equal(refine(xs, 3), np.array([0, 1/4, 1/2, 3/4, 7/8, 1]))

In [68]:
def get_stop_patterns(feed, trip_ids=None, sep='-'):
    """
    Return the DataFrame ``feed.trips`` with the additional column
    
    - ``'stop_pattern'``: string; the stop IDs along the 
      trip joined by the separator ``sep``
      
    If a list of trip IDs is also given, then restrict the output
    to those trip IDs.
    """
    st = feed.stop_times.copy()
    if trip_ids is not None:
        # Filter to given trip IDs
        st = st[st['trip_id'].isin(trip_ids)].copy()
            
    def get_pattern(group):
        return group.stop_id.str.cat(sep=sep)
        
    f = st.groupby('trip_id').apply(get_pattern).reset_index().rename(
      columns={0: 'stop_pattern'})
    return feed.trips.merge(f)
    
def build_sample_points_by_trip(feed, trip_ids=None, max_sample_points=100):
    """
    Given a GTFS feed (GTFSTK Feed instance), 
    preferably with a ``feed.stop_times.shape_dist_traveled`` column, 
    return a dictionary of the form 
    
    trip ID -> list of (longitude, latitude) sample points along trip.
    
    Each sample point list comprises the k stop points of the trip
    along with ``max_sample_points - k`` additional points somewhat evenly 
    sampled from the trip shape, all in the order of the trip's travel.
    If k >= ``max_sample_points`` or if the trip has no shape ID or
    if the trip has fewer than two ``shape_dist_traveled`` values, then
    only include the k stops of the trip.
    """
    t = feed.trips
    
    if trip_ids is not None:
        # Filter trips to given trip IDs
        t = t[t['trip_id'].isin(trip_ids)].copy()
    
    # Append stop patterns to trips for later
    t = get_stop_patterns(feed, t.trip_id)
    
    # Get shape geometries
    geom_by_shape = feed.build_geometry_by_shape(shape_ids=t.shape_id) or {}
    
    # Get stops times for the given trips
    st = feed.stop_times
    st = st[st['trip_id'].isin(t.trip_id)].sort_values(
      ['trip_id', 'stop_sequence']).merge(t[['trip_id', 'stop_pattern']])

    # Append null dists for later
    if not 'shape_dist_traveled' in st:
        st['shape_dist_traveled'] = np.nan
    
    # Join in stop locations
    st = st.merge(t).merge(feed.stops)
    
    # Build dictionary (shape ID, stop pattern) -> list of (lon, lat) sample points,
    # instead of trip ID -> list of (lon, lat) sample points, because the former
    # avoids repeating computations.
    n = max_sample_points
    points_by_ssp = {}    
    for (shape_id, stop_pattern), group in st.groupby(['shape_id', 'stop_pattern']):
        if (shape_id, stop_pattern) in points_by_ssp:
            # Already computed
            continue
            
        k = group.shape[0]  # Number of stops along trip            
        if k >= n or shape_id not in geom_by_shape\
          or group.shape_dist_traveled.count() < 2:
            # Use stop points only
            points = group[['stop_lon', 'stop_lat']].values.tolist()
        else:
            # Start with stop points, and mark their distances for later sorting.
            # Scale distances to interval [0, 1] to avoid changing coordinate systems.
            group['shape_dist_traveled'] /= group['shape_dist_traveled'].max()
            stop_points = group[['stop_lon', 'stop_lat', 'shape_dist_traveled']].values
            dists = group['shape_dist_traveled'].values
            
            # Get n - k nicely spaced points from trip shape.
            new_dists = np.setdiff1d(refine(dists, n - k), dists)   
            geom = geom_by_shape[shape_id]
            shape_points = [
              list(geom.interpolate(d, normalized=True).coords[0]) + [d]
              for d in new_dists]
            
            # Combine with stop points and sort
            points = sorted(np.concatenate([stop_points, shape_points]).tolist(), 
              key=lambda x: x[2])
            
            # Remove distance markers
            points = [x[:2] for x in points]
            
        points_by_ssp[(shape_id, stop_pattern)] = points

    # Build final dict trip ID -> list of (lon, lat) sample points
    return {t: points_by_ssp[(s, sp)] 
      for t, s, sp in t[['trip_id', 'shape_id', 'stop_pattern']].values}


In [69]:
t = feed.trips.sample(frac=0.1)
%time tp1 = build_sample_points_by_trip(feed, t.trip_id, 1)
%time tp2 = build_sample_points_by_trip(feed, t.trip_id)


CPU times: user 524 ms, sys: 0 ns, total: 524 ms
Wall time: 520 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CPU times: user 11.9 s, sys: 140 ms, total: 12.1 s
Wall time: 12.1 s


In [9]:
tid = t.trip_id.iat[2]
print(tid)
l1 = sg.LineString(tp1[tid])
l2 = sg.LineString(tp2[tid])
l1

110__0__2149__VLYF__715__1__715__1


NameError: name 'tp2' is not defined

In [None]:
l2