# GTFS Stops

## Create a geopandas geodataframe from a GTFS feed

Here is a url for a GTFS data feed. Let's turn it into a flexible geodataframe!

In [None]:
# URL of the zipped GTFS feed that the notebook downloads and parses below
url = 'http://web.mta.info/developers/data/nyct/subway/google_transit.zip'

We first have to import some modules from the [Python standard library](https://docs.python.org/3/library/)

We also have to import some third party modules.

Add the *conda-forge* channel to your base channel by running:

`conda config --add channels conda-forge`

You can then create an environment with these dependencies by running: 

`conda create --name geo_env --file package-list.txt`

In [None]:
import requests
from zipfile import ZipFile
from io import StringIO, BytesIO

import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import contextily as ctx
from shapely import geometry

Define a function for converting gtfs zipfiles into pandas dataframes. The dataframes are stored in a python dictionary.

In [None]:
def gtfsZipToDataframes(zip):
    """Read the tables of a GTFS feed archive into pandas dataframes.

    Parameters
    ----------
    zip : an opened zip archive (the code uses its ``filelist`` attribute and
        ``open`` method, as provided by ``zipfile.ZipFile``). Note that the
        parameter name shadows the ``zip`` builtin inside this function.

    Returns
    -------
    dict
        One dataframe per recognised file, keyed by the file name with the
        ``.txt`` part removed, in the order the files appear in the archive.
        Files whose names are not in the lists below are reported with
        ``print`` and skipped.

    Raises
    ------
    Exception
        If one or more of the required files is not in the archive.
    """
    required_files = ['agency.txt', 'stops.txt', 'routes.txt', 'trips.txt', 'stop_times.txt']
    optional_files = [
        'shapes.txt', 'stop_times.txt', 'calendar.txt', 'calendar_dates.txt',
        'fare_attributes.txt', 'fare_rules.txt', 'frequencies.txt', 'transfers.txt',
        'pathways.txt', 'levels.txt', 'translations.txt', 'feed_info.txt', 'attributions.txt'
    ]

    archive_names = [entry.filename for entry in zip.filelist]

    # bail out early when the feed is incomplete
    missing_files = [name for name in required_files if name not in archive_names]
    if missing_files:
        raise Exception('GTFS feed does not have the required file(s): %s' % ' '.join(missing_files))

    recognised_names = set(required_files) | set(optional_files)
    dataframes = {}

    for member_name in archive_names:
        # alert the user about files that are not in the lists above
        if member_name not in recognised_names:
            print('%s is not part of the gtfs specification!' % member_name)
            continue

        # decode the member as utf-8 text and let pandas parse it as csv
        with zip.open(member_name) as member:
            text = member.read().decode('utf-8')
        table_name = member_name.split('.txt')[0]
        dataframes[table_name] = pd.read_csv(StringIO(text), low_memory=False)

    return dataframes

Run the function and list the resulting dataframes. The number of dataframes will vary between different gtfs sources.

*You can also work with a local copy of the data for improved performance*

In [None]:
# To work from a local copy instead of downloading, use:
# gtfs_zip = ZipFile('data/nyc_subways.zip')

# download the feed; the timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=60)
# stop on an HTTP error here, instead of failing below with a confusing "not a zip file" error
r.raise_for_status()

# open the downloaded bytes as an in-memory zip archive
# (named gtfs_zip so the notebook does not overwrite the zip builtin)
gtfs_zip = ZipFile(BytesIO(r.content))
gtfs_dataframes = gtfsZipToDataframes(gtfs_zip)

# list the names of the tables that were loaded
list(gtfs_dataframes)

Let's take a look at the agency dataframe

In [None]:
# pull the agency table out of the dictionary and preview its first rows
agency = gtfs_dataframes['agency']
agency.head()

Here's the routes dataframe

In [None]:
# pull the routes table out of the dictionary and preview its first rows
routes = gtfs_dataframes['routes']
routes.head()

We can join the agency and routes dataframes on the agency_id column

In [None]:
# index the routes by agency_id so they can be matched against the agency table
routes_by_agency = routes.set_index('agency_id')

# attach the matching routes to each agency row
agency_routes = agency.join(routes_by_agency, on='agency_id')
agency_routes.head()

Here's a look at the trips dataframe

In [None]:
# pull the trips table out of the dictionary and preview its first rows
trips = gtfs_dataframes['trips']
trips.head()

We can join the trips and routes dataframes on the route_id

In [None]:
# index the trips by route_id so they can be matched against the routes
trips_by_route = trips.set_index('route_id')

# attach the matching trips to each agency/route row
routes_trips = agency_routes.join(trips_by_route, on='route_id')
routes_trips.head()

This is the stop_times dataframe. It links the stops to the trips.

In [None]:
# pull the stop_times table out of the dictionary and preview its first rows
stop_times = gtfs_dataframes['stop_times']
stop_times.head()

We can join the stop times to the trips on the trip_id column. Let's also drop a bunch of columns that we no longer need.

In [None]:
# attach the matching stop times to each agency/route/trip row
trip_stop_times = routes_trips.join(
    stop_times.set_index('trip_id'),
    on='trip_id'
)

# drop the trip-level columns that are not needed from here on;
# errors='ignore' tolerates feeds in which some of these columns are absent
trip_stop_times = trip_stop_times.drop(
    columns=[
        'service_id',
        'trip_id',
        'trip_headsign',
        'direction_id',
        'block_id',
        'shape_dist_traveled',
    ],
    errors='ignore'
)
trip_stop_times.head()

Here's the stops dataframe. This contains the spatial data describing the location of each stop.

In [None]:
# pull the stops table out of the dictionary and preview its first rows
stops = gtfs_dataframes['stops']
stops.head()

We can join the stops and the stop times on the stop_id column

In [None]:
# index the joined trip/stop-time rows by stop_id so they can be matched against the stops
stop_times_by_stop = trip_stop_times.set_index('stop_id')

# attach the matching trip/stop-time rows to each stop
trip_stops = stops.join(stop_times_by_stop, on='stop_id')
trip_stops.head()

Let's list the unique route ids

In [None]:
# distinct route ids, in order of first appearance in the routes table
list(routes['route_id'].unique())

Now for the fun part... We're going to transform this data into a geodataframe. Additional notes describing each step are provided in the code comments.

In [None]:
# create a list to store the stops for each route
route_list = []

# for each unique route_id
for route_id in routes.route_id.unique():

    # get the stops that belong to this route
    route_stops = trip_stops.loc[trip_stops.route_id == route_id]

    # skip routes that have no stops
    if route_stops.empty:
        continue

    # keep one row per unique stop point for the route
    route_stops = route_stops.drop_duplicates(
        subset=['stop_id', 'stop_lat', 'stop_lon'],
        keep='first'
    ).reset_index(drop=True)

    # append the stops to the route list
    route_list.append(route_stops)

# stack the per-route stops into a single table
all_route_stops = pd.concat(route_list, ignore_index=True)

# create a geodataframe from the stacked stops:
#   - the point geometry is built from the lon/lat columns (x = longitude, y = latitude)
#   - the original lat/lon columns are no longer needed once the geometry exists
#   - the stop_lat/stop_lon values are WGS84 coordinates, i.e. EPSG:4326
#     (this cell previously declared EPSG:4236, a different datum, which
#     misplaces the stops when they are reprojected)
network_stops = gpd.GeoDataFrame(
    all_route_stops.drop(columns=['stop_lat', 'stop_lon']),
    geometry=gpd.points_from_xy(all_route_stops.stop_lon, all_route_stops.stop_lat),
    crs='EPSG:4326'
)

# project the coordinates to web mercator (EPSG:3857)
network_stops = network_stops.to_crs(epsg=3857)
network_stops.head()

Check if there is a route color and set a default, if none.

In [None]:
# hex colour (without the leading '#') used for stops whose route has no colour
default_color = '000000'

# Assign through network_stops['route_color'] in both branches:
#   - an inplace fillna on network_stops.route_color acts on an intermediate
#     object and is not guaranteed to update the geodataframe
#   - setting network_stops.route_color on a frame without that column creates
#     a plain attribute, not a column
if 'route_color' in network_stops.columns:
    network_stops['route_color'] = network_stops['route_color'].fillna(default_color)
else:
    network_stops['route_color'] = default_color

Now we can plot the geodataframe

In [None]:
# prefix the stored colour values with '#' before handing them to the plot
stop_colors = '#' + network_stops.route_color

# draw the stops, then add the basemap from contextily's default source underneath
ax = network_stops.plot(color=stop_colors, figsize=(10, 10))
ctx.add_basemap(ax)
plt.show()

Contextily provides a number of basemap sources

In [None]:
# names of the basemap providers that contextily knows about
list(ctx.providers)

Let's take a look at CartoDB

In [None]:
# names of the tilesets offered under the CartoDB provider
list(ctx.providers.CartoDB)

The Positron tileset gives us a nice light background so we can easily see our routes.

In [None]:
# prefix the stored colour values with '#' before handing them to the plot
stop_colors = '#' + network_stops.route_color

# draw the stops, then add the CartoDB Positron tiles underneath
ax = network_stops.plot(color=stop_colors, figsize=(10, 10))
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)
plt.show()