# AIS Shipping Data Demo

In this demo, we'll be looking at a preprocessed csv file containing id-state-action-state transitions to plot the corresponding discretized ship trajectories on a map. For information on  how this csv was generated, please reference ``README.md``.

In [3]:
import yaml
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go

First, we load the metadata. The metadata specifies the dimensions and resolution of the grid in longitude and latitude so we can plot the discretized trajectories on a map by mapping coordinates to states. The ``grid_len`` is the side length of one square in degrees of a regular Euclidean grid with ``num_cols`` columns. With this information, we can deduce the boundaries of a grid square from an integer state.

In [4]:
meta_file= 'meta_data.yaml'
ais_meta = {}
with open(meta_file, 'r') as stream:
    try:
        ais_meta = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

all_files_meta = ais_meta['all_files_meta']
directories = ais_meta['directories']
grid_params = ais_meta['grid_params']

# specifies input directory and files of interest
in_dir_path = directories['in_dir_path']
in_dir_data = directories['in_dir_data']

ais_meta

{'all_files_meta': {'AIS_2017_01_Zone18.csv': {'month': 1,
   'year': 2017,
   'zone': 18}},
 'directories': {'in_dir_data': 'ais_data_output.csv', 'in_dir_path': './'},
 'grid_params': {'grid_len': 1.0,
  'max_lat': 42.0,
  'max_lon': -72.0,
  'min_lat': 35.0,
  'min_lon': -78.0,
  'num_cols': 6}}

Now we load the sequences.

In this data, there are thousands of trajectories.

In [5]:
# reads in first file of interest
ais_data = pd.read_csv(in_dir_path + in_dir_data)

ais_data.head(10)

Unnamed: 0,sequence_id,from_state_id,action_id,to_state_id
0,0,14,2,20
1,0,20,1,21
2,0,21,3,20
3,0,20,1,21
4,1,33,1,34
5,1,34,3,33
6,1,33,1,34
7,2,21,2,27
8,2,27,1,28
9,3,26,4,20


We define a function that converts the ``state_id``s from the ``ais_data`` to the coordinates of the top left corner of the corresponding grid square.

In [4]:
def state_to_coord(state):
    state_col = state % grid_params['num_cols']
    state_row = state // grid_params['num_cols']
    state_lon = grid_params['min_lon'] + grid_params['grid_len'] * state_col
    state_lat = grid_params['min_lat'] + grid_params['grid_len'] * state_row
    return state_lon, state_lat

We use pandas to create a new dataframe that will contain the coordinates of the center of each state in each sequence.

In [5]:
# TODO: might be a better idea to just create a mapping of state: (lon, lat)
cols = ['lon', 'lat']
ais_coord = ais_data['from_state_id'].apply(state_to_coord)
ais_data[cols] = pd.DataFrame(ais_coord.tolist()) + [grid_params['grid_len'] / 2, -grid_params['grid_len'] / 2]
ais_data.head()

Unnamed: 0,sequence_id,from_state_id,action_id,to_state_id,lon,lat
0,0,14,2,20,-75.5,36.5
1,0,20,1,21,-75.5,37.5
2,0,21,3,20,-74.5,37.5
3,0,20,1,21,-75.5,37.5
4,1,33,1,34,-74.5,39.5


We also create a dataframe that contains only the unique coordinate tuples in the dataset so that each state is only plotted once on the scatterplot.

In [6]:
unique_coords = ais_coord.unique() 
ais_unique = pd.DataFrame(data=unique_coords.tolist(), columns=cols) + [grid_params['grid_len'] / 2, -grid_params['grid_len'] / 2]
ais_unique.head()

Unnamed: 0,lon,lat
0,-75.5,36.5
1,-75.5,37.5
2,-74.5,37.5
3,-74.5,39.5
4,-73.5,39.5


With the latitudes and longitudes now available, we create a list of dataframes, with each entry of the list corresponding to a different trajectory defined by the ``sequence_id`` column.

In [7]:
sequence_id_unique = ais_data['sequence_id'].unique().tolist()
sequence_dfs = []
for seq_id in sequence_id_unique:
    seq_df = ais_data.loc[ais_data.sequence_id == seq_id]
    # adds final dummy row to each sequence with just the final state in the trajectory
    last_row = seq_df.iloc[-1]
    lon, lat = state_to_coord(last_row['to_state_id'])
    lon += grid_params['grid_len'] / 2
    lat += -grid_params['grid_len'] / 2

    last_row[['from_state_id', 'action_id', 'to_state_id', 'lon', 'lat']] = [last_row['to_state_id'], np.nan, np.nan, lon, lat]
    last_row = np.reshape(last_row.tolist(), (1, -1))
    last_row = pd.DataFrame(data=last_row, columns=seq_df.columns)
    seq_df = pd.concat([seq_df, last_row], ignore_index=True)
    # adds sequence to the list of sequences
    sequence_dfs.append(seq_df)
# TODO: I might want to drop the sequence_id row at this point...



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



We then use plotly to plot the desired area of the globe.

In [8]:
plotly.offline.init_notebook_mode(connected=True)

ais_states = [go.Scattergeo(
    locationmode = 'USA-states',
    lat = ais_unique['lat'],
    lon = ais_unique['lon'],
    # hoverinfo = 'text',
    # text = ais_data['from_state_id'],
    mode = 'markers',
    marker = go.scattergeo.Marker(
        size = 2,
        color = 'red',
        line = go.scattergeo.marker.Line(
            width = 3,
            color = 'rgba(68, 68, 68, 50)'
        )
    ))]


MAX_TRAJECTORIES = 20
ais_trajectories = []
# for i in range(MAX_TRAJECTORIES):
for i in range(len(sequence_dfs)):
    sequence_df = sequence_dfs[i]
    # gets random color for each trajectory
    red = str(np.random.randint(0, high=230))
    green = str(np.random.randint(0, high=230))
    blue = str(np.random.randint(0, high=230))
    ais_trajectories.append(
        go.Scattergeo(
            lon = sequence_df['lon'],
            lat = sequence_df['lat'],
            mode = 'lines',
            line = go.scattergeo.Line(
                width = 1,
                color = 'rgb(' + red + ', ' + blue + ', ' + green + ')',
            ),
        )
    )

layout = go.Layout(
    autosize=False,
    width=900,
    height=750,
    title = go.layout.Title(
        text = 'Shipping data states scatter'
    ),
    showlegend = False,
    geo = go.layout.Geo(
        scope = 'north america',
        resolution = 50,
        projection = go.layout.geo.Projection(
            type = 'equirectangular'
        ),
        showland = True,
        showlakes = True,
        coastlinewidth = 2,
        landcolor = 'rgb(204, 204, 204)',
        lakecolor = 'rgb(255,255,255)',
        countrycolor = 'rgb(190, 190, 190)',
        lonaxis = go.layout.geo.Lonaxis(
            range = [grid_params['min_lon'] - 25, grid_params['max_lon'] + 25],
            # range = [-150, -50],
            showgrid = True,
            dtick = grid_params['grid_len']
        ),
        lataxis = go.layout.geo.Lataxis(
            range = [grid_params['min_lat'] - 15, grid_params['max_lat'] + 15],
            showgrid = True,
            dtick = grid_params['grid_len']
        ),
    ),
)

fig = go.FigureWidget(data = ais_states + ais_trajectories, layout = layout)

lines = fig.data[1:]

# create our callback function
def update_point(trace, points, selector):
    if len(points.point_inds) > 0:
        trace.line.width += 1

for line in lines:
    line.on_click(update_point)

fig

FigureWidget({
    'data': [{'lat': array([36.5, 37.5, 37.5, 39.5, 39.5, 38.5, 38.5, 34.5, 35.5, 38.5, 37.5, 3…