# Mount Drive

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used later
# in this notebook live under '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Statements

In [None]:
# Install geopandas into the runtime; it is imported as `gpd` in the next cell
# and used to read the GeoJSON input.
!pip install geopandas

Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 29.7 MB/s 
[?25hCollecting fiona>=1.8
  Downloading Fiona-1.8.20-cp37-cp37m-manylinux1_x86_64.whl (15.4 MB)
[K     |████████████████████████████████| 15.4 MB 39 kB/s 
[?25hCollecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 34.2 MB/s 
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.20 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1


In [None]:
# imports
import numpy as np
import pandas as pd
import geopandas as gpd

# Generate Segment Data

## Inputs (make sure to update the `save` variable)

In [None]:
# Route identifier; interpolated into the `read` and `save` file names below.
route_str = 'B46'

In [None]:
# `read`: GeoJSON file that the data generation cell loads as its raw input.
read = f'/content/drive/My Drive/Bus Watcher Spec Project/Projects/Alex Amy + Sanket Shah/Data/Bus/API Call/{route_str}_2021-10-18.geojson'
# `save`: CSV file that the data generation cell writes the segment data to.
save = f'/content/drive/My Drive/Bus Watcher Spec Project/Projects/Alex Amy + Sanket Shah/Data/Bus/Segment Data - Raw/{route_str}_2021-10-18_v2.csv'

## Run Data Generation Script (~30 - 60+ mins)

In [None]:
# Build segment data from the raw observations in `read` and write it to `save`.
#
# For every trip (trip_id + service_date + vehicle_id) the observations are split by
# next_stop_id. Each (trip, next_stop_id) pair becomes ONE output row: the middle
# observation of that run, plus the number of observations and the time spanned.
# Observations with no next_stop_id are passed through one row each, with NaN
# observation_count / duration.

# read data (attribute table only; geometry is skipped)
raw_data_df = gpd.read_file(read, ignore_geometry=True)

# remove vehicles that never report passenger_count:
# keep only rows whose vehicle has at least one non-null passenger_count
reporting_vehicles = raw_data_df.loc[raw_data_df['passenger_count'].notna(), 'vehicle_id'].unique()
raw_data_df = raw_data_df[raw_data_df['vehicle_id'].isin(reporting_vehicles)].reset_index(drop=True)

# cast 'timestamp' column values as DateTime objects
raw_data_df['timestamp'] = pd.to_datetime(raw_data_df['timestamp'])

# create unique_trip_id column
raw_data_df['unique_trip_id'] = raw_data_df['trip_id'] + '-' + raw_data_df['service_date'] + '-' + raw_data_df['vehicle_id']

# generate segment_data_dict
# groupby makes a single pass over the frame instead of copying and re-filtering it
# for every trip and stop, and yields groups in sorted key order so the saved CSV has
# a reproducible row order. groupby skips NaN keys, so rows whose unique_trip_id is
# NaN produce no output rows.
segment_data_dict = {}
for unique_trip_id, unique_trip_id_df in raw_data_df.groupby('unique_trip_id'):
  # one summary row per stop the trip was heading to (NaN next_stop_id is skipped here)
  for unique_trip_id_stop, unique_trip_id_stop_df in unique_trip_id_df.groupby('next_stop_id'):
    observation_count = unique_trip_id_stop_df.shape[0]
    duration = unique_trip_id_stop_df['timestamp'].max() - unique_trip_id_stop_df['timestamp'].min()
    # representative observation: the middle row, in the order rows appear in the file
    segment_data = unique_trip_id_stop_df.iloc[observation_count // 2].to_dict()
    segment_data['observation_count'] = observation_count
    segment_data['duration'] = duration
    segment_data_dict[len(segment_data_dict)] = segment_data

  # observations without a next_stop_id cannot be aggregated; keep each one as its own row
  no_stop_df = unique_trip_id_df[unique_trip_id_df['next_stop_id'].isna()]
  for segment_data in no_stop_df.to_dict('records'):
    segment_data['observation_count'] = np.nan
    segment_data['duration'] = np.nan
    segment_data_dict[len(segment_data_dict)] = segment_data

segment_data_df = pd.DataFrame.from_dict(segment_data_dict, orient='index')

# save segment_data_df to drive as csv
segment_data_df.to_csv(save, index=False)