# Interpolating wait times

## import necessary packages

In [1]:
import pandas as pd

## import data

In [101]:
# Read the Edmonds and Kingston CSVs from ../data/hour_extracted into DataFrames.
edmonds_csv = '../data/hour_extracted/edmonds.csv'
kingston_csv = '../data/hour_extracted/kingston.csv'
ed_df, ki_df = pd.read_csv(edmonds_csv), pd.read_csv(kingston_csv)

In [102]:
# Peek at the first five rows of the freshly loaded Edmonds frame.
ed_df.head(5)

Unnamed: 0,tweet_text,time,wait_time
0,edmonds terminal wait time - one hour,2018-02-09 23:35 +0000,1.0
1,update - no extended wait in edmonds,2018-02-10 02:05 +0000,0.0
2,update - no extended wait departing edmonds,2018-02-10 03:35 +0000,0.0
3,edmonds terminal wait time - one hour,2018-02-10 20:25 +0000,1.0
4,update - no extended wait departing edmonds,2018-02-11 00:30 +0000,0.0


## convert to datetime, fill unreported hours

In [103]:
# Parse the raw 'time' strings into timezone-aware (UTC) timestamps,
# then write them back over the original column.
time_utc = pd.to_datetime(ed_df['time'], utc=True)
ed_df['time'] = time_utc

In [104]:
# Use the parsed timestamps as the row index; the 'time' column is moved
# into the index rather than kept as a regular column.
ed_df = ed_df.set_index(keys='time')

In [105]:
# Re-express the UTC index as US/Pacific local time (same instants,
# different wall-clock representation).
ed_df = ed_df.tz_convert(tz='US/Pacific')

In [106]:
# Build one "start of day" timestamp per calendar day at 05:00,
# from 2016-12-24 through 2019-12-31 (still timezone-naive here).
sod = pd.date_range(start='2016-12-24 05:00:00', end='2019-12-31 5:00:00', freq='D')
# Index-only frame: the timestamps live in an index named 'time', no columns yet.
sod_df = pd.DataFrame({'time': sod}).set_index('time')

In [107]:
# Every start-of-day row reports a wait of 0, and its naive 05:00 timestamp
# is interpreted as US/Pacific wall-clock time.
sod_df = sod_df.assign(wait_time=0).tz_localize('US/Pacific')

In [108]:
# Stack the daily 05:00 "no wait" rows underneath the tweet-derived rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat performs the same row-wise stacking. Rows are not re-sorted,
# and tweet_text is NaN for the added start-of-day rows.
ed_df = pd.concat([ed_df, sod_df])

In [109]:
# Display the combined frame: tweet rows first, followed by the start-of-day
# rows (which have no tweet_text).
ed_df

Unnamed: 0_level_0,tweet_text,wait_time
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-02-09 15:35:00-08:00,edmonds terminal wait time - one hour,1.0
2018-02-09 18:05:00-08:00,update - no extended wait in edmonds,0.0
2018-02-09 19:35:00-08:00,update - no extended wait departing edmonds,0.0
2018-02-10 12:25:00-08:00,edmonds terminal wait time - one hour,1.0
2018-02-10 16:30:00-08:00,update - no extended wait departing edmonds,0.0
...,...,...
2019-12-27 05:00:00-08:00,,0.0
2019-12-28 05:00:00-08:00,,0.0
2019-12-29 05:00:00-08:00,,0.0
2019-12-30 05:00:00-08:00,,0.0


In [110]:
# Resample onto an hourly grid, creating the "missing" hours, and carry the
# most recent reported values forward into them.
# The lowercase 'h' alias is used because the uppercase 'H' offset alias
# is deprecated as of pandas 2.2.
ed_df = ed_df.resample('1h').ffill()

In [114]:
# Keep only sailing hours. The window runs from 05:00 to 00:55 and therefore
# wraps past midnight, which drops the 1-4 am rows for Edmonds.
# (For Kingston the non-sailing window is 1-3 am.)
ed_df = ed_df.between_time(start_time='5:00', end_time='0:55')

## expand date features

In [123]:
# Break the timestamp index out into calendar feature columns.
# - assign() builds a new frame instead of writing columns into the frame
#   returned by the between_time() filter.
# - DatetimeIndex.week was deprecated in pandas 1.1 and removed in 2.0;
#   isocalendar().week yields the same ISO week number. It is converted to a
#   plain int64 array so the column dtype matches the other integer features
#   and is assigned positionally.
ed_df = ed_df.assign(
    year=ed_df.index.year,
    month=ed_df.index.month,
    day=ed_df.index.day,
    hour=ed_df.index.hour,
    dayofyear=ed_df.index.dayofyear,
    week=ed_df.index.isocalendar().week.to_numpy(dtype='int64'),
    weekday=ed_df.index.weekday,
)

In [124]:
# Check the first five rows to confirm the new calendar columns are present.
ed_df.head(5)

Unnamed: 0_level_0,tweet_text,wait_time,year,month,day,hour,dayofyear,week,weekday
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-12-24 05:00:00-08:00,,0.0,2016,12,24,5,359,51,5
2016-12-24 06:00:00-08:00,,0.0,2016,12,24,6,359,51,5
2016-12-24 07:00:00-08:00,,0.0,2016,12,24,7,359,51,5
2016-12-24 08:00:00-08:00,,0.0,2016,12,24,8,359,51,5
2016-12-24 09:00:00-08:00,,0.0,2016,12,24,9,359,51,5


In [125]:
# Export the hourly features: turn the time index back into a regular column,
# drop the free-text tweets, and write the CSV without the row-number index.
export_df = ed_df.reset_index().drop(columns=['tweet_text'])
export_df.to_csv('~/ds0805/blogs/edmonds.csv', index=False)

## Holidays

See the pandas user guide section on holidays and holiday calendars: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#holidays-holiday-calendars