## Import necessary packages

In [1]:
from pymongo import MongoClient
import pandas as pd

## Import weather data and convert to DataFrame

### Access MongoDB

In [2]:
# Connect with MongoClient's default connection settings (no host/port given),
# then select the 'ferry' database and its 'weather' collection.
client = MongoClient()
ferry_db = client['ferry']
weather = ferry_db['weather']

### Examine the structure of a weather record

In [3]:
# Fetch a single document to inspect the nested layout (daily -> data -> [record]).
weather.find_one()

{'_id': ObjectId('5e86586d17b20f91ad7ec965'),
 'latitude': 47.811784,
 'longitude': -122.38325,
 'timezone': 'America/Los_Angeles',
 'daily': {'data': [{'time': 1481961600,
    'summary': 'Overcast throughout the day.',
    'icon': 'partly-cloudy-day',
    'sunriseTime': 1481990100,
    'sunsetTime': 1482020400,
    'moonPhase': 0.66,
    'precipIntensity': 0,
    'precipIntensityMax': 0,
    'precipIntensityMaxTime': 1482012060,
    'precipProbability': 0,
    'temperatureHigh': 33.3,
    'temperatureHighTime': 1482017880,
    'temperatureLow': 27.46,
    'temperatureLowTime': 1482037440,
    'apparentTemperatureHigh': 32.8,
    'apparentTemperatureHighTime': 1482017880,
    'apparentTemperatureLow': 27.95,
    'apparentTemperatureLowTime': 1482037440,
    'dewPoint': 21.51,
    'humidity': 0.76,
    'pressure': 1029.9,
    'windSpeed': 1.15,
    'windGust': 3.24,
    'windGustTime': 1482005460,
    'windBearing': 95,
    'cloudCover': 0.76,
    'uvIndex': 1,
    'uvIndexTime': 148200

### Get all records, convert to a list

In [4]:
# Materialize the whole cursor into a list so the records can be iterated
# without holding the cursor open.
weather_data = list(weather.find())

### Flatten information and add to weather_df

In [5]:
# Empty placeholder; it is populated from the Mongo records in the next cell.
weather_df = pd.DataFrame()

In [6]:
# Flatten the single daily record of each Mongo document into a one-row frame
# and tag it with the document's 'date'.
day_frames = []
for day in weather_data:
    day_df = pd.json_normalize(day['daily']['data'][0])
    day_df['date'] = day['date']
    day_frames.append(day_df)

# Concatenate once instead of calling DataFrame.append inside the loop:
# append copies the growing frame on every iteration and no longer exists in
# pandas 2.x. Rebuilding from the list also makes this cell safe to re-run
# (the old version appended the same rows again on every run).
# Row labels are kept as-is (every one-row frame is labelled 0), matching what
# the append-based loop produced.
if day_frames:
    weather_df = pd.concat(day_frames)

In [7]:
# 'time' is the raw epoch value carried over from the daily record; the
# 'date' column added above is kept as the day key instead.
weather_df = weather_df.drop(columns=['time'])
weather_df.head()

Unnamed: 0,summary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,temperatureHigh,...,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,date,precipType,precipAccumulation,ozone
0,Overcast throughout the day.,partly-cloudy-day,1481990100,1482020400,0.66,0.0,0.0,1482012000.0,0.0,33.3,...,33.3,1482017880,23.35,1481972100,32.8,1482017880,2016-12-17,,,
0,Drizzle overnight.,partly-cloudy-day,1482076560,1482106800,0.69,0.0,0.0004,1482131000.0,0.03,39.69,...,39.99,1482130800,29.76,1482051540,39.19,1482102060,2016-12-18,,,
0,Light rain throughout the day.,rain,1482162960,1482193200,0.72,0.0124,0.0826,1482214000.0,0.91,43.78,...,43.82,1482205260,32.26,1482155460,43.32,1482205260,2016-12-19,rain,,
0,Partly cloudy throughout the day.,rain,1482249420,1482279660,0.75,0.0037,0.0634,1482221000.0,0.91,49.78,...,49.78,1482273900,39.37,1482303600,47.56,1482276840,2016-12-20,rain,,
0,Partly cloudy throughout the day.,partly-cloudy-day,1482335880,1482366060,0.78,0.0002,0.0007,1482353000.0,0.1,45.85,...,45.85,1482357600,36.86,1482324360,45.35,1482357600,2016-12-21,rain,,


## Combine weather data with wait data

### Import wait data

In [8]:
# Load the wait-time table for the Edmonds terminal from the expanded data set.
ed_wait_df = pd.read_csv('../data/expanded/edmonds.csv')

In [9]:
# Preview the first rows to check the columns read from the CSV.
ed_wait_df.head()

Unnamed: 0,time,tweet_text,wait_time,year,month,day,hour,dayofyear,week,weekday
0,2016-12-24 05:00:00-08:00,,0.0,2016,12,24,5,359,51,5
1,2016-12-24 06:00:00-08:00,,0.0,2016,12,24,6,359,51,5
2,2016-12-24 07:00:00-08:00,,0.0,2016,12,24,7,359,51,5
3,2016-12-24 08:00:00-08:00,,0.0,2016,12,24,8,359,51,5
4,2016-12-24 09:00:00-08:00,,0.0,2016,12,24,9,359,51,5


### Prepare data frames for merging (timezone adjustments)

In [10]:
# Parse the timestamp strings as UTC instants, then express them in Pacific
# time so they line up with the weather dates.
ed_wait_df['time'] = (
    pd.to_datetime(ed_wait_df['time'], utc=True)
    .dt.tz_convert('US/Pacific')
)

In [11]:
# Attach the Pacific timezone to the weather dates so they are comparable with
# the tz-aware 'time' column of the wait data.
# NOTE(review): tz_localize only accepts tz-naive values, so re-running this
# cell after it has already succeeded will raise.
weather_df['date'] = weather_df['date'].dt.tz_localize('US/Pacific')

### Merge weather with wait time dataframe

In [12]:
# merge_asof requires both frames to be sorted by their join key and raises
# otherwise. Sort explicitly (mergesort is stable, so already-ordered rows keep
# their order) instead of relying on the CSV row order and the Mongo cursor
# order.
# With the default direction ('backward'), each wait-time row receives the
# latest weather row whose 'date' is <= its 'time'.
ed_wait_df = pd.merge_asof(
    ed_wait_df.sort_values('time', kind='mergesort'),
    weather_df.sort_values('date', kind='mergesort'),
    left_on='time',
    right_on='date',
)

In [13]:
# Transpose the preview so the many merged columns are listed as rows.
ed_wait_df.head().T

Unnamed: 0,0,1,2,3,4
time,2016-12-24 05:00:00-08:00,2016-12-24 06:00:00-08:00,2016-12-24 07:00:00-08:00,2016-12-24 08:00:00-08:00,2016-12-24 09:00:00-08:00
tweet_text,,,,,
wait_time,0,0,0,0,0
year,2016,2016,2016,2016,2016
month,12,12,12,12,12
day,24,24,24,24,24
hour,5,6,7,8,9
dayofyear,359,359,359,359,359
week,51,51,51,51,51
weekday,5,5,5,5,5


## Add holiday data

In [14]:
# The file is read without a header row; its single column is named 'holiday'.
holidays_df = pd.read_csv('../data/holidays.csv', header=None, names=['holiday'])

In [15]:
# Parse the holiday values as datetimes and attach the Pacific timezone so they
# can be compared with the tz-aware wait-time timestamps.
holidays_df['holiday'] = (
    pd.to_datetime(holidays_df['holiday'])
    .dt.tz_localize('US/Pacific')
)

In [16]:
# Index by timestamp so the holiday flags below can be assigned by time range.
ed_wait_df = ed_wait_df.set_index(['time'])

In [17]:
from datetime import timedelta

In [18]:
# For i = 0..7, add a boolean column flagging every row that falls within
# i days before or after any holiday (the holiday itself included).
for i in range(0, 8):
    column_name = 'holiday +/ ' + str(i)
    ed_wait_df[column_name] = False

    for holiday in holidays_df['holiday']:
        # DateOffset moves by calendar days and keeps the wall-clock time,
        # whereas timedelta on a tz-aware timestamp moves by fixed 24-hour
        # blocks and ends up an hour off when the window crosses a DST change.
        start = holiday - pd.DateOffset(days=i)
        end = holiday + pd.DateOffset(days=i + 1)
        # Half-open window [start, end): label slicing with .loc[start:end]
        # includes both endpoints, so a row stamped exactly at `end` (the first
        # instant after the window) was flagged as well.
        in_window = (ed_wait_df.index >= start) & (ed_wait_df.index < end)
        ed_wait_df.loc[in_window, column_name] = True

In [19]:
# Inspect the first 50 rows to spot-check the new holiday flag columns.
ed_wait_df.head(50)

Unnamed: 0_level_0,tweet_text,wait_time,year,month,day,hour,dayofyear,week,weekday,summary,...,precipAccumulation,ozone,holiday +/ 0,holiday +/ 1,holiday +/ 2,holiday +/ 3,holiday +/ 4,holiday +/ 5,holiday +/ 6,holiday +/ 7
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-12-24 05:00:00-08:00,,0.0,2016,12,24,5,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True
2016-12-24 06:00:00-08:00,,0.0,2016,12,24,6,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True
2016-12-24 07:00:00-08:00,,0.0,2016,12,24,7,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True
2016-12-24 08:00:00-08:00,,0.0,2016,12,24,8,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True
2016-12-24 09:00:00-08:00,,0.0,2016,12,24,9,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True
2016-12-24 10:00:00-08:00,,0.0,2016,12,24,10,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True
2016-12-24 11:00:00-08:00,,0.0,2016,12,24,11,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True
2016-12-24 12:00:00-08:00,,0.0,2016,12,24,12,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True
2016-12-24 13:00:00-08:00,ed/king- edmonds wait time - 60 minutes,1.0,2016,12,24,13,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True
2016-12-24 14:00:00-08:00,ed/king- edmonds wait time - 60 minutes,1.0,2016,12,24,14,359,51,5,Possible drizzle in the evening.,...,,,False,False,True,True,True,True,True,True


## Prepare for modeling

In [20]:
# List column dtypes and non-null counts before selecting model features.
ed_wait_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23154 entries, 2016-12-24 05:00:00-08:00 to 2019-12-31 13:00:00-08:00
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype                     
---  ------                       --------------  -----                     
 0   tweet_text                   4413 non-null   object                    
 1   wait_time                    23154 non-null  float64                   
 2   year                         23154 non-null  int64                     
 3   month                        23154 non-null  int64                     
 4   day                          23154 non-null  int64                     
 5   hour                         23154 non-null  int64                     
 6   dayofyear                    23154 non-null  int64                     
 7   week                         23154 non-null  int64                     
 8   weekday                      23154 non-null  int64                   

In [21]:
# Remove the free-text columns ('tweet_text', 'summary') and 'date', which was
# only the weather-side merge key.
ed_wait_df = ed_wait_df.drop(columns=['tweet_text', 'summary', 'date'])

## Train-Test Split

In [22]:
from sklearn.model_selection import train_test_split

# Target is the wait time; every other remaining column is a feature.
y = ed_wait_df['wait_time']
X = ed_wait_df.drop(columns=['wait_time'])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): train_test_split shuffles rows by default, so hourly rows from
# the same day can land in both train and test - confirm this is intended for
# time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=42,
)

## Modeling

Chosen metric: RMSE (root mean squared error)

In [23]:
from sklearn.metrics import mean_squared_error

### Naive: mean

In [24]:
# Naive baseline: predict the training-set mean wait time for every row.
pred = y_train.mean()

# RMSE is computed as the square root of the MSE rather than by passing
# squared=False, which newer scikit-learn releases deprecate and then remove.
# The value is the same up to floating-point rounding.
train_preds = [pred] * X_train.shape[0]
print(f"X_train RMSE: {mean_squared_error(y_train, train_preds) ** 0.5}")

test_preds = [pred] * X_test.shape[0]
print(f"X_test RMSE: {mean_squared_error(y_test, test_preds) ** 0.5}")

X_train RMSE: 0.4539355639034019
X_test RMSE: 0.4380723356182337


### Linear Regression

In [25]:
from sklearn.linear_model import LinearRegression

### Linear Regression w/ Regularization