In [None]:
# Import data
import pandas as pd

# Load the raw arrivals and weather CSVs, parsing their date columns as datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where passing it has no effect other than emitting a warning.
raw_arrival_data = pd.read_csv("data/ruhs_arrivals_raw_data.csv", parse_dates=['date'])
raw_weather_data = pd.read_csv("data/moval_weather.csv", parse_dates=['DATE'])

# Add weather data to each arrival row by matching dates. The inner join keeps
# only dates present in both files; the weather file's 'DATE' key then
# duplicates 'date', so it is removed.
raw_data = pd.merge(raw_arrival_data, raw_weather_data, left_on='date', right_on='DATE', how='inner')
raw_data = raw_data.drop(columns='DATE')

print(raw_data.head(10))

In [None]:
#import python-dateutil as dateutil
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar

# Cyclic encoding for day of week: map the weekday number returned by
# `.dt.weekday` onto a circle with period 7 and keep its sine and cosine.
raw_data['date'] = pd.to_datetime(raw_data['date'])
raw_data['day_of_week'] = raw_data['date'].dt.weekday
weekday_angle = raw_data['day_of_week'] * (2 * np.pi / 7)

raw_data['day_of_week_sin'] = np.sin(weekday_angle)
raw_data['day_of_week_cos'] = np.cos(weekday_angle)

# The integer weekday was only an intermediate; keep just the encoded pair.
raw_data = raw_data.drop(columns='day_of_week')

# Cyclic encoding for week of year, using a period of 52 weeks.
# NOTE(review): 'week_of_year' is not derived in this notebook (presumably a CSV
# column); if it can reach 53, that week lands one step past a full turn of
# the circle — confirm this is acceptable.
week_angle = raw_data['week_of_year'] * (2 * np.pi / 52)
raw_data['week_of_year_sin'] = np.sin(week_angle)
raw_data['week_of_year_cos'] = np.cos(week_angle)

# Only the encoded pair is kept; the raw week number is removed.
raw_data = raw_data.drop(columns='week_of_year')

# Calculate running 7-day average: for each row, the mean of `arrive_num` over
# the 7 rows immediately before it (the current row is excluded). Early rows
# average over however many prior rows exist.
# The window must be exactly 7 rows; slicing [i-8, i) would average 8 rows.
# assumes rows are consecutive calendar days in ascending order — TODO confirm
RUNNING_AVG_WINDOW = 7
arrive_nums = raw_data['arrive_num']

# shift(1) drops the current row from its own window; min_periods=1 lets the
# first few rows use a shorter history instead of producing NaN.
prior_window_mean = arrive_nums.shift(1).rolling(window=RUNNING_AVG_WINDOW, min_periods=1).mean()

# The very first row has no history at all; fall back to its own count.
# Slicing with [:1] (rather than [0]) keeps this safe on an empty frame.
prior_window_mean.iloc[:1] = arrive_nums.iloc[:1].to_numpy()

# Kept as a plain list and assigned positionally, one value per row.
running_7d_avg = prior_window_mean.tolist()
raw_data['running_7d_avg'] = running_7d_avg
    
# TODO: add distance from holiday
# Flag each row by how its date relates to the closest US federal holiday
# between 2022-01-02 and today.
holidays = np.array(USFederalHolidayCalendar().holidays(start='2022-01-02', end=pd.to_datetime("today")))


def _nearest_shifted_offset(day):
    """Return the value of (day - holiday) in days, plus one, closest to zero.

    The offset is computed against every entry of `holidays`; on a tie the
    first matching entry of `holidays` wins.
    """
    shifted_offsets = [(gap / np.timedelta64(1, 'D')) + 1 for gap in np.datetime64(day) - holidays]
    return min(shifted_offsets, key=abs)


# NOTE(review): because of the `+ 1` above, a date that is itself a holiday gets
# offset 1.0 (is_after_holiday) and the day before a holiday gets 0.0
# (is_holiday). Confirm this one-day shift is intended.
avail_dates = raw_data['date']
nearest_offsets = [_nearest_shifted_offset(day) for day in avail_dates]

is_before_holiday = [np.isclose(offset, -1.0) for offset in nearest_offsets]
is_holiday = [np.isclose(offset, 0.0) for offset in nearest_offsets]
is_after_holiday = [np.isclose(offset, 1.0) for offset in nearest_offsets]

raw_data['is_before_holiday'] = is_before_holiday
raw_data['is_holiday'] = is_holiday
raw_data['is_after_holiday'] = is_after_holiday

# Normalize temps: divide each temperature column by its own maximum value.
for temp_col in ('TMAX', 'TMIN'):
    raw_data[temp_col] = raw_data[temp_col].div(raw_data[temp_col].max())

# Preview the engineered features.
print(raw_data.head(26))

In [None]:
# Drop date column, as we only need this to extrapolate other features, not to train model.
# `errors='ignore'` makes the cell safe to re-run: a guard that indexes
# raw_data['date'] raises KeyError once the column has already been dropped.
raw_data = raw_data.drop(columns='date', errors='ignore')

In [None]:
# Define features and labels

# Columns fed to the model as inputs; the same columns must be supplied when predicting.
FEATURE_COLUMNS = [
    'week_of_year_sin', 'week_of_year_cos',
    'AWND', 'PRCP', 'TMAX', 'TMIN',
    'running_7d_avg',
    'is_before_holiday', 'is_holiday', 'is_after_holiday',
    'day_of_week_sin', 'day_of_week_cos',
]

# Labels: values we want to predict (total daily arrivals only).
target = raw_data['arrive_num']
#target = raw_data[['arrive_num', 'acuity_1', 'acuity_2', 'acuity_3', 'acuity_4', 'acuity_5']] # Un-comment to predict arrivals and acuities
print(target)
labels = np.array(target)

# Features: data that will train the model. The column names are recorded in
# `feature_list` before the frame is converted to a bare array.
feature_frame = raw_data[FEATURE_COLUMNS]
print(feature_frame)
feature_list = list(feature_frame.columns)

features = np.array(feature_frame)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for testing. `random_state` pins the shuffle so that
# a fresh Restart & Run All reproduces the same split (and the same metrics).
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.05, random_state=42
)

In [None]:
# Report the dimensions of each split as a quick sanity check.
for caption, split in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, split.shape)

In [None]:
# Train random forest regression model
from sklearn.ensemble import RandomForestRegressor

# Instantiate model: a forest of 200 trees. `random_state` fixes the bootstrap
# samples and feature draws so the fitted model and its error are reproducible.
arrive_rf = RandomForestRegressor(n_estimators=200, random_state=42)

# Train the model on training data
arrive_rf.fit(train_features, train_labels)

In [None]:
# Validate model: predict on the held-out rows and print inputs, predictions
# and true values one after another.
predictions = arrive_rf.predict(test_features)
for heading, values in (
    ('Test Features:\n', test_features),
    ('Predictions:\n', predictions),
    ('Test Data:\n', test_labels),
):
    print(heading, values, '\n')

# Calculate mean absolute error
errors = np.abs(predictions - test_labels)
mean_abs_error = round(np.mean(errors), 2)
print('Absolute error table:\n', errors.round(), '\n')
print('Mean absolute error:', mean_abs_error, 'arrivals.')

In [None]:
# Get numerical feature importances from the fitted forest
importances = list(arrive_rf.feature_importances_)

# Rank on the unrounded importances, most important first. Sorting after
# rounding would leave features that tie at two decimals in column order
# rather than in order of actual importance.
ranked_importances = sorted(zip(feature_list, importances), key=lambda pair: pair[1], reverse=True)

# List of (variable, importance) tuples, rounded to two decimals for display only
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked_importances]

# Print out the features and importances (a plain loop, since print is a side effect)
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))