In [2]:
# Import data
import pandas as pd

# Import raw data and preview contents
raw_arrival_data = pd.read_csv("data/ruhs_arrivals_raw_data.csv", parse_dates=['date'], infer_datetime_format=True)
raw_weather_data = pd.read_csv("data/moval_weather.csv", parse_dates=['DATE'], infer_datetime_format=True)

# Add weather data, remove redundant date column
raw_data = pd.merge(raw_arrival_data, raw_weather_data, left_on=['date'], right_on=['DATE'], how='inner')
raw_data = raw_data.drop('DATE', axis = 1)

print(raw_data.head(10))

        date  arrive_num  acuity_1  acuity_2  acuity_3  acuity_4  acuity_5  \
0 2022-12-01         303         1        37       150       101        11   
1 2022-12-02         286         1        33       156        87         8   
2 2022-12-03         280         6        44       137        90         1   
3 2022-12-04         245         6        45       117        73         0   
4 2022-12-05         278         3        31       139       100         0   
5 2022-12-06         304         1        34       161        99         4   
6 2022-12-07         225         1        27       123        69         4   
7 2022-12-08         280         1        41       154        75         5   
8 2022-12-09         275         3        34       148        83         5   
9 2022-12-10         226         1        30       120        68         4   

  day_of_week  week_of_year  AWND  PRCP  TMAX  TMIN  
0       thurs            49  4.47   0.0    56    49  
1         fri            49  2.01

In [4]:
#import python-dateutil as dateutil
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar

# Cyclic encodinf for day of week
raw_data['date'] = pd.to_datetime(raw_data.date)
raw_data['day_of_week'] = raw_data.date.dt.weekday

raw_data['day_of_week_sin'] = np.sin(raw_data['day_of_week'] * (2 * np.pi / 7))
raw_data['day_of_week_cos'] = np.cos(raw_data['day_of_week'] * (2 * np.pi / 7))

raw_data = raw_data.drop('day_of_week', axis=1)

# Cyclic encoding for week of year
raw_data['week_of_year_sin'] = np.sin(raw_data['week_of_year'] * (2 * np.pi / 52))
raw_data['week_of_year_cos'] = np.cos(raw_data['week_of_year'] * (2 * np.pi / 52))

raw_data = raw_data.drop('week_of_year', axis=1)

# Calculate running 7-day average
arrive_nums = raw_data['arrive_num']
running_7d_avg = []
for i, arrivals in enumerate(arrive_nums):
    prev_index = i-8
    start_range = prev_index if prev_index >= 0 else 0
    prev_7_days = arrive_nums[start_range : i]
    if prev_7_days.empty : prev_7_days = [arrivals]
    running_7d_avg.append(np.mean(prev_7_days))
raw_data['running_7d_avg'] = running_7d_avg
    
# TODO: add distance from holiday
holidays = np.array(USFederalHolidayCalendar().holidays(start='2022-01-02', end=pd.to_datetime("today")))

avail_dates = raw_data['date']
is_before_holiday = []
is_holiday = []
is_after_holiday = []
for date in avail_dates:
    delta_from_holidays = np.datetime64(date) - holidays
    days_from_holiday = [(x / np.timedelta64(1, 'D')) + 1 for x in delta_from_holidays]
    smallest_delta = min(days_from_holiday, key=abs)
    is_before_holiday.append(np.isclose(smallest_delta, -1.0))
    is_holiday.append(np.isclose(smallest_delta, 0.0))
    is_after_holiday.append(np.isclose(smallest_delta, 1.0))
raw_data['is_before_holiday'] = is_before_holiday
raw_data['is_holiday'] = is_holiday
raw_data['is_after_holiday'] = is_after_holiday

# Normalize temps
raw_data['TMAX'] = raw_data['TMAX'] / raw_data['TMAX'].max()

print(raw_data.head(26))

KeyError: 'week_of_year'

In [3]:
# Drop date column, as we only need this to extrapolate other features, not to train model
if not raw_data['date'].empty : raw_data = raw_data.drop('date', axis = 1)

In [4]:
# Define features and labels

# Labels: values we want to predict
labels = raw_data['arrive_num'] # Un-comment to predict total daily arrivals only
#labels = raw_data[['arrive_num', 'acuity_1', 'acuity_2', 'acuity_3', 'acuity_4', 'acuity_5']] # Un-comment to predict arrivals and acuities
print(labels)
labels = np.array(labels)

# Features: data that will train model, corresponds to input data when predicting labels
features = raw_data[['week_of_year_sin', 'week_of_year_cos', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'running_7d_avg', 'is_before_holiday', 'is_holiday', 'is_after_holiday', 'day_of_week_sin', 'day_of_week_cos']]
print(features)
feature_list = list(features.columns)

features = np.array(features)

0     303
1     286
2     280
3     245
4     278
     ... 
64    270
65    233
66    247
67    292
68    296
Name: arrive_num, Length: 69, dtype: int64
    week_of_year_sin  week_of_year_cos   AWND  PRCP  TMAX  TMIN  \
0          -0.354605          0.935016   4.47   0.0    56    49   
1          -0.354605          0.935016   2.01   0.0    58    47   
2          -0.354605          0.935016   2.01   0.0    71    41   
3          -0.239316          0.970942   2.01   0.0    66    38   
4          -0.239316          0.970942   3.58   0.0    60    46   
..               ...               ...    ...   ...   ...   ...   
64          0.568065          0.822984   2.01   0.0    67    37   
65          0.568065          0.822984   2.01   0.0    69    32   
66          0.663123          0.748511   3.58   0.0    61    37   
67          0.663123          0.748511  12.08   0.0    62    39   
68          0.663123          0.748511   3.13   0.0    66    34   

    running_7d_avg  is_before_holiday  is_

In [5]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.10)

In [6]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (62, 12)
Training Labels Shape: (62,)
Testing Features Shape: (7, 12)
Testing Labels Shape: (7,)


In [7]:
# Train random forest regression model
from sklearn.ensemble import RandomForestRegressor

#Instantiate model
arrive_rf = RandomForestRegressor(n_estimators = 200)

#Train the model on training data
arrive_rf.fit(train_features, train_labels)

RandomForestRegressor(n_estimators=200)

In [8]:
# Validate model
predictions = arrive_rf.predict(test_features)
print('Test Features:\n', test_features, '\n')
print('Predictions:\n', predictions, '\n')
print('Test Data:\n', test_labels, '\n')

# Calculate mean absolute error
errors = abs(predictions - test_labels)
print('Absolute error table:\n', errors.round(), '\n')
print('Mean absolute error:', round(np.mean(errors), 2), 'arrivals.')

Test Features:
 [[-0.23931566428755865 0.9709418174260518 1.57 0.0 60 33
  274.42857142857144 False False False 0.43388373911755823
  -0.900968867902419]
 [0.5680647467311557 0.8229838658936565 2.01 0.0 69 32 268.0 False False
  False -0.9749279121818236 -0.2225209339563146]
 [0.23931566428755774 0.970941817426052 1.57 0.0 65 37 249.125 False
  False False -0.7818314824680299 0.6234898018587334]
 [0.12053668025532226 0.9927088740980541 3.36 0.0 56 49 257.875 False
  False False 0.43388373911755823 -0.900968867902419]
 [0.35460488704253557 0.9350162426854148 1.34 0.0 56 33 256.75 False
  False False 0.9749279121818236 -0.22252093395631434]
 [0.4647231720437685 0.8854560256532099 13.87 0.0 57 34 245.875 False
  False False 0.0 1.0]
 [0.12053668025532226 0.9927088740980541 2.46 0.09 61 49 254.625 False
  False False 0.9749279121818236 -0.22252093395631434]] 

Predictions:
 [279.79  247.57  238.59  254.36  253.4   270.015 257.985] 

Test Data:
 [280 233 226 266 277 291 282] 

Absolute erro

In [9]:
# Get numerical feature importances
importances = list(arrive_rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: day_of_week_sin      Importance: 0.36
Variable: running_7d_avg       Importance: 0.18
Variable: TMAX                 Importance: 0.12
Variable: week_of_year_cos     Importance: 0.08
Variable: TMIN                 Importance: 0.08
Variable: week_of_year_sin     Importance: 0.06
Variable: AWND                 Importance: 0.04
Variable: is_before_holiday    Importance: 0.03
Variable: day_of_week_cos      Importance: 0.02
Variable: PRCP                 Importance: 0.01
Variable: is_after_holiday     Importance: 0.01
Variable: is_holiday           Importance: 0.0
