In [7]:
# Import data
import pandas as pd

# Import raw data and preview contents
raw_arrival_data = pd.read_csv("data/ruhs_arrivals_raw_data.csv", parse_dates=['date'], infer_datetime_format=True)
raw_weather_data = pd.read_csv("data/moval_weather.csv", parse_dates=['DATE'], infer_datetime_format=True)

# Add weather data, remove redundant date column
raw_data = pd.merge(raw_arrival_data, raw_weather_data, left_on=['date'], right_on=['DATE'], how='inner')
raw_data = raw_data.drop('DATE', axis = 1)

print(raw_data.head(10))

        date  arrive_num  acuity_1  acuity_2  acuity_3  acuity_4  acuity_5  \
0 2022-12-01         303         1        37       150       101        11   
1 2022-12-02         286         1        33       156        87         8   
2 2022-12-03         280         6        44       137        90         1   
3 2022-12-04         245         6        45       117        73         0   
4 2022-12-05         278         3        31       139       100         0   
5 2022-12-06         304         1        34       161        99         4   
6 2022-12-07         225         1        27       123        69         4   
7 2022-12-08         280         1        41       154        75         5   
8 2022-12-09         275         3        34       148        83         5   
9 2022-12-10         226         1        30       120        68         4   

  day_of_week  week_of_year  AWND  PRCP  TMAX  TMIN  
0       thurs            49  4.47   0.0    56    49  
1         fri            49  2.01

In [29]:
#import python-dateutil as dateutil
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar

# TODO: One-hot encode the day of week
#raw_data = pd.get_dummies(raw_data)

# Calculate running 7-day average
arrive_nums = raw_data['arrive_num']
running_7d_avg = []
for i, arrivals in enumerate(arrive_nums):
    prev_index = i-8
    start_range = prev_index if prev_index >= 0 else 0
    prev_7_days = arrive_nums[start_range : i]
    if prev_7_days.empty : prev_7_days = [arrivals]
    running_7d_avg.append(np.mean(prev_7_days))
raw_data['running_7d_avg'] = running_7d_avg
    
# TODO: add distance from holiday
holidays = np.array(USFederalHolidayCalendar().holidays(start='2022-01-02', end=pd.to_datetime("today")))
print(holidays)
avail_dates = raw_data['date']
is_before_holiday = []
is_holiday = []
is_after_holiday = []
for date in avail_dates:
    delta_from_holidays = np.datetime64(date) - holidays
    days_from_holiday = [(x / np.timedelta64(1, 'D')) + 1 for x in delta_from_holidays]
    smallest_delta = min(days_from_holiday, key=abs)
    is_before_holiday.append(np.isclose(smallest_delta, -1.0))
    is_holiday.append(np.isclose(smallest_delta, 0.0))
    is_after_holiday.append(np.isclose(smallest_delta, 1.0))
raw_data['is_before_holiday'] = is_before_holiday
raw_data['is_holiday'] = is_holiday
raw_data['is_after_holiday'] = is_after_holiday

print(raw_data.head(26))

['2022-01-17T00:00:00.000000000' '2022-02-21T00:00:00.000000000'
 '2022-05-30T00:00:00.000000000' '2022-07-04T00:00:00.000000000'
 '2022-09-05T00:00:00.000000000' '2022-10-10T00:00:00.000000000'
 '2022-11-11T00:00:00.000000000' '2022-11-24T00:00:00.000000000'
 '2022-12-26T00:00:00.000000000' '2023-01-02T00:00:00.000000000'
 '2023-01-16T00:00:00.000000000']
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.flo

In [None]:
# Drop date column, as we only need this to extrapolate other features, not to train model
if not raw_data['date'].empty : raw_data = raw_data.drop('date', axis = 1)

In [3]:
# Define features and labels

# Labels: values we want to predict
labels = raw_data['arrive_num'] # Un-comment to predict total daily arrivals only
#labels = raw_data[['arrive_num', 'acuity_1', 'acuity_2', 'acuity_3', 'acuity_4', 'acuity_5']] # Un-comment to predict arrivals and acuities
print(labels)
labels = np.array(labels)

# Features: data that will train model, corresponds to input data when predicting labels
features = raw_data[['day_of_week', 'week_of_year', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'running_7d_avg']]
print(features)
feature_list = list(features.columns)

features = np.array(features)

0     303
1     286
2     280
3     245
4     278
     ... 
64    270
65    233
66    247
67    292
68    296
Name: arrive_num, Length: 69, dtype: int64
    day_of_week  week_of_year   AWND  PRCP  TMAX  TMIN  running_7d_avg
0             5            49   4.47   0.0    56    49      303.000000
1             6            49   2.01   0.0    58    47      303.000000
2             7            49   2.01   0.0    71    41      294.500000
3             1            50   2.01   0.0    66    38      289.666667
4             2            50   3.58   0.0    60    46      278.500000
..          ...           ...    ...   ...   ...   ...             ...
64            6             5   2.01   0.0    67    37      266.750000
65            7             5   2.01   0.0    69    32      268.000000
66            1             6   3.58   0.0    61    37      264.125000
67            2             6  12.08   0.0    62    39      266.000000
68            3             6   3.13   0.0    66    34      273.50

In [4]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.10)

In [5]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (62, 7)
Training Labels Shape: (62,)
Testing Features Shape: (7, 7)
Testing Labels Shape: (7,)


In [6]:
# Train random forest regression model
from sklearn.ensemble import RandomForestRegressor

#Instantiate model
arrive_rf = RandomForestRegressor(n_estimators = 100)

#Train the model on training data
arrive_rf.fit(train_features, train_labels)

RandomForestRegressor()

In [7]:
# Validate model
predictions = arrive_rf.predict(test_features)
print('Test Features:\n', test_features.round(), '\n')
print('Predictions:\n', predictions, '\n')
print('Test Data:\n', test_labels, '\n')

# Calculate mean absolute error
errors = abs(predictions - test_labels)
print('Absolute error table:\n', errors.round(), '\n')
print('Mean absolute error:', round(np.mean(errors), 2), 'arrivals.')

Test Features:
 [[  7.  52.   2.   0.  78.  37. 250.]
 [  4.   2.   2.   0.  57.  39. 253.]
 [  2.  51.   2.   0.  47.  37. 256.]
 [  6.  53.   1.   0.  56.  51. 260.]
 [  5.  49.   4.   0.  56.  49. 303.]
 [  7.  49.   2.   0.  71.  41. 294.]
 [  5.  50.   2.   0.  60.  33. 274.]] 

Predictions:
 [241.19 261.26 273.3  264.45 261.14 240.77 278.03] 

Test Data:
 [181 265 238 255 303 280 280] 

Absolute error table:
 [60.  4. 35.  9. 42. 39.  2.] 

Mean absolute error: 27.39 arrivals.


In [8]:
# Get numerical feature importances
importances = list(arrive_rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: day_of_week          Importance: 0.42
Variable: running_7d_avg       Importance: 0.22
Variable: TMIN                 Importance: 0.1
Variable: week_of_year         Importance: 0.09
Variable: AWND                 Importance: 0.08
Variable: TMAX                 Importance: 0.07
Variable: PRCP                 Importance: 0.02
