In [2]:
if (!require(pacman)) install.packages('pacman')
pacman::p_load(tidyverse,
               tidymodels,
               arrow,
               install = TRUE)

Load Data

In [None]:
bikedata <- read_parquet(".././data/bikeshare-data.parquet")
bikedata <- bikedata %>%
    select(-total_seconds) %>%
    drop_na()

In [7]:
bd_split <- rsample::initial_split(bikedata, prop = 0.9)

In [8]:
count_train <- training(bd_split)
count_test  <- testing(bd_split)

### Preprocessing

In [9]:
# center and scale numeric data, create dummy vars from categorical data
preproc <- recipe(trip_count ~ ., data = bikedata) %>%
    update_role(trip_date, new_role = 'ID') %>%
    step_center(all_predictors()) %>%
    step_scale(all_predictors()) %>%
    step_dummy(all_nominal(),
               one_hot = FALSE) %>%
    step_zv(all_predictors())
preproc

Data Recipe

Inputs:

      role #variables
        ID          1
   outcome          1
 predictor          7

Operations:

Centering for all_predictors()
Scaling for all_predictors()
Dummy variables from all_nominal()
Zero variance filter on all_predictors()

### Model Definition

In [10]:
lm_model <- linear_reg(mode = 'regression') %>%
    set_engine('lm')

In [24]:
rf_model <- rand_forest() %>%
    set_engine("ranger") %>%
    set_mode("regression") %>%
    translate()

In [38]:
keras_model <- mlp(hidden_units = 5, activation = "relu") %>%
    set_engine("keras") %>%
    set_mode("regression") %>%
    translate()

### Build Workflow

In [None]:
lm_wflow <- workflow() %>%
    add_recipe(preproc) %>%
    add_model(lm_model)

rf_wflow <- workflow() %>%
    add_recipe(preproc) %>%
    add_model(rf_model)

In [25]:
keras_wflow <- workflow() %>%
  add_recipe(preproc) %>%
  add_model(keras_model)

Simple fitting; no grid tuning

In [28]:
lm_fit <- fit(lm_wflow, data = count_train)
lm_pred <- predict(lm_fit, new_data = count_train)

In [35]:
rf_fit <- fit(rf_wflow, data = count_train)
rf_pred <- predict(rf_fit, new_data = count_train)

In [36]:
lm_rmse <- rmse_vec(truth = count_train$trip_count, estimate = lm_pred$`.pred`)
rf_rmse <- rmse_vec(truth = count_train$trip_count, estimate = rf_pred$`.pred`)
tibble(model = c("linear regression", "random forest"),
       error = c(lm_rmse, rf_rmse))

model,error
<chr>,<dbl>
linear regression,2803.071
random forest,1766.074
