In [7]:
library(tidyverse)
library(repr)
library(tidymodels)
library(cowplot)
options(repr.matrix.max.rows = 6)

Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2



## Outline

1. read data, select "Post Hour" and "Likes", "Comments", "Shares"
2. split data
3. create knn regression model
    - find the best k with cross-validation on the training data
    - refit the model with the best k on the training data and evaluate it on the test data
4. create simple linear regression model
5. create new tibble with every hour
6. find "Likes", "Comments", "Shares" for each hour using knn regression and linear regression models
7. find hour with highest "Likes", "Comments", "Shares"

What are the best month, weekday, and hour to post in order to get the highest engagement in comments, likes, and shares?

In [4]:
# Download the Facebook metrics archive to a temporary file.
# mode = "wb" requests a binary transfer explicitly: a text-mode download
# corrupts zip archives on Windows.
temp <- tempfile()
download.file(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/00368/Facebook_metrics.zip",
  temp,
  mode = "wb"
)

# Read the CSV straight out of the archive. read_csv2() parses ';'-separated
# files that use ',' as the decimal mark.
facebook <- read_csv2(unz(temp, "dataset_Facebook.csv"))
head(facebook)

# The archive is no longer needed once the data is in memory.
unlink(temp)

# Replace every missing value in the table with 0.
facebook[is.na(facebook)] <- 0

Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.

Parsed with column specification:
cols(
  `Page total likes` = col_double(),
  Type = col_character(),
  Category = col_double(),
  `Post Month` = col_double(),
  `Post Weekday` = col_double(),
  `Post Hour` = col_double(),
  Paid = col_double(),
  `Lifetime Post Total Reach` = col_double(),
  `Lifetime Post Total Impressions` = col_double(),
  `Lifetime Engaged Users` = col_double(),
  `Lifetime Post Consumers` = col_double(),
  `Lifetime Post Consumptions` = col_double(),
  `Lifetime Post Impressions by people who have liked your Page` = col_double(),
  `Lifetime Post reach by people who like your Page` = col_double(),
  `Lifetime People who have liked your Page and engaged with your post` = col_double(),
  comment = col_double(),
  like = col_double(),
  share = col_double(),
  `Total Interactions` = col_double()
)

Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
139441,Photo,2,12,4,3,0,2752,5091,178,109,159,3078,1640,119,4,79,17,100
139441,Status,2,12,3,10,0,10460,19057,1457,1361,1674,11710,6112,1108,5,130,29,164
139441,Photo,3,12,3,3,0,2413,4373,177,113,154,2812,1503,132,0,66,14,80
139441,Photo,2,12,2,10,1,50128,87991,2211,790,1119,61027,32048,1386,58,1572,147,1777
139441,Photo,2,12,2,3,0,7244,13594,671,410,580,6228,3200,396,19,325,49,393
139441,Status,2,12,1,9,0,10472,20849,1191,1073,1389,16034,7852,1016,1,152,33,186


In [5]:
# Keep only the posting hour and the three engagement counts. `Post Hour` is
# renamed to the syntactic name post_hour as part of the selection.
facebook <- select(
  facebook,
  post_hour = "Post Hour",
  "like",
  "comment",
  "share"
)
facebook

post_hour,like,comment,share
<dbl>,<dbl>,<dbl>,<dbl>
3,79,4,17
10,130,5,29
3,66,0,14
⋮,⋮,⋮,⋮
2,93,4,18
11,91,7,38
4,91,0,28


## Predicting Best Post Hour

In [6]:
# Train/test split: 75% of the rows go to training, the rest to testing.
# The split is stratified on post_hour and seeded so it can be reproduced.
set.seed(1)
facebook_split <- initial_split(facebook, prop = 0.75, strata = post_hour)
facebook_test <- testing(facebook_split)
facebook_train <- training(facebook_split)

# Preprocessing recipe: post_hour is the response and the three engagement
# counts are the predictors; every predictor is scaled and then centered.
# NOTE(review): the outline describes predicting like/comment/share from the
# posting hour, while this recipe models the reverse direction -- confirm
# that this is intended.
facebook_recipe <- recipe(
  post_hour ~ comment + share + like,
  data = facebook_train
) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# K-nearest-neighbours regression on the kknn engine with the "rectangular"
# weight function; the number of neighbours is left open for tuning.
facebook_spec <- nearest_neighbor(
  mode = "regression",
  neighbors = tune(),
  weight_func = "rectangular"
) %>%
  set_engine("kknn")

# Five cross-validation folds of the training set, stratified on post_hour.
facebook_vfold <- vfold_cv(facebook_train, v = 5, strata = post_hour)

# Bundle the recipe and the tunable model into a single workflow.
facebook_workflow <- add_model(
  add_recipe(workflow(), facebook_recipe),
  facebook_spec
)

# Candidate values of K: every integer from 1 to 200.
gridvals <- tibble(neighbors = 1:200)

# Evaluate each candidate K on the cross-validation folds and collect the
# summary metrics for every K.
facebook_results <- collect_metrics(
  tune_grid(facebook_workflow, resamples = facebook_vfold, grid = gridvals)
)

# Row with the smallest mean cross-validated RMSE; it also carries the
# standard error of that estimate and the K that produced it.
facebook_min <- facebook_results %>%
  filter(.metric == "rmse") %>%
  arrange(mean) %>%
  slice(1)

facebook_min

# The K (number of neighbours) that achieved the minimum RMSE.
k_min <- facebook_min[["neighbors"]]

# KNN regression specification with the number of neighbours fixed at k_min.
facebook_best_spec <- nearest_neighbor(
  mode = "regression",
  neighbors = k_min,
  weight_func = "rectangular"
) %>%
  set_engine("kknn")

# Rebuild the workflow around the chosen K and fit it on the training set.
facebook_best_fit <- workflow() %>%
  add_recipe(facebook_recipe) %>%
  add_model(facebook_best_spec) %>%
  fit(data = facebook_train)

# Predict post_hour for the held-out test rows and score the predictions
# against the observed values.
facebook_summary <- metrics(
  bind_cols(predict(facebook_best_fit, facebook_test), facebook_test),
  truth = post_hour,
  estimate = .pred
)

facebook_summary

neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
55,rmse,standard,4.311845,5,0.1005771,Model055


.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,4.3236027
rsq,standard,0.02240057
mae,standard,3.88810052


In [21]:
# Linear regression specification fitted with the "lm" engine.
fb_spec <- linear_reg(mode = "regression") %>%
  set_engine("lm")

# post_hour as the response with comment, share and like as predictors;
# no preprocessing steps are added.
fb_recipe <- recipe(
  post_hour ~ comment + share + like,
  data = facebook_train
)

# Fit the recipe + model workflow on the training set.
fb_fit <- fit(
  add_model(add_recipe(workflow(), fb_recipe), fb_spec),
  data = facebook_train
)

# Predict post_hour for the test rows and compute the regression metrics.
fb_test_results <- metrics(
  bind_cols(predict(fb_fit, facebook_test), facebook_test),
  truth = post_hour,
  estimate = .pred
)
fb_test_results



.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,4.357858612
rsq,standard,0.005157872
mae,standard,3.937141926
