In [2]:
library(tidyverse)
library(repr)
library(tidymodels)
library(cowplot)
options(repr.matrix.max.rows = 6)

In [4]:
#reading the data
temp <- tempfile()
download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00368/Facebook_metrics.zip",temp)

#Reading the data in R
facebook <- read_csv2(unz(temp, "dataset_Facebook.csv"))
head(facebook)
unlink(temp)

Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.

Parsed with column specification:
cols(
  `Page total likes` = [32mcol_double()[39m,
  Type = [31mcol_character()[39m,
  Category = [32mcol_double()[39m,
  `Post Month` = [32mcol_double()[39m,
  `Post Weekday` = [32mcol_double()[39m,
  `Post Hour` = [32mcol_double()[39m,
  Paid = [32mcol_double()[39m,
  `Lifetime Post Total Reach` = [32mcol_double()[39m,
  `Lifetime Post Total Impressions` = [32mcol_double()[39m,
  `Lifetime Engaged Users` = [32mcol_double()[39m,
  `Lifetime Post Consumers` = [32mcol_double()[39m,
  `Lifetime Post Consumptions` = [32mcol_double()[39m,
  `Lifetime Post Impressions by people who have liked your Page` = [32mcol_double()[39m,
  `Lifetime Post reach by people who like your Page` = [32mcol_double()[39m,
  `Lifetime People who have liked your Page and engaged with your post` = [32mcol_double()[39m,
  comment = [32mcol_double()[39m,
  like = [32m

Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
139441,Photo,2,12,4,3,0,2752,5091,178,109,159,3078,1640,119,4,79,17,100
139441,Status,2,12,3,10,0,10460,19057,1457,1361,1674,11710,6112,1108,5,130,29,164
139441,Photo,3,12,3,3,0,2413,4373,177,113,154,2812,1503,132,0,66,14,80
139441,Photo,2,12,2,10,1,50128,87991,2211,790,1119,61027,32048,1386,58,1572,147,1777
139441,Photo,2,12,2,3,0,7244,13594,671,410,580,6228,3200,396,19,325,49,393
139441,Status,2,12,1,9,0,10472,20849,1191,1073,1389,16034,7852,1016,1,152,33,186


In [6]:
#Splitting into training and testing data
#Split into training (75 percent of data set) and test data
facebook_split <- initial_split(facebook, prop = 0.75, strata = `Total Interactions`)
facebook_train <- training(facebook_split)
facebook_test <- testing(facebook_split)

In [25]:
#Finding best K in Hour of Day through training data
#Recipe
facebook_recipe <- recipe(`Post Month` ~ comment + share + like, data = facebook_train) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

#Model
spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("regression")

#Performing a 5 fold cross validation 
facebook_vfold <- vfold_cv(facebook_train, v = 5, strata = `Post Month`)

#Workflow
fb_workflow <- workflow() %>%
              add_recipe(facebook_recipe) %>%
              add_model(spec)
fb_workflow
#Testing 200 values of K in a tibble
gridvals <- tibble(neighbors = seq(1:200))

#(Tunes workflow to test all values of K, resamples cross-validation data set, collects the statistics from that model)
fb_results <- fb_workflow %>%
  tune_grid(resamples = facebook_vfold, grid = gridvals) %>%
  collect_metrics()
fb_results

#Finding min amount of RMSPE
facebook_min <- fb_results %>%
                filter(.metric == "rmse") %>%
                filter(mean == min(mean))
facebook_min

══ Workflow ════════════════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

● step_scale()
● step_center()

── Model ───────────────────────────────────────────────────────────────────────
K-Nearest Neighbor Model Specification (regression)

Main Arguments:
  neighbors = tune()
  weight_func = rectangular

Computational engine: kknn 


[31mx[39m [31mFold3: model 1/1 (predictions): Error: Problem with `mutate()` input `.row`.
[...[39m

[31mx[39m [31mFold5: model 1/1 (predictions): Error: Problem with `mutate()` input `.row`.
[...[39m



neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,rmse,standard,4.884649788,3,0.215555565,Model001
1,rsq,standard,0.009976445,3,0.006155074,Model001
2,rmse,standard,4.090229663,3,0.150488092,Model002
⋮,⋮,⋮,⋮,⋮,⋮,⋮
199,rsq,standard,0.02452211,3,0.01905884,Model199
200,rmse,standard,3.36358467,3,0.08748377,Model200
200,rsq,standard,0.02589022,3,0.01735904,Model200


neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
26,rmse,standard,3.340732,3,0.05514045,Model026


In [37]:
k_min <- facebook_min %>%
         pull(neighbors)

facebook_best_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = k_min) %>%
         set_engine("kknn") %>%
         set_mode("regression")

fb_best_fit <- workflow() %>%
         add_recipe(facebook_recipe) %>%
         add_model(facebook_best_spec) %>%
         fit(data = facebook_train)

fb_summary <- fb_best_fit  %>%
              predict(facebook_test) %>%
              bind_cols(facebook_test) %>%
             metrics(truth = `Post Hour`, estimate = .pred)
fb_summary

ERROR: Error: Can't recycle `..1` (size 121) to match `..2` (size 123).
