In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
library(cowplot)
library(digest)
library(repr)
library(GGally)
library(ISLR)
options(repr.matrix.max.rows = 6)

## Outline

1. read data, select "Post Hour" and "Likes", "Comments", "Shares"
2. split data
3. create knn regression model
        - find best k in workflow using training data
        - refit the model on the training data with the best k, then evaluate it on the test data
4. create simple linear regression model
5. create new tibble with every hour
6. find "Likes", "Comments", "Shares" for each hour using knn regression and linear regression models
7. find hour with highest "Likes", "Comments", "Shares"

What are the best month, weekday, and hour to post for the highest engagement in comments, likes, and shares?

In [None]:
# Fetch the zipped UCI "Facebook metrics" archive into a temporary file
facebook_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00368/Facebook_metrics.zip"
temp <- tempfile()
download.file(url = facebook_url, destfile = temp)

# Read the semicolon-delimited CSV straight out of the archive
facebook <- unz(temp, "dataset_Facebook.csv") %>%
  read_csv2()
facebook %>% head()

# The archive is no longer needed once the data is in memory
unlink(temp)
facebook

# Replace every missing value with 0
facebook <- facebook %>%
  replace(is.na(.), 0)

Looking at our dataset, it passes the test for tidy data (each row is a single observation, each column is a single variable, each value is a single cell), so no further data-cleaning tools are required. However, the column names are separated by white spaces instead of underscores (_). We will deal with this issue first and rename our predictor columns to avoid additional headaches along the way.

Before we proceed with our exploratory data analysis, we'll split the data into training and testing data and then produce a quick summary of the training dataset, including the number of rows, the number of columns, the number of rows with missing data values, and the mean of our predictor variables.

In [None]:
# Rename the space-separated columns used below to snake_case
facebook_1 <- facebook %>%
  rename(
    post_hour = `Post Hour`,
    total_interactions = `Total Interactions`
  )

# Keep only posts with fewer than 5000 total interactions
# NOTE(review): presumably this removes extreme outliers -- confirm the cutoff
facebook_filtered <- facebook_1 %>%
  filter(total_interactions < 5000)

# Split into training (75 percent of data set) and test data,
# stratified on the response so both sets have a similar distribution
set.seed(2021)
facebook_split <- initial_split(
  facebook_filtered,
  prop = 0.75,
  strata = total_interactions
)
facebook_train <- training(facebook_split)
facebook_test <- testing(facebook_split)

# Number of training rows containing at least one missing value.
# (Counts rows, not individual missing cells; it will be 0 if missing
# values were already imputed before this point.)
missing_rows <- sum(!complete.cases(facebook_train))

# Average of each engagement variable in the training set
average_likes <- mean(facebook_train[["like"]])
average_comments <- mean(facebook_train[["comment"]])
average_shares <- mean(facebook_train[["share"]])
average_total_interactions <- mean(facebook_train[["total_interactions"]])

# Total number of observations (rows) and variables (columns)
observation_total <- nrow(facebook_train)
variable_total <- ncol(facebook_train)

# Exploratory data analysis table: a single-row summary of the training set.
# Built directly from the scalars above instead of mutating (and re-splicing)
# the whole training data frame once per statistic.
exploratory_table <- tibble(
  observation_total = observation_total,
  variable_total = variable_total,
  average_likes = average_likes,
  average_comments = average_comments,
  average_shares = average_shares,
  average_total_interactions = average_total_interactions,
  missing_rows = missing_rows
)
exploratory_table

Description of us using highest engagement from this point forward.

In [None]:
# Pairwise plot matrix of posting hour vs. total interactions (training data)
facebook_ggpairs <- ggpairs(
  select(facebook_train, post_hour, total_interactions)
)
facebook_ggpairs

It makes sense that there is no correlation, since there shouldn't be a relationship between post_hour and total_interactions.

## Predicting Best Post Hour

In [None]:
set.seed(2021)

# K-nearest-neighbours regression spec; the number of neighbours is tuned
facebook_spec <- set_mode(
  set_engine(
    nearest_neighbor(weight_func = "rectangular", neighbors = tune()),
    "kknn"
  ),
  "regression"
)

# Preprocessing: scale, then center, the single predictor (post_hour)
facebook_recipe <- recipe(total_interactions ~ post_hour, data = facebook_train) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# 5-fold cross-validation folds, stratified on the response
facebook_vfold <- vfold_cv(facebook_train, v = 5, strata = total_interactions)

# Bundle preprocessing and model into one workflow
facebook_workflow <- workflow() %>%
  add_recipe(facebook_recipe) %>%
  add_model(facebook_spec)

# Candidate values of K: every integer from 1 to 200
gridvals <- tibble(neighbors = 1:200)

# Evaluate each candidate K across the folds and gather the metrics
facebook_results <- collect_metrics(
  tune_grid(facebook_workflow, resamples = facebook_vfold, grid = gridvals)
)

# Row with the smallest mean cross-validated RMSE (with its standard error);
# its `neighbors` value is the K to use
facebook_min <- facebook_results %>%
  filter(.metric == "rmse") %>%
  slice_min(mean, n = 1, with_ties = FALSE)

facebook_min

narration

In [None]:
# K with the lowest cross-validated RMSE
set.seed(2021)
k_min <- facebook_min[["neighbors"]]

# Model specification with K fixed at the chosen value
facebook_best_spec <- set_mode(
  set_engine(
    nearest_neighbor(weight_func = "rectangular", neighbors = k_min),
    "kknn"
  ),
  "regression"
)

# Fit recipe + tuned model on the full training set
facebook_best_workflow <- workflow() %>%
  add_recipe(facebook_recipe) %>%
  add_model(facebook_best_spec)
facebook_best_fit <- fit(facebook_best_workflow, data = facebook_train)

# Performance of the fitted model on the held-out test set
facebook_test_predictions <- bind_cols(
  predict(facebook_best_fit, facebook_test),
  facebook_test
)
facebook_summary <- metrics(
  facebook_test_predictions,
  truth = total_interactions,
  estimate = .pred
)
facebook_summary

Narration

In [None]:
set.seed(2021)

# Test-set observations with the KNN predictions attached as `.pred`
facebook_preds <- bind_cols(
  predict(facebook_best_fit, facebook_test),
  facebook_test
)
facebook_preds

# Observed interactions (points) overlaid with the KNN predictions (blue line)
facebook_plot_final <- ggplot(
  facebook_preds,
  aes(x = post_hour, y = total_interactions)
) +
  geom_point(alpha = 0.1) +
  geom_line(mapping = aes(y = .pred), color = "blue") +
  labs(
    x = "Hours",
    y = "Total Interactions (Sum of Likes, Comments, and Shares)",
    title = paste0("K = ", k_min)
  )
facebook_plot_final

In [None]:
# Per-column summary statistics of the test-set predictions table
facebook_preds %>% summary()

In [None]:
# Simple linear regression of total interactions on posting hour
fb_spec <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

fb_recipe <- recipe(total_interactions ~ post_hour, data = facebook_train)

# Fit on the training data
fb_fit <- workflow() %>%
  add_recipe(fb_recipe) %>%
  add_model(fb_spec) %>%
  fit(data = facebook_train)

# Predict on the test set once; the same table feeds both the metrics
# and the plot below
facebook_linear_preds <- fb_fit %>%
  predict(facebook_test) %>%
  bind_cols(facebook_test)

# Test-set performance of the linear model
fb_test_results <- facebook_linear_preds %>%
  metrics(truth = total_interactions, estimate = .pred)
fb_test_results

facebook_linear_preds

# Observed interactions (points) with the fitted regression line (blue).
# The title names the model; it previously showed "K = <k_min>", which is a
# KNN parameter and does not apply to the linear model.
# NOTE(review): this reassigns `facebook_plot_final`, replacing the KNN plot
# object created earlier -- consider a dedicated name if both are needed.
facebook_plot_final <- ggplot(facebook_linear_preds, aes(x = post_hour, y = total_interactions)) +
  geom_point(alpha = 0.1) +
  xlab("Hours") +
  ylab("Total Interactions (Sum of Likes, Comments, and Shares)") +
  geom_line(data = facebook_linear_preds, aes(x = post_hour, y = .pred), color = "blue") +
  ggtitle("Simple Linear Regression")
facebook_plot_final