In [None]:
library(tidyverse)
library(repr)
library(tidymodels)

In [None]:
# Read the raw player records and discard the hashedEmail, name and gender
# columns, which none of the later cells use.
players_data <- select(
  read_csv("players.csv"),
  -c(hashedEmail, name, gender)
)

# Preview the first few rows.
head(players_data)

In [None]:
# Recode `experience` as a factor whose levels run from Beginner to Pro and
# whose labels are the ranks "1" to "5" (Beginner = 1, ..., Pro = 5).
players_data <- players_data |>
  mutate(
    experience = factor(
      experience,
      levels = c("Beginner", "Amateur", "Regular", "Veteran", "Pro"),
      labels = 1:5
    )
  )

head(players_data)

In [None]:
# Build the analysis table:
#   * experience -> numeric rank (as.numeric() on a factor yields its level
#     codes, i.e. 1-5 here);
#   * subscribe  -> factor, so it can serve as the classification label.
players <- mutate(
  players_data,
  experience = as.numeric(experience),
  subscribe = as_factor(subscribe)
)

head(players)

In [None]:
# Mean hours played, mean age and mean experience rank across all players;
# missing values are ignored in each mean.
averages <- players |>
  summarize(
    avg_hrs = mean(played_hours, na.rm = TRUE),
    avg_age = mean(Age, na.rm = TRUE),
    avg_exp = mean(experience, na.rm = TRUE)
  )

averages

In [None]:
# Number of players for every (experience, age) combination. Age is turned
# into a factor so that each distinct age gets its own fill colour.
age_groups <- players |>
  arrange(Age) |>
  mutate(Age = as_factor(Age)) |>
  count(experience, Age)

# position = "fill" rescales every bar to a total height of 1, so the y-axis
# shows each age's share of an experience level rather than a raw count; the
# axis label and title say so explicitly.
age_exp_plot <- ggplot(age_groups, aes(x = experience, y = n, fill = Age)) +
  geom_col(position = "fill") +
  labs(
    x = "Experience",
    y = "Proportion of players",
    fill = "Age",
    title = "Age distribution within each Experience category"
  )

age_exp_plot

The stacked bar plot above shows, for each experience category, the proportion of players of each age (every bar is scaled to a total height of 1). Each experience value is labelled as follows: 1 = Beginner, 2 = Amateur, 3 = Regular, 4 = Veteran, 5 = Pro. This bar plot helps visualize whether there is any notable relationship between age and the experience level of the player (i.e., whether a certain age makes up a larger share of an experience level).

In [None]:
# Set the width used when the notebook renders plots.
options(repr.plot.width = 13)

# Scatterplot of hours played against age, coloured by newsletter
# subscription status.
exploratory_plot <- ggplot(
  players,
  aes(x = Age, y = played_hours, color = subscribe)
) +
  geom_point() +
  labs(
    x = "Player's age",
    y = "Hours played",
    color = "Subscribed to game-related newsletter"
  ) +
  theme(text = element_text(size = 13))

exploratory_plot

The scatterplot above shows the number of hours each player spent playing the game versus the player's age. The colour of a dot (blue = subscribed, orange = not subscribed) represents whether or not the player has subscribed to a game-related newsletter. Our goal is to predict whether a player will subscribe to a game-related newsletter based on the player's age and the number of hours they have played, and the plot gives a first visual impression of how well that can work. If a new point were placed on the scatterplot, one could observe the classes of the points closest to it and make an estimate of the new point's class.

In [None]:
# Fix the RNG seed so the random split is reproducible.
set.seed(1)

# Create a 75% / 25% train / test split, stratified by the class label.
# Rows with a missing Age or played_hours are removed first: both columns are
# the model's predictors, and leaving the NAs in triggers an error further
# down the K-nearest neighbours pipeline.
players_split <- players |>
  drop_na(Age, played_hours) |>
  initial_split(prop = 0.75, strata = subscribe)
players_train <- training(players_split)
players_test <- testing(players_split)

In [None]:
# Preprocessing: standardise both predictors (scale, then centre). The recipe
# is defined on the training split only.
players_recipe <- recipe(subscribe ~ Age + played_hours, data = players_train) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

# K-nearest neighbours classifier with K = 3 and the "rectangular" weight
# function, using the kknn engine.
knn_spec <- nearest_neighbor(neighbors = 3, weight_func = "rectangular") |>
    set_engine("kknn") |>
    set_mode("classification")

# Combine the recipe and the model in a workflow and train it on the
# training split.
knn_fit <- fit(
    add_model(add_recipe(workflow(), players_recipe), knn_spec),
    data = players_train
)

knn_fit

In [None]:
# Predict the class of every test row and place the predictions
# (.pred_class) in front of the original test columns.
players_test_predictions <- bind_cols(
  predict(knn_fit, players_test),
  players_test
)

# Accuracy on the test set.
players_test_predictions |>
  metrics(truth = subscribe, estimate = .pred_class) |>
  filter(.metric == "accuracy")

# Inspect the order of the class levels to see which one is the "positive"
# class for precision / recall.
levels(pull(players_test_predictions, subscribe))

# The positive label ('TRUE') is the second level, hence
# event_level = "second" in both calls below.
precision(
  players_test_predictions,
  truth = subscribe,
  estimate = .pred_class,
  event_level = "second"
)

recall(
  players_test_predictions,
  truth = subscribe,
  estimate = .pred_class,
  event_level = "second"
)

In [None]:
# Confusion matrix of true vs. predicted subscription status on the test set.
confusion <- conf_mat(
  players_test_predictions,
  truth = subscribe,
  estimate = .pred_class
)

confusion

# Note: the accuracy, precision and recall obtained above are not very high.

In [None]:
# Seed the RNG so the fold assignment is reproducible.
set.seed(1)

# Candidate numbers of neighbours: K = 1, 2, ..., 20.
k_vals <- tibble(neighbors = 1:20)

# Stratified cross-validation folds on the training split. Note that v = 5,
# so this is 5-fold cross-validation despite the object's name.
training_10_fold <- vfold_cv(players_train, v = 5, strata = subscribe)

# Model specification with the number of neighbours left open for tuning.
tune_spec <- nearest_neighbor(
  mode = "classification",
  neighbors = tune(),
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Evaluate every candidate K on every fold.
cross_val <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(tune_spec) |>
  tune_grid(resamples = training_10_fold, grid = k_vals)

cross_val_metrics <- collect_metrics(cross_val)

# Keep only the accuracy estimates.
accuracies <- filter(cross_val_metrics, .metric == "accuracy")

# Mean cross-validated accuracy as a function of K.
cross_val_plot <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(
    x = "Neighbours",
    y = "Accuracy Estimate",
    title = "Accuracy of Predictions vs K-values (1-20)"
  )

cross_val_plot
                                                                                                                      

In [None]:
# Reproducible fold assignment.
set.seed(1)

# Grid of K values to try: 1 through 20 in steps of 1.
k_vals <- tibble(neighbors = seq(1, 20, by = 1))

# Stratified cross-validation on the training split (v = 5 folds, even
# though the object is called "10_fold").
training_10_fold <- vfold_cv(players_train, v = 5, strata = subscribe)

# K-nearest neighbours specification whose `neighbors` argument is tuned.
tune_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
  set_engine("kknn") |>
  set_mode("classification")

# Fit and assess the workflow for each K in the grid on each fold.
cross_val <- tune_grid(
  workflow() |>
    add_recipe(players_recipe) |>
    add_model(tune_spec),
  resamples = training_10_fold,
  grid = k_vals
)

# Summarise the per-fold results and keep the accuracy rows only.
cross_val_metrics <- collect_metrics(cross_val)
accuracies <- filter(cross_val_metrics, .metric == "accuracy")

# Plot the mean accuracy estimate against the number of neighbours.
cross_val_plot <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(
    x = "Neighbours",
    y = "Accuracy Estimate",
    title = "Accuracy of Predictions vs K-values (1-20)"
  )

cross_val_plot
                

In [None]:
# The earlier run errored because of NAs in age or hours played, so remove
# every row that is missing either value.
players_clean <- drop_na(players, Age, played_hours)

# Fix the RNG seed so the split is reproducible.
set.seed(1)

# 75% / 25% train / test split of the cleaned data, stratified by the label.
players_split <- initial_split(players_clean, prop = 0.75, strata = subscribe)
players_train <- training(players_split)
players_test <- testing(players_split)

# Preprocessing: scale, then centre, both predictors; the recipe is defined
# on the training split only.
players_recipe <- recipe(subscribe ~ Age + played_hours, data = players_train) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

# K-nearest neighbours classifier with K = 3 and the "rectangular" weight
# function, using the kknn engine.
knn_spec <- nearest_neighbor(neighbors = 3, weight_func = "rectangular") |>
    set_engine("kknn") |>
    set_mode("classification")

# Train the recipe + model workflow on the cleaned training split.
knn_fit <- fit(
    add_model(add_recipe(workflow(), players_recipe), knn_spec),
    data = players_train
)

knn_fit


In [None]:
# Seed the RNG before the folds are drawn, as the earlier tuning cells do, so
# that this cell produces the same folds every time it is run on its own.
set.seed(1)

# Grid of K values to try: 1 through 20 in steps of 1.
k_vals <- tibble(neighbors = seq(from = 1, to = 20, by = 1))

# Stratified cross-validation on the training split (v = 5 folds, even
# though the object is called "10_fold").
training_10_fold <- vfold_cv(players_train, v = 5, strata = subscribe)

# K-nearest neighbours specification whose `neighbors` argument is tuned.
tune_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Fit and assess the workflow for each K in the grid on each fold.
cross_val <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(tune_spec) |>
  tune_grid(resamples = training_10_fold, grid = k_vals)
