# Predicting Newsletter Subscription from Player Age in a Minecraft Research Server


# Introduction:






# Methods & Results:

In [5]:
library(tidyverse)
library(repr)
library(tidymodels)
library(ggplot2)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

✔ broom        1.0.6     ✔ rsample

In [6]:
# Load the two project tables straight from the course GitHub repository.
# `players` holds one row per player and `sessions` one row per play session;
# both carry a `hashedEmail` column.
players_url <- "https://raw.githubusercontent.com/calentynes/dsci_group_project/refs/heads/master/players.csv"
sessions_url <- "https://raw.githubusercontent.com/calentynes/dsci_group_project/refs/heads/master/sessions.csv"

players <- read_csv(players_url)
sessions <- read_csv(sessions_url)

Rows: 196 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): experience, hashedEmail, name, gender
dbl (2): played_hours, Age
lgl (1): subscribe

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 1535 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): hashedEmail, start_time, end_time
dbl (2): original_start_time, original_end_time

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.



# Discussion:
- summarize what you found
- discuss whether this is what you expected to find
- discuss what impact such findings could have
- discuss what future questions this could lead to
  
# References

You may include references if necessary, as long as they all have a consistent citation style.

Combining the two datasets (per-player session summaries joined onto the player table):

In [7]:
# Build one row per player describing their recorded sessions, then attach
# those summaries to the player table.
#
# readr imported `start_time` and `end_time` as character columns (see the
# column specification printed when sessions.csv was read). Passing raw
# strings to difftime() leaves the parsing to as.POSIXct()'s default formats,
# which is not reliable for non-ISO timestamps, so the columns are parsed
# explicitly before any arithmetic is done on them.
# NOTE(review): a "day/month/year hour:minute" layout is assumed here --
# confirm against the raw CSV and swap the lubridate parser if it differs.
sessions <- sessions |>
  mutate(
    session_length = as.numeric(
      difftime(
        lubridate::dmy_hm(end_time),
        lubridate::dmy_hm(start_time),
        units = "hours"
      )
    )
  )

# Per-player session count plus mean and total session length (hours).
# Sessions whose length is NA are skipped by the mean and the sum.
player_summary <- sessions |>
  group_by(hashedEmail) |>
  summarize(
    total_sessions = n(),
    avg_session_length = mean(session_length, na.rm = TRUE),
    total_play_time = sum(session_length, na.rm = TRUE)
  )

# Left join keeps every player; players without any session get NA in the
# three summary columns.
player_data <- players |>
  left_join(player_summary, by = "hashedEmail")

In [None]:
# Figure 1: within each 5-year age band, the share of players who did and did
# not subscribe. Players with a missing subscription status are removed first.
players_clean <- players |>
  drop_na(subscribe)

# Age bands are right-closed intervals with cut points 5, 10, ..., 60.
age_breaks <- seq(5, 60, by = 5)
players_by_age_group <- players_clean |>
  drop_na(Age) |>
  mutate(age_group = cut(Age, breaks = age_breaks, right = TRUE))

# position = "fill" rescales every bar to height 1 so bars compare
# proportions rather than raw counts.
p_age <- ggplot(players_by_age_group, aes(x = age_group, fill = subscribe)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  labs(
    title = "Figure 1. Subscription rate by age group",
    x = "Age group (years)",
    y = "Proportion of players"
  )
p_age

In [None]:
# Modelling table: keep only players with both an age and a subscription
# status, and recode the logical `subscribe` flag as a two-level factor
# ("No" first, then "Yes") for use as a classification outcome.
players_age <- players |>
  drop_na(Age, subscribe) |>
  mutate(
    subscribe = factor(
      if_else(subscribe, "Yes", "No"),
      levels = c("No", "Yes")
    )
  )

In [None]:
# Sample size and the mean / standard deviation of age in the modelling table.
summarise(
  players_age,
  n = n(),
  mean_age = mean(Age),
  sd_age = sd(Age)
)

# Class balance: number of players in each subscription class.
count(players_age, subscribe)

In [None]:
# 80/20 train/test split, stratified on the outcome.
# The seed is set immediately before the split so it is reproducible.
set.seed(123)
players_split <- players_age |>
  initial_split(prop = 0.8, strata = subscribe)
players_train <- players_split |> training()
players_test <- players_split |> testing()

# Report the size of each partition.
nrow(players_train)
nrow(players_test)

In [None]:
# Preprocessing recipe: predict `subscribe` from `Age`, with a normalisation
# step applied to the predictors. The recipe is declared against the training
# split only.
age_only_recipe <- recipe(subscribe ~ Age, data = players_train)
subscribe_recipe <- step_normalize(age_only_recipe, all_predictors())

subscribe_recipe

In [2]:
# K-nearest-neighbours classifier with unweighted ("rectangular") votes; the
# number of neighbours is left as a tuning parameter.
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

# Candidate values of k: the odd numbers 1, 3, ..., 51.
grid_k <- tibble(neighbors = seq(1, 51, by = 2))

# Recipe + model bundled into a single workflow.
players_wflow <- workflow() |>
  add_recipe(subscribe_recipe) |>
  add_model(knn_spec)

# 5-fold cross-validation on the training split, stratified on the outcome.
# Seeded right before the folds are drawn.
set.seed(123)
players_vfold <- players_train |>
  vfold_cv(v = 5, strata = subscribe)

# Evaluate every candidate k on the folds and gather the resampled metrics.
set.seed(123)
knn_tuned <- tune_grid(
  players_wflow,
  resamples = players_vfold,
  grid = grid_k
)
knn_results <- collect_metrics(knn_tuned)

# Show the row with the highest mean cross-validated accuracy.
knn_results |>
  filter(.metric == "accuracy") |>
  arrange(desc(mean)) |>
  slice(1)

ERROR: Error in set_mode(set_engine(nearest_neighbor(weight_func = "rectangular", : could not find function "set_mode"


In [3]:
# Choose the number of neighbours with the highest mean cross-validated
# accuracy.
#
# slice_max() keeps every tied row by default, which would turn `best_k` into
# a vector and break the single-valued `neighbors` argument of the final model
# specification. Ties are therefore dropped with `with_ties = FALSE`; sorting
# by `neighbors` first makes the tie-break explicit (the smallest tied k wins).
best_k <- knn_results |>
  filter(.metric == "accuracy") |>
  arrange(neighbors) |>
  slice_max(mean, n = 1, with_ties = FALSE) |>
  pull(neighbors)

best_k

ERROR: Error in pull(slice_max(filter(knn_results, .metric == "accuracy"), mean, : could not find function "pull"


In [4]:
# Re-specify the KNN classifier with the number of neighbours fixed at the
# tuned value `best_k` (unweighted votes, kknn engine).
knn_at_best_k <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = best_k
)
final_knn_spec <- knn_at_best_k |>
  set_engine("kknn") |>
  set_mode("classification")

# Same preprocessing recipe as during tuning, paired with the final model.
final_wflow <- workflow() |>
  add_recipe(subscribe_recipe) |>
  add_model(final_knn_spec)

# Fit the complete workflow on the training split.
final_fit <- fit(final_wflow, data = players_train)

ERROR: Error in set_mode(set_engine(nearest_neighbor(weight_func = "rectangular", : could not find function "set_mode"


In [20]:
# Score the held-out test split with the fitted workflow and line up, side by
# side: class probabilities, the predicted class, and the observed
# `subscribe` / `Age` values.
test_class_prob <- predict(final_fit, players_test, type = "prob")
test_class <- predict(final_fit, players_test)
test_truth <- select(players_test, subscribe, Age)

test_pred <- bind_cols(test_class_prob, test_class, test_truth)

# Preview the first few rows.
head(test_pred)

.pred_No,.pred_Yes,.pred_class,subscribe,Age
<dbl>,<dbl>,<fct>,<fct>,<dbl>
0.44,0.56,Yes,Yes,17
0.44,0.56,Yes,No,17
0.2,0.8,Yes,No,21
0.44,0.56,Yes,Yes,17
0.48,0.52,Yes,Yes,58
0.6,0.4,No,No,18
